In [1]:
import numpy as np
import pandas as pd
import torch
import numpy as np
import pandas as pd
from datasets import Dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments, AutoModelForSequenceClassification, Trainer

In [36]:
# Read every raw CSV in one pass. The first file is the source of the "healthy"
# posts (filtered further down); the remaining ten are one file per condition.
# The order of the paths must match the order of the names being unpacked.
(
    df_healthy,
    df_addiction,
    df_schizophrenia,
    df_lonely,
    df_depression,
    df_autism,
    df_anxiety,
    df_alcoholism,
    df_adhd,
    df_suicidewatch,
    df_socialanxiety,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "depression_dataset_reddit_cleaned.csv",
        "addiction_pre_features_tfidf_256.csv",
        "schizophrenia_pre_features_tfidf_256.csv",
        "lonely_pre_features_tfidf_256.csv",
        "depression_pre_features_tfidf_256.csv",
        "autism_pre_features_tfidf_256.csv",
        "anxiety_pre_features_tfidf_256.csv",
        "alcoholism_pre_features_tfidf_256.csv",
        "adhd_pre_features_tfidf_256.csv",
        "suicidewatch_pre_features_tfidf_256.csv",
        "socialanxiety_pre_features_tfidf_256.csv",
    )
)

In [37]:
# Keep only the rows flagged as not depressed and bring the frame in line with
# the other sources (a `post` text column plus a `category` column).
#
# The chain builds a fresh frame instead of mutating a filtered slice in place,
# which avoids pandas' SettingWithCopyWarning. The label is written to the
# `category` column only: the previous frame-wide `replace(0, "healthy")` would
# also have rewritten any 0 found in other columns. Every surviving row had
# `is_depression == 0`, so assigning the constant yields the same labels.
#
# NOTE: this cell consumes the raw `df_healthy`; re-running it without
# re-loading the CSV fails because `is_depression` has already been renamed.
df_healthy = (
    df_healthy.loc[df_healthy["is_depression"] == 0]
    .rename(columns={"is_depression": "category", "clean_text": "post"})
    .assign(category="healthy")
)
df_healthy.head()

Unnamed: 0,post,category
3831,switchfoot http twitpic com y zl awww that s a...,healthy
3832,is upset that he can t update his facebook by ...,healthy
3833,kenichan i dived many time for the ball manage...,healthy
3834,my whole body feel itchy and like it on fire,healthy
3835,nationwideclass no it s not behaving at all i ...,healthy


In [38]:
def create_category_df(df, category_name):
    """Build a two-column frame (`category`, `post`) from `df`.

    Only the `post` column of `df` is carried over; `category_name` is
    repeated on every row. The row index of `df` is preserved and `df`
    itself is left unchanged.
    """
    tagged = df[['post']].copy()
    # Insert at position 0 so the column order is (category, post).
    tagged.insert(0, 'category', category_name)
    return tagged

# Source frame for each category name; the insertion order fixes the row order
# of the combined frame.
_sources_by_category = {
    'healthy': df_healthy,
    'addiction': df_addiction,
    'schizophrenia': df_schizophrenia,
    'lonely': df_lonely,
    'depression': df_depression,
    'autism': df_autism,
    'anxiety': df_anxiety,
    'alcoholism': df_alcoholism,
    'adhd': df_adhd,
    'suicidewatch': df_suicidewatch,
    'socialanxiety': df_socialanxiety,
}

# One (category, post) frame per source.
category_frames = [
    create_category_df(source_df, name)
    for name, source_df in _sources_by_category.items()
]

# Keep the individual per-category frames available under their own names.
(
    df_healthy_cat,
    df_addiction_cat,
    df_schizophrenia_cat,
    df_lonely_cat,
    df_depression_cat,
    df_autism_cat,
    df_anxiety_cat,
    df_alcoholism_cat,
    df_adhd_cat,
    df_suicidewatch_cat,
    df_socialanxiety_cat,
) = category_frames

# Stack everything into one frame with a fresh 0..n-1 index.
df_combined = pd.concat(category_frames, ignore_index=True)

# Quick look at the first rows of the result.
print(df_combined.head())

  category                                               post
0  healthy  switchfoot http twitpic com y zl awww that s a...
1  healthy  is upset that he can t update his facebook by ...
2  healthy  kenichan i dived many time for the ball manage...
3  healthy       my whole body feel itchy and like it on fire
4  healthy  nationwideclass no it s not behaving at all i ...


In [39]:
# The target column is called `label` from here on.
df_combined = df_combined.rename(columns={'category': 'label'})

In [40]:
from sklearn.preprocessing import LabelEncoder

# Learn the category -> integer id mapping, then overwrite the string labels
# with their ids. `label_encoder` is kept so ids can be mapped back to names.
label_encoder = LabelEncoder()
label_encoder.fit(df_combined["label"])
df_combined["label"] = label_encoder.transform(df_combined["label"])

In [41]:
# Preview the first five rows now that `label` holds integer ids.
df_combined.head(5)

Unnamed: 0,label,post
0,6,switchfoot http twitpic com y zl awww that s a...
1,6,is upset that he can t update his facebook by ...
2,6,kenichan i dived many time for the ball manage...
3,6,my whole body feel itchy and like it on fire
4,6,nationwideclass no it s not behaving at all i ...


In [44]:
# 85 % / 15 % random train/test split.
# A fixed seed makes the shuffle - and therefore every metric reported further
# down - reproducible across kernel restarts; without it each run trains and
# evaluates on a different partition.
SPLIT_SEED = 42
train = df_combined.sample(frac=0.85, random_state=SPLIT_SEED)
# Everything that was not sampled becomes the test set. This relies on the
# index of `df_combined` being unique.
test = df_combined.drop(train.index)

In [45]:
# Convert both pandas splits into `Dataset` objects, leaving the pandas index
# out of the resulting features.
train, test = (
    Dataset.from_pandas(split_df, preserve_index=False)
    for split_df in (train, test)
)

# Show the features and row count of each split.
for split in (train, test):
    print(split)

Dataset({
    features: ['label', 'post'],
    num_rows: 103352
})
Dataset({
    features: ['label', 'post'],
    num_rows: 18238
})


In [10]:
# Tokenizer for the same checkpoint name that the model is loaded from.
MODEL_CHECKPOINT = 'roberta-base'
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_function(example):
    """Tokenize the `post` texts of a batch passed in by `Dataset.map`.

    Args:
        example: mapping with a ``"post"`` entry holding the text(s) to
            tokenize (a list of strings when mapped with ``batched=True``).

    Returns:
        Whatever the module-level ``tokenizer`` returns for those texts.

    The datasets built above expose the text under ``post``; the previous
    ``clean_text`` key does not exist on them and raised ``KeyError``.
    Sequences are truncated but not padded here: padding is left to the
    ``DataCollatorWithPadding`` created in this notebook, which pads each
    batch only to its own longest sequence instead of padding every row to
    the model maximum.
    """
    return tokenizer(example["post"], truncation=True)

# Run the tokenizer over both splits in batches.
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train, test)
)

# Collator that pads the examples of a batch with this tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/5798 [00:00<?, ? examples/s]

Map:   0%|          | 0/1933 [00:00<?, ? examples/s]

In [1]:
# Default fine-tuning arguments; evaluate once at the end of every epoch.
training_args = TrainingArguments("test-trainer", evaluation_strategy="epoch")

# Size the classification head from the fitted label encoder rather than
# hard-coding it: the labels are the integer ids produced by `label_encoder`
# (one per category), so a 2-way head cannot represent them.
num_labels = len(label_encoder.classes_)
# Store the id <-> category-name mapping in the model config so predictions
# can be reported by name.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Load the pretrained roberta-base weights with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    'roberta-base',
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

def compute_metrics(eval_preds):
    """Return accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_preds: a ``(logits, labels)`` pair. ``logits`` holds the class
            scores on its last axis; ``labels`` holds the integer class ids.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"`` (both floats). The
        keys match what the GLUE/MRPC metric used previously returned.

    The GLUE/MRPC metric computes a binary F1 and cannot score a target with
    more than two classes, and it was re-loaded on every evaluation. The
    scores are computed with NumPy instead; F1 is the unweighted mean of the
    per-class F1 over every class seen in the labels or the predictions.
    """
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    labels = np.asarray(labels)

    # Nothing to score: avoid taking the mean of an empty array.
    if labels.size == 0:
        return {"accuracy": 0.0, "f1": 0.0}

    accuracy = float(np.mean(predictions == labels))

    per_class_f1 = []
    for class_id in np.union1d(labels, predictions):
        predicted_here = predictions == class_id
        labelled_here = labels == class_id
        true_pos = np.sum(predicted_here & labelled_here)
        # F1 = 2*TP / (2*TP + FP + FN) = 2*TP / (#predicted + #labelled).
        # The denominator is never zero: the class occurs in at least one
        # of the two arrays.
        per_class_f1.append(
            2.0 * true_pos / (predicted_here.sum() + labelled_here.sum())
        )

    return {"accuracy": accuracy, "f1": float(np.mean(per_class_f1))}

# Wire the model, arguments, tokenized splits, collator and metric function
# into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start fine-tuning.
trainer.train()