In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense, Embedding, Dropout, InputLayer
from tensorflow.keras.optimizers import Adam
import tensorflow_hub as hub
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
df_healthy = pd.read_csv("depression_dataset_reddit_cleaned.csv")
df_addiction = pd.read_csv("addiction_pre_features_tfidf_256.csv")
df_schizophrenia = pd.read_csv("schizophrenia_pre_features_tfidf_256.csv")
df_lonely = pd.read_csv("lonely_pre_features_tfidf_256.csv")
df_depression = pd.read_csv("depression_pre_features_tfidf_256.csv")
df_autism = pd.read_csv("autism_pre_features_tfidf_256.csv")
df_anxiety = pd.read_csv("anxiety_pre_features_tfidf_256.csv")
df_alcoholism = pd.read_csv("alcoholism_pre_features_tfidf_256.csv")
df_adhd = pd.read_csv("adhd_pre_features_tfidf_256.csv")
df_suicidewatch = pd.read_csv("suicidewatch_pre_features_tfidf_256.csv")
df_socialanxiety = pd.read_csv("socialanxiety_pre_features_tfidf_256.csv")

In [3]:
df_healthy = df_healthy[df_healthy["is_depression"] == 0]
df_healthy.rename(columns = {'is_depression':'category', "clean_text" : "post"}, inplace = True)

df_healthy.replace(0, "healthy", inplace = True)
df_healthy.head()

Unnamed: 0,post,category
3831,switchfoot http twitpic com y zl awww that s a...,healthy
3832,is upset that he can t update his facebook by ...,healthy
3833,kenichan i dived many time for the ball manage...,healthy
3834,my whole body feel itchy and like it on fire,healthy
3835,nationwideclass no it s not behaving at all i ...,healthy


In [4]:
def create_category_df(df, category_name):
    return pd.DataFrame({'category': category_name, 'post': df['post']})

df_healthy_cat = create_category_df(df_healthy, 'healthy')
df_addiction_cat = create_category_df(df_addiction, 'addiction')
df_schizophrenia_cat = create_category_df(df_schizophrenia, 'schizophrenia')
df_lonely_cat = create_category_df(df_lonely, 'lonely')
df_depression_cat = create_category_df(df_depression, 'depression')
df_autism_cat = create_category_df(df_autism, 'autism')
df_anxiety_cat = create_category_df(df_anxiety, 'anxiety')
df_alcoholism_cat = create_category_df(df_alcoholism, 'alcoholism')
df_adhd_cat = create_category_df(df_adhd, 'adhd')
df_suicidewatch_cat = create_category_df(df_suicidewatch, 'suicidewatch')
df_socialanxiety_cat = create_category_df(df_socialanxiety, 'socialanxiety')

df_combined = pd.concat([df_healthy_cat, df_addiction_cat, df_schizophrenia_cat, df_lonely_cat, df_depression_cat, df_autism_cat, df_anxiety_cat, df_alcoholism_cat, df_adhd_cat, df_suicidewatch_cat, df_socialanxiety_cat])

df_combined.reset_index(drop=True, inplace=True)

print(df_combined.head())

  category                                               post
0  healthy  switchfoot http twitpic com y zl awww that s a...
1  healthy  is upset that he can t update his facebook by ...
2  healthy  kenichan i dived many time for the ball manage...
3  healthy       my whole body feel itchy and like it on fire
4  healthy  nationwideclass no it s not behaving at all i ...


In [21]:
samples_per_class = 3000

# Initialize an empty list to store samples from each class
balanced_dataframes = []

# Iterate over each unique label and sample the required number of instances
for label in df_combined['category'].unique():
    label_df = df_combined[df_combined['category'] == label].sample(n=samples_per_class, random_state=42)
    balanced_dataframes.append(label_df)

# Concatenate all sampled dataframes into one
df_combined = pd.concat(balanced_dataframes).reset_index(drop=True)

In [22]:
def preprocessing(text):
    text = str(text)
    
    # Remove retweet "RT" text
    text = re.sub(r'^RT[\s]+', '', text)
    
    # Remove HTML line breaks
    text = re.sub(r'<br />', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove special characters and numbers
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    
    # Remove single quotes
    text = text.replace("\'", "")
    
    # Tokenize text
    tokens = text.split()
    
    # Get stopwords and initialize stemmer
    stopwords_english = stopwords.words("english")
    stemmer = PorterStemmer()
    
    cleaned_words = []

    for x in tokens:
        if x not in stopwords_english:
            stem_word = stemmer.stem(x)
            cleaned_words.append(stem_word)
    
    return ' '.join(cleaned_words)


In [23]:
df_combined["cleaned_post"] = df_combined["post"].apply(preprocessing)

In [24]:
from sklearn.model_selection import train_test_split

X = df_combined["cleaned_post"]
y = df_combined["category"]

train_x, test_x, train_y, test_y = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

counter = CountVectorizer()
train_x_cv = counter.fit_transform(train_x)
test_x_cv = counter.transform(test_x)

In [26]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(train_x_cv, train_y)

y_pred = model.predict(test_x_cv)
model.score(test_x_cv, test_y)

0.6781818181818182

In [27]:
from sklearn.metrics import classification_report

print(classification_report(y_pred, test_y))

               precision    recall  f1-score   support

    addiction       0.80      0.76      0.78       642
         adhd       0.80      0.71      0.75       658
   alcoholism       0.86      0.79      0.83       660
      anxiety       0.63      0.60      0.61       597
       autism       0.75      0.81      0.78       558
   depression       0.40      0.43      0.41       597
      healthy       0.41      1.00      0.58       226
       lonely       0.72      0.62      0.66       715
schizophrenia       0.74      0.82      0.78       552
socialanxiety       0.66      0.63      0.65       640
 suicidewatch       0.69      0.55      0.61       755

     accuracy                           0.68      6600
    macro avg       0.68      0.70      0.68      6600
 weighted avg       0.70      0.68      0.68      6600

