### ```Tf-idf Vectors with SMOTE```
This notebook contains vectorization of tweets using tf-idf values. Since the dataset is imbalanced, the oversampling technique "SMOTE" is applied to balance the dataset

In [None]:
import csv
import math
import pandas as pd
import pickle
from collections import Counter
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from imblearn.over_sampling import SMOTE, ADASYN

In [None]:
# read the preprocessed data set
df_preprocessed = pd.read_csv('../data/preprocessed_tweets.csv')
df_preprocessed.head(5)

Unnamed: 0.1,Unnamed: 0,vader_sentiment_label,vader_score,tweet,tweet_length,url_link,pos_emoji,neg_emoji,profanity_word
0,0,0,-0.2699,wow dad yesterday take stupi would depression ...,278,0,0,0,0
1,1,0,-0.5995,part really harmfult lot people went every gui...,274,0,0,0,0
2,2,1,0.3382,one ways got depression learning dance rain so...,208,0,0,0,0
3,3,0,-0.8643,see wan na one say ptsd depression andor anxie...,114,0,0,0,0
4,4,0,-0.8316,clinical depression palpable hopelessness gene...,78,0,0,0,0


In [None]:
corpus = df_preprocessed['tweet'].tolist()
labels = df_preprocessed['vader_sentiment_label'].tolist()
total_docs = len(corpus)
total_docs, len(labels)

(22830, 22830)

### Drop words and compute tf-idf
The tf-idf score of a word computed by:

    df = (num. of docs. contains) / (total docs.)
    tf = num. of. occurance of the word / num. of words in doc.
    idf = log( (num. of docs. contains) / ((total docs.) + 1) )
    tf-idf = tf * idf
These computations are held in a hash map.\
At first, the words outside the 0.01 to 0.95 doc. frequency range were removed.\
This resulted in a sharp decrease in the number of unique words, (182).\
As a result, the words that appeared once or appeared in every doc. are removed.\
The final number of unique words is 28,314.

In [None]:
def compute_dfs(tweets, total_docs):
    df_dict = {}
    #print(total_docs)

    for tweet in tweets:
        tweet = tweet.split(' ')
        for word in set(tweet):
            try:
                df_dict[word] += 1
            except KeyError:
                df_dict[word] = 1
    """
    for key, val in df_dict.items():
        num_does_contain = val
        df_dict[key] = (num_does_contain / total_docs)
    """
    return df_dict

In [None]:
def clean_dfs(tweets_df_dict, total_docs):
    #print(total_docs)

    for key in list(tweets_df_dict.keys()):
        # this is the latest change in the funct., uncomment if it needed
        #if tweets_dict[key] <= 0.01 or tweets_dict[key] >= 0.95:
        if tweets_df_dict[key] == 1 or tweets_df_dict[key] == total_docs:
            del tweets_df_dict[key]

    return dict(sorted(tweets_df_dict.items()))

In [None]:
def compute_idfs(tweets_df_dict, total_docs):
    idf_dict = {}
    #print(total_docs)

    for key, val in tweets_df_dict.items():
        num_does_contain = val
        idf_dict[key] = math.log(total_docs / (float(num_does_contain) + 1))
    return idf_dict

In [None]:
def compute_tf_idfs(tweets, labels, tweets_idf_dict, total_docs):
    tf_idf_matrix = []
    vector_size = len(tweets_idf_dict)

    for tweet, label in zip(tweets, labels):
        tweet = tweet.lower()
        tweet_words = tweet.split(' ')
        tweet_vector = [0.0] * vector_size

        for word in tweet_words:
            try:
                inverse_doc_freq = tweets_idf_dict[word]
            except KeyError:
                continue
            term_freq = tweet_words.count(word) / float(len(tweet_words))
            index = list(tweets_idf_dict.keys()).index(word)
            tweet_vector[index] = term_freq * inverse_doc_freq
        tf_idf_matrix.append([tweet, tweet_vector, label])
    return tf_idf_matrix

In [None]:
# this contains 28,314 unique words
tweets_df_dict = compute_dfs(corpus, total_docs)
tweets_df_dict

{'absolute': 31,
 'take': 581,
 'stance': 5,
 'drugs': 105,
 'family': 254,
 'similar': 46,
 'way': 641,
 'moms': 25,
 'dad': 68,
 'depression': 22159,
 'yesterday': 56,
 'though': 172,
 'thing': 531,
 'sisters': 9,
 'need': 761,
 'would': 2302,
 'never': 631,
 'worst': 242,
 'supporti': 19,
 'anymore': 127,
 'great': 784,
 'stupi': 92,
 'wow': 119,
 'rw': 36,
 'narrati': 17,
 'helpful': 49,
 'public': 54,
 'really': 982,
 'understand': 280,
 'harmfult': 1,
 'overall': 20,
 'lot': 432,
 'every': 400,
 'topic': 34,
 'part': 623,
 'show': 610,
 'supposed': 55,
 'gui': 49,
 'illness': 267,
 'went': 215,
 'wouldeline': 2,
 'insight': 4,
 'people': 1746,
 'mental': 874,
 'horror': 18,
 'got': 835,
 'one': 1158,
 'ways': 133,
 'source': 47,
 'changeyourmindchangeyourcircumstances': 1,
 'circumstances': 17,
 'strength': 47,
 'rain': 35,
 'stronger': 37,
 'learning': 52,
 'dance': 30,
 'uncoveringthenewu': 1,
 'see': 664,
 'andor': 42,
 'likee': 1,
 'say': 523,
 'wan': 205,
 'ptsd': 298,
 'pla

In [None]:
# dict contains 182 unique words in the range of 0.95 and 0.01
# this contains 12,294 unique words, words that appeared once or in every document are removed
tweets_df_dict = clean_dfs(tweets_df_dict, total_docs)
tweets_df_dict

{'Description': 4,
 'Female': 4,
 'TM': 3,
 'aa': 19,
 'aaic': 6,
 'aap': 2,
 'aaron': 2,
 'aatherapist': 2,
 'ab': 14,
 'abandoned': 8,
 'abandoning': 3,
 'abandonment': 7,
 'abeg': 3,
 'aber': 4,
 'abet': 3,
 'abi': 4,
 'abilities': 6,
 'ability': 52,
 'abit': 2,
 'able': 151,
 'abnormal': 3,
 'aboard': 2,
 'abolished': 3,
 'abolishing': 3,
 'aboriginal': 4,
 'aborted': 2,
 'abortion': 9,
 'abortions': 5,
 'abound': 4,
 'abraham': 2,
 'abroad': 6,
 'abruptly': 3,
 'abs': 2,
 'abscbn': 2,
 'absence': 7,
 'absent': 6,
 'absolute': 31,
 'absolutely': 85,
 'absorb': 2,
 'absorbed': 4,
 'abstract': 2,
 'absurd': 9,
 'abt': 40,
 'abundance': 4,
 'abundant': 3,
 'abuse': 110,
 'abused': 19,
 'abusi': 17,
 'abusing': 3,
 'abutin': 2,
 'abyss': 10,
 'ac': 5,
 'aca': 3,
 'academic': 10,
 'academy': 4,
 'acc': 5,
 'accelerate': 2,
 'accelerates': 2,
 'accept': 36,
 'acceptable': 2,
 'acceptance': 29,
 'accepted': 19,
 'accepting': 12,
 'access': 26,
 'accessibility': 2,
 'accessible': 4,
 'acch

In [None]:
tweets_idf_dict = compute_idfs(tweets_df_dict, total_docs)
tweets_idf_dict

{'Description': 8.42639282708974,
 'Female': 8.42639282708974,
 'TM': 8.649536378403951,
 'aa': 7.04009846596985,
 'aaic': 8.089920590468529,
 'aap': 8.937218450855731,
 'aaron': 8.937218450855731,
 'aatherapist': 8.937218450855731,
 'ab': 7.3277805384216315,
 'abandoned': 7.838606162187622,
 'abandoning': 8.649536378403951,
 'abandonment': 7.956389197844005,
 'abeg': 8.649536378403951,
 'aber': 8.42639282708974,
 'abet': 8.649536378403951,
 'abi': 8.42639282708974,
 'abilities': 8.089920590468529,
 'ability': 6.06553882597172,
 'abit': 8.937218450855731,
 'able': 5.011950218677565,
 'abnormal': 8.649536378403951,
 'aboard': 8.937218450855731,
 'abolished': 8.649536378403951,
 'abolishing': 8.649536378403951,
 'aboriginal': 8.42639282708974,
 'aborted': 8.937218450855731,
 'abortion': 7.733245646529795,
 'abortions': 8.244071270295786,
 'abound': 8.42639282708974,
 'abraham': 8.937218450855731,
 'abroad': 8.089920590468529,
 'abruptly': 8.649536378403951,
 'abs': 8.937218450855731,
 'a

In [None]:
tf_idf_matrix = compute_tf_idfs(corpus, labels, tweets_idf_dict, total_docs)

In [None]:
tweet, vector, label = tf_idf_matrix[1][0], tf_idf_matrix[1][1], tf_idf_matrix[1][2]
print(f"Tweet:\n{tweet}\n\ntf-idf vector:\n{vector}\n\nVector length:\n{len(vector)}\n\nLabel:\n{label}")

Tweet:
part really harmfult lot people went every gui wouldeline understand rw horror show supposed insight depression mental illness overall helpful public narrati topic

tf-idf vector:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [None]:
len(tf_idf_matrix)

22830

## Models
    The data is divided into a 75/25 ratio.
    Multinomial Naive Bayes
    SVM
    RandomForest

In [None]:
target_names = ['Depresive', 'Non-Depressive']
tweets = [row[0] for row in tf_idf_matrix]
vectors = [row[1] for row in tf_idf_matrix]
labels = [row[2] for row in tf_idf_matrix]

In [None]:
vectors_bayes, labels_bayes = vectors[:], labels[:]
X_train, X_test, y_train, y_test = train_test_split(vectors_bayes, labels_bayes, test_size=0.25, random_state=42)

In [None]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
X_pred = naive_bayes_model.predict(X_test)

In [None]:
# Multinomial Naive Bayes model
print(classification_report(y_test, X_pred, target_names=target_names))

                precision    recall  f1-score   support

     Depresive       0.83      0.95      0.89      4588
Non-Depressive       0.50      0.18      0.27      1120

      accuracy                           0.80      5708
     macro avg       0.66      0.57      0.58      5708
  weighted avg       0.76      0.80      0.76      5708



In [None]:
vectors_svm, labels_svm = vectors[:], labels[:]
X_train, X_test, y_train, y_test = train_test_split(vectors_svm, labels_svm, test_size=0.25, random_state=42)

In [None]:
svm_scv_model = SVC(kernel='linear')
svm_scv_model.fit(X_train, y_train)
X_pred = svm_scv_model.predict(X_test)

In [None]:
# SVM model
print(classification_report(y_test, X_pred, target_names=target_names))

                precision    recall  f1-score   support

     Depresive       0.88      0.93      0.90      4588
Non-Depressive       0.60      0.46      0.52      1120

      accuracy                           0.83      5708
     macro avg       0.74      0.69      0.71      5708
  weighted avg       0.82      0.83      0.83      5708



In [None]:
vectors_random_forest, labels_random_forest = vectors[:], labels[:]
X_train, X_test, y_train, y_test = train_test_split(vectors_random_forest, labels_random_forest, test_size=0.25, random_state=42)

In [None]:
random_forest_model = RandomForestClassifier(class_weight='balanced', max_depth=5)
random_forest_model.fit(X_train, y_train)
X_pred = random_forest_model.predict(X_test)

In [None]:
# Random Forest model
print(classification_report(y_test, X_pred, target_names=target_names))

                precision    recall  f1-score   support

     Depresive       0.91      0.76      0.83      4588
Non-Depressive       0.42      0.69      0.52      1120

      accuracy                           0.75      5708
     macro avg       0.66      0.73      0.68      5708
  weighted avg       0.81      0.75      0.77      5708



In [None]:
X_cv, y_cv = vectors[:], labels[:]

In [None]:
scoring = ['accuracy', 'precision', 'recall', 'f1', 'f1_micro', 'f1_macro']
svm_cross_validation = SVC(kernel='rbf')
cv_results = cross_validate(svm_cross_validation, X_cv, y_cv, scoring=scoring, cv=10, verbose=3)

[CV] END  accuracy: (test=0.867) f1: (test=0.510) f1_macro: (test=0.716) f1_micro: (test=0.867) precision: (test=0.863) recall: (test=0.362) total time=48.8min
[CV] END  accuracy: (test=0.873) f1: (test=0.546) f1_macro: (test=0.736) f1_micro: (test=0.873) precision: (test=0.858) recall: (test=0.400) total time=47.3min
[CV] END  accuracy: (test=0.860) f1: (test=0.490) f1_macro: (test=0.705) f1_micro: (test=0.860) precision: (test=0.806) recall: (test=0.352) total time=48.9min
[CV] END  accuracy: (test=0.859) f1: (test=0.463) f1_macro: (test=0.691) f1_micro: (test=0.859) precision: (test=0.858) recall: (test=0.317) total time=44.3min
[CV] END  accuracy: (test=0.853) f1: (test=0.440) f1_macro: (test=0.678) f1_micro: (test=0.853) precision: (test=0.815) recall: (test=0.301) total time=44.8min
[CV] END  accuracy: (test=0.859) f1: (test=0.466) f1_macro: (test=0.692) f1_micro: (test=0.859) precision: (test=0.844) recall: (test=0.322) total time=43.9min
[CV] END  accuracy: (test=0.858) f1: (te

In [None]:
filename = '../models/cv_model.joblib'
pickle.dump(svm_cross_validation, open(filename, 'wb'))

### Over-sampling
    SMOTE and ADASYN techniques are used for synthethic data re-generation.

In [None]:
X, y = vectors, labels
Counter(labels)

Counter({0: 18453, 1: 4377})

In [None]:
smote_sampler = SMOTE(random_state=42)
X_res, y_res = smote_sampler.fit_resample(X, y)

In [None]:
Counter(y_res)

Counter({0: 18453, 1: 18453})

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.25, random_state=42)

In [None]:
svm_scv_model = SVC(kernel='linear')
svm_scv_model.fit(X_train, y_train)
X_pred = svm_scv_model.predict(X_test)

In [None]:
# Oversampling with SMOTE using SVM
print(classification_report(y_test, X_pred, target_names=target_names))

                precision    recall  f1-score   support

     Depresive       0.92      0.82      0.87      4563
Non-Depressive       0.84      0.93      0.88      4664

      accuracy                           0.88      9227
     macro avg       0.88      0.87      0.87      9227
  weighted avg       0.88      0.88      0.87      9227



In [None]:
X, y = vectors, labels
Counter(labels)

Counter({0: 18453, 1: 4377})

In [None]:
smote_sampler = SMOTE(random_state=42)
X_res, y_res = smote_sampler.fit_resample(X, y)

In [None]:
Counter(y_res)

Counter({0: 18453, 1: 18453})

In [None]:
# Oversampling with SMOTE using SVM in 5-fold cross-validation
scoring = ['accuracy', 'precision', 'recall', 'f1', 'f1_micro', 'f1_macro']
svm_cross_validation = SVC(kernel='linear')
cv_results = cross_validate(svm_cross_validation, X_res, y_res, scoring=scoring, cv=5, verbose=3)

[CV] END  accuracy: (test=0.823) f1: (test=0.819) f1_macro: (test=0.822) f1_micro: (test=0.823) precision: (test=0.835) recall: (test=0.803) total time=152.1min
[CV] END  accuracy: (test=0.878) f1: (test=0.886) f1_macro: (test=0.877) f1_micro: (test=0.878) precision: (test=0.831) recall: (test=0.949) total time=137.5min
[CV] END  accuracy: (test=0.888) f1: (test=0.896) f1_macro: (test=0.887) f1_micro: (test=0.888) precision: (test=0.834) recall: (test=0.969) total time=130.5min
[CV] END  accuracy: (test=0.883) f1: (test=0.892) f1_macro: (test=0.882) f1_micro: (test=0.883) precision: (test=0.824) recall: (test=0.972) total time=132.0min
[CV] END  accuracy: (test=0.881) f1: (test=0.891) f1_macro: (test=0.880) f1_micro: (test=0.881) precision: (test=0.823) recall: (test=0.970) total time=117.3min


In [None]:
filename = 'svm_os_cv_model.joblib'
pickle.dump(svm_cross_validation, open(filename, 'wb'))