### `tf-idf_vectors`
This jupter notebook contains vectorization of tweets using tf-idf values.

In [1]:
import csv
import math
import pandas as pd
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn import metrics

In [2]:
# read not-stemmed preprocessed data set
df_preprocessed = pd.read_csv('../data/nst_preprocessed_tweets.csv')
df_preprocessed.head(5)

Unnamed: 0.1,Unnamed: 0,vader_sentiment_label,vader_score,tweet,tweet_length,url_link,pos_emoji,neg_emoji,profanity_word
0,0,0,-0.2699,wow dad yesterday take stupi would depression ...,278,0,0,0,0
1,1,0,-0.5995,part really harmfult lot people went every gui...,274,0,0,0,0
2,2,1,0.3382,one ways got depression learning dance rain so...,208,0,0,0,0
3,3,0,-0.8643,see wan na one say ptsd depression andor anxie...,114,0,0,0,0
4,4,0,-0.8316,clinical depression palpable hopelessness gene...,78,0,0,0,0


In [3]:
corpus = df_preprocessed['tweet'].tolist()
labels = df_preprocessed['vader_sentiment_label'].tolist()
total_docs = len(corpus)
total_docs, len(labels)

(22830, 22830)

In [9]:
temp1 = df_preprocessed.loc[df_preprocessed['vader_sentiment_label'] == 1]
temp2 = df_preprocessed.loc[df_preprocessed['vader_sentiment_label'] == 0]

print(len(temp1), len(temp2), (len(temp1) + len(temp2)))

4377 18453 22830


### Drop words and compute tf-idf
The tf-idf score of a word computed by:

    df = (num. of docs. contains) / (total docs.)
    tf = num. of. occurance of the word / num. of words in doc.
    idf = log( (num. of docs. contains) / ((total docs.) + 1) )
    tf-idf = tf * idf
These computations are held in a hash map.\
At first, the words outside the 0.01 to 0.95 doc. frequency range were removed.\
This resulted in a sharp decrease in the number of unique words, (182).\
As a result, the words that appeared once or appeared in every doc. are removed.\
The final number of unique words is 28,314.

In [5]:
def compute_dfs(tweets, total_docs):
    df_dict = {}
    #print(total_docs)
    
    for tweet in tweets:
        tweet = tweet.split(' ')
        for word in set(tweet):
            try:
                df_dict[word] += 1
            except KeyError:
                df_dict[word] = 1
    """
    for key, val in df_dict.items():
        num_does_contain = val
        df_dict[key] = (num_does_contain / total_docs)
    """
    return df_dict

In [6]:
def clean_dfs(tweets_df_dict, total_docs):
    #print(total_docs)
    
    for key in list(tweets_df_dict.keys()):
        # this is the latest change in the funct., uncomment if it needed
        #if tweets_dict[key] <= 0.01 or tweets_dict[key] >= 0.95:
        if tweets_df_dict[key] == 1 or tweets_df_dict[key] == total_docs:
            del tweets_df_dict[key]

    return dict(sorted(tweets_df_dict.items()))

In [7]:
def compute_idfs(tweets_df_dict, total_docs):
    idf_dict = {}
    #print(total_docs)
    
    for key, val in tweets_df_dict.items():
        num_does_contain = val
        idf_dict[key] = math.log(total_docs / (float(num_does_contain) + 1))
    return idf_dict

In [8]:
def compute_tf_idfs(tweets, labels, tweets_idf_dict, total_docs):
    tf_idf_matrix = []
    vector_size = len(tweets_idf_dict)
    
    for tweet, label in zip(tweets, labels):
        tweet = tweet.lower()
        tweet_words = tweet.split(' ')
        tweet_vector = [0.0] * vector_size
        
        for word in tweet_words:
            try:
                inverse_doc_freq = tweets_idf_dict[word]
            except KeyError:
                continue
            term_freq = tweet_words.count(word) / float(len(tweet_words))
            index = list(tweets_idf_dict.keys()).index(word)
            tweet_vector[index] = term_freq * inverse_doc_freq
        tf_idf_matrix.append([tweet, tweet_vector, label])
    return tf_idf_matrix

In [9]:
# this contains 28,314 unique words
tweets_df_dict = compute_dfs(corpus, total_docs)
tweets_df_dict

{'drugs': 105,
 'yesterday': 56,
 'would': 2302,
 'moms': 25,
 'dad': 68,
 'never': 631,
 'supporti': 19,
 'need': 761,
 'family': 254,
 'sisters': 9,
 'way': 641,
 'take': 581,
 'wow': 119,
 'stupi': 92,
 'anymore': 127,
 'worst': 242,
 'stance': 5,
 'thing': 531,
 'great': 784,
 'depression': 22159,
 'though': 172,
 'similar': 46,
 'absolute': 31,
 'lot': 432,
 'understand': 280,
 'really': 982,
 'show': 610,
 'harmfult': 1,
 'public': 54,
 'people': 1746,
 'part': 623,
 'insight': 4,
 'supposed': 55,
 'horror': 18,
 'narrati': 17,
 'helpful': 49,
 'overall': 20,
 'topic': 34,
 'went': 215,
 'rw': 36,
 'mental': 874,
 'gui': 49,
 'wouldeline': 2,
 'illness': 267,
 'every': 400,
 'strength': 47,
 'uncoveringthenewu': 1,
 'dance': 30,
 'stronger': 37,
 'rain': 35,
 'source': 47,
 'learning': 52,
 'circumstances': 17,
 'got': 835,
 'one': 1158,
 'changeyourmindchangeyourcircumstances': 1,
 'ways': 133,
 'ptsd': 298,
 'say': 523,
 'andor': 42,
 'play': 96,
 'likee': 1,
 'wan': 205,
 'na'

In [10]:
# dict contains 182 unique words in the range of 0.95 and 0.01
# this contains 12,294 unique words, words that appeared once or in every document are removed
tweets_df_dict = clean_dfs(tweets_df_dict, total_docs)
tweets_df_dict

{'Description': 4,
 'Female': 4,
 'TM': 3,
 'aa': 19,
 'aaic': 6,
 'aap': 2,
 'aaron': 2,
 'aatherapist': 2,
 'ab': 14,
 'abandoned': 8,
 'abandoning': 3,
 'abandonment': 7,
 'abeg': 3,
 'aber': 4,
 'abet': 3,
 'abi': 4,
 'abilities': 6,
 'ability': 52,
 'abit': 2,
 'able': 151,
 'abnormal': 3,
 'aboard': 2,
 'abolished': 3,
 'abolishing': 3,
 'aboriginal': 4,
 'aborted': 2,
 'abortion': 9,
 'abortions': 5,
 'abound': 4,
 'abraham': 2,
 'abroad': 6,
 'abruptly': 3,
 'abs': 2,
 'abscbn': 2,
 'absence': 7,
 'absent': 6,
 'absolute': 31,
 'absolutely': 85,
 'absorb': 2,
 'absorbed': 4,
 'abstract': 2,
 'absurd': 9,
 'abt': 40,
 'abundance': 4,
 'abundant': 3,
 'abuse': 110,
 'abused': 19,
 'abusi': 17,
 'abusing': 3,
 'abutin': 2,
 'abyss': 10,
 'ac': 5,
 'aca': 3,
 'academic': 10,
 'academy': 4,
 'acc': 5,
 'accelerate': 2,
 'accelerates': 2,
 'accept': 36,
 'acceptable': 2,
 'acceptance': 29,
 'accepted': 19,
 'accepting': 12,
 'access': 26,
 'accessibility': 2,
 'accessible': 4,
 'acch

In [11]:
tweets_idf_dict = compute_idfs(tweets_df_dict, total_docs)
tweets_idf_dict

{'Description': 8.42639282708974,
 'Female': 8.42639282708974,
 'TM': 8.649536378403951,
 'aa': 7.04009846596985,
 'aaic': 8.089920590468529,
 'aap': 8.937218450855731,
 'aaron': 8.937218450855731,
 'aatherapist': 8.937218450855731,
 'ab': 7.3277805384216315,
 'abandoned': 7.838606162187622,
 'abandoning': 8.649536378403951,
 'abandonment': 7.956389197844005,
 'abeg': 8.649536378403951,
 'aber': 8.42639282708974,
 'abet': 8.649536378403951,
 'abi': 8.42639282708974,
 'abilities': 8.089920590468529,
 'ability': 6.06553882597172,
 'abit': 8.937218450855731,
 'able': 5.011950218677565,
 'abnormal': 8.649536378403951,
 'aboard': 8.937218450855731,
 'abolished': 8.649536378403951,
 'abolishing': 8.649536378403951,
 'aboriginal': 8.42639282708974,
 'aborted': 8.937218450855731,
 'abortion': 7.733245646529795,
 'abortions': 8.244071270295786,
 'abound': 8.42639282708974,
 'abraham': 8.937218450855731,
 'abroad': 8.089920590468529,
 'abruptly': 8.649536378403951,
 'abs': 8.937218450855731,
 'a

In [12]:
tf_idf_matrix = compute_tf_idfs(corpus, labels, tweets_idf_dict, total_docs)

In [13]:
tweet, vector, label = tf_idf_matrix[1][0], tf_idf_matrix[1][1], tf_idf_matrix[1][2] 
print(f"Tweet:\n{tweet}\n\ntf-idf vector:\n{vector}\n\nVector length:\n{len(vector)}\n\nLabel:\n{label}")

Tweet:
part really harmfult lot people went every gui wouldeline understand rw horror show supposed insight depression mental illness overall helpful public narrati topic

tf-idf vector:
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.

In [14]:
len(tf_idf_matrix)

22830

## Models
    The data is divided into a 75/25 ratio.
    Multinomial Naive Bayes
    SVM
    RandomForest

In [15]:
target_names = ['Depresive', 'Non-Depressive']
tweets = [row[0] for row in tf_idf_matrix]
vectors = [row[1] for row in tf_idf_matrix]
labels = [row[2] for row in tf_idf_matrix]

In [16]:
vectors_bayes, labels_bayes = vectors[:], labels[:] 
X_train, X_test, y_train, y_test = train_test_split(vectors_bayes, labels_bayes, test_size=0.25, random_state=42)

In [17]:
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)
X_pred = naive_bayes_model.predict(X_test)

In [18]:
# Multinomial Naive Bayes model
print(classification_report(y_test, X_pred, target_names=target_names))

                precision    recall  f1-score   support

     Depresive       0.83      0.95      0.89      4588
Non-Depressive       0.50      0.18      0.27      1120

      accuracy                           0.80      5708
     macro avg       0.66      0.57      0.58      5708
  weighted avg       0.76      0.80      0.76      5708



In [20]:
vectors_svm, labels_svm = vectors[:], labels[:]
X_train, X_test, y_train, y_test = train_test_split(vectors_svm, labels_svm, test_size=0.25, random_state=42)

In [None]:
svm_scv_model = SVC(kernel='linear')
svm_scv_model.fit(X_train, y_train)
X_pred = svm_scv_model.predict(X_test)