Test

In [10]:
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas as pd
import sklearn
import sys
import re
import numpy as np

#Must include these two lines for the code to compile initially
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

[nltk_data] Downloading package punkt to
[nltk_data]     /home/bloomgardeni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
#Simply puts the data into a dataframe
df = pd.read_csv('mental_health.csv', sep=',', quoting=csv.QUOTE_NONE)
print(df)

                                                    text      label
0      dear american teens question dutch person hear...    Healthy
1      nothing look forward lifei dont many reasons k...  Unhealthy
2      music recommendations im looking expand playli...    Healthy
3      im done trying feel betterthe reason im still ...  Unhealthy
4      worried  year old girl subject domestic physic...  Unhealthy
...                                                  ...        ...
27972  posting everyday people stop caring  religion ...    Healthy
27973  okay definetly need hear guys opinion ive pret...    Healthy
27974  cant get dog think ill kill myselfthe last thi...  Unhealthy
27975  whats point princess bridei really think like ...  Unhealthy
27976  got nudes person might might know snapchat do ...    Healthy

[27977 rows x 2 columns]


In [12]:
#Simple tokenizer since the data is already lemmized!
def split_into_tokens(message):
    return TextBlob(message).words

In [16]:
#once you create the bag of words, you don't have to do it again, so I removed it from the prepare_data function
bag_of_words =  CountVectorizer(analyzer=split_into_tokens).fit(df['text'])
print(len(bag_of_words.vocabulary_))
print(bag_of_words)

72566
CountVectorizer(analyzer=<function split_into_tokens at 0x7fa9411bb130>)


In [17]:
#this is by far the most dense cell, I got chatgpt to comment it!
#I hope this makes it a little more readable, it's a bit different that what we did with out lab!
#Also don't worry, all the code is mine, only the comments are plagerized :)

def prepare_data(text_train, text_test, bow_transformer):
    """
    Preprocesses the training and testing text by transforming them to their TF-IDF representation.

    Args:
        text_train: A list of training text.
        text_test: A list of testing text.
        bow_transformer: A bag-of-words (BOW) transformer fitted to the training data.

    Returns:
        A tuple containing the training text's TF-IDF representation and the testing text's TF-IDF representation.
    """

    # Transform the training text using the bag-of-words (BOW) transformer
    train_text_bow = bow_transformer.transform(text_train)

    # Fit a Term Frequency-Inverse Document Frequency (TF-IDF) transformer to the training text's BOW representation
    train_tfidf_transformer = TfidfTransformer().fit(train_text_bow)

    # Transform the training text to their TF-IDF representation
    train_text_tfidf = train_tfidf_transformer.transform(train_text_bow)

    # Transform the testing text using the bag-of-words (BOW) transformer
    test_text_bow = bow_transformer.transform(text_test)

    # Fit a Term Frequency-Inverse Document Frequency (TF-IDF) transformer to the testing text's BOW representation
    test_tfidf_transformer = TfidfTransformer().fit(test_text_bow)

    # Transform the testing text to their TF-IDF representation
    test_text_tfidf = test_tfidf_transformer.transform(test_text_bow)

    # Return a tuple containing the training and testing text's TF-IDF representations
    return (train_text_tfidf, test_text_tfidf)


In [18]:
#fiddle with this to change the test-train split
text_train, text_test, label_train, label_test = \
    train_test_split(df['text'], df['label'], test_size=0.2)

In [19]:
#This is the line in which the Naive Bayes model is created and fed the answers for the 'training' data
#In essence, it is memorizing all the answers to each of the elements of the training data so when you enter a piece of testing data, it will determine the most likely ("closest") element in the training data, and return that element's label
#So it's pretty simple and might not perform that well, but it gives us a baseline, and if we are able to beat it, we will know our model is better than strict memorization!
data = prepare_data(text_train, text_test,bag_of_words)
mental_health_classifier = MultinomialNB().fit(data[0], label_train)

In [20]:
#This line simply generates the predicted label for all the elements of the training data
predictions = mental_health_classifier.predict(data[1])

In [24]:
#If you are curious what the actual values/labels are, you can print them out here
print(text_test)
print(label_test)

18988    tender beautifully crafted production delved d...
17925    compare movie industry ocean tendencies observ...
960      losing fighti feel worthless alone want burden...
11575    want future im tiredi want live long brain ada...
14483    millionaire happy family apparent reason depre...
                               ...                        
4665     ji trnka made last animated short indictment t...
8351     youre poor spend two hours finding coupon code...
14913    fear break broke meive thought redflag years d...
19977    know want kill myself nearly kill myselfi feel...
11978    im glad exist whoevers reading this thats want...
Name: text, Length: 5596, dtype: object
18988      Healthy
17925      Healthy
960      Unhealthy
11575    Unhealthy
14483    Unhealthy
           ...    
4665       Healthy
8351       Healthy
14913    Unhealthy
19977    Unhealthy
11978      Healthy
Name: label, Length: 5596, dtype: object


In [25]:
#And then you can manually compare them to the predictions
print(predictions)
#As you can see, it's kinda alright, but not perfect, but good enough!

['Healthy' 'Healthy' 'Unhealthy' ... 'Unhealthy' 'Unhealthy' 'Unhealthy']


In [26]:
#Since I also imported a bunch of accuracy functions, you can use them here to check the success rate!
print('Accuracy', accuracy_score(label_test, predictions))
print('Recall', recall_score(label_test, predictions,average="binary", pos_label = "Healthy"))
print('Precision', precision_score(label_test, predictions,average="binary", pos_label = "Healthy"))
#Feel free to add more if you feel like!
#Also, for our presentation, if you have time, you can graph how the data looks, it might be cool
#There are a lot of ways you can go about that, so I'll let you determine what you want to do!

Accuracy 0.8402430307362402
Recall 0.6979534227240649
Precision 0.9811507936507936
