In [1]:
import matplotlib.pyplot as plt
import csv
from textblob import TextBlob
import pandas as pd
import sklearn
import sys
import re
import numpy as np

#Must include these two lines for the code to compile initially
import nltk
nltk.download('punkt')

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

[nltk_data] Downloading package punkt to /home/willmsc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df = pd.read_csv('mental_health.csv', sep=',', quoting=csv.QUOTE_NONE)
print(df)

                                                    text      label
0      dear american teens question dutch person hear...    Healthy
1      nothing look forward lifei dont many reasons k...  Unhealthy
2      music recommendations im looking expand playli...    Healthy
3      im done trying feel betterthe reason im still ...  Unhealthy
4      worried  year old girl subject domestic physic...  Unhealthy
...                                                  ...        ...
27972  posting everyday people stop caring  religion ...    Healthy
27973  okay definetly need hear guys opinion ive pret...    Healthy
27974  cant get dog think ill kill myselfthe last thi...  Unhealthy
27975  whats point princess bridei really think like ...  Unhealthy
27976  got nudes person might might know snapchat do ...    Healthy

[27977 rows x 2 columns]


In [3]:
def split_into_tokens(message):
    return TextBlob(message).words

In [4]:
bag_of_words =  CountVectorizer(analyzer=split_into_tokens).fit(df['text'])
print(len(bag_of_words.vocabulary_))
print(bag_of_words)

72566
CountVectorizer(analyzer=<function split_into_tokens at 0x7f45fcb23378>,
                binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)


In [5]:
def prepare_data(text_train, text_test, bow_transformer):
    """
    Preprocesses the training and testing text by transforming them to their TF-IDF representation.

    Args:
        text_train: A list of training text.
        text_test: A list of testing text.
        bow_transformer: A bag-of-words (BOW) transformer fitted to the training data.

    Returns:
        A tuple containing the training text's TF-IDF representation and the testing text's TF-IDF representation.
    """

    # Transform the training text using the bag-of-words (BOW) transformer
    train_text_bow = bow_transformer.transform(text_train)

    # Fit a Term Frequency-Inverse Document Frequency (TF-IDF) transformer to the training text's BOW representation
    train_tfidf_transformer = TfidfTransformer().fit(train_text_bow)

    # Transform the training text to their TF-IDF representation
    train_text_tfidf = train_tfidf_transformer.transform(train_text_bow)

    # Transform the testing text using the bag-of-words (BOW) transformer
    test_text_bow = bow_transformer.transform(text_test)
    
    # Fit a Term Frequency-Inverse Document Frequency (TF-IDF) transformer to the testing text's BOW representation
    test_tfidf_transformer = TfidfTransformer().fit(test_text_bow)

    # Transform the testing text to their TF-IDF representation
    test_text_tfidf = test_tfidf_transformer.transform(test_text_bow)

    # Return a tuple containing the training and testing text's TF-IDF representations
    return (train_text_tfidf, test_text_tfidf)

In [6]:
text_train, text_test, label_train, label_test = \
    train_test_split(df['text'], df['label'], test_size=0.2)

In [14]:
#This is the line in which the Naive Bayes model is created and fed the answers for the 'training' data
#In essence, it is memorizing all the answers to each of the elements of the training data so when you enter a piece of testing data, it will determine the most likely ("closest") element in the training data, and return that element's label
#So it's pretty simple and might not perform that well, but it gives us a baseline, and if we are able to beat it, we will know our model is better than strict memorization!
data = prepare_data(text_train, text_test,bag_of_words)

mental_health_classifier = MultinomialNB().fit(data[0], label_train)

(<22381x72566 sparse matrix of type '<class 'numpy.float64'>'
	with 1244869 stored elements in Compressed Sparse Row format>, <5596x72566 sparse matrix of type '<class 'numpy.float64'>'
	with 312950 stored elements in Compressed Sparse Row format>)


In [52]:
text = "Hello my day is great"
answer = "Healthy"
new_data = bag_of_words.transform(pd.DataFrame({"text":[text], "label":[answer]})["text"])

In [54]:
predictions = mental_health_classifier.predict(new_data)
print(predictions)

['Healthy']


In [9]:
#If you are curious what the actual values/labels are, you can print them out here
print(text_test)
print(label_test)

18291    its selfish thing anyone dothats stepmother sa...
10904    need helpi know anymore  years since ive talke...
13391    im giving horny life aight nerds aint chief ho...
18399    need advicei want kill girl manipulates left b...
19487    helpive trying get contact therapist little we...
                               ...                        
3779     cant wake upi suicidal lately super hard trip ...
27958    bees smol one bee swarm one giant bee heck one...
18267    im doneim done everything im tired living earl...
4619     suicidal thoughts getting worseif see post acc...
1687     dead luckyi cant help think died lucky theres ...
Name: text, Length: 5596, dtype: object
18291    Unhealthy
10904    Unhealthy
13391      Healthy
18399    Unhealthy
19487    Unhealthy
           ...    
3779     Unhealthy
27958      Healthy
18267    Unhealthy
4619     Unhealthy
1687     Unhealthy
Name: label, Length: 5596, dtype: object


In [10]:
print(predictions)

['Unhealthy' 'Unhealthy' 'Healthy' ... 'Unhealthy' 'Unhealthy' 'Unhealthy']


In [11]:
print('Accuracy', accuracy_score(label_test, predictions))
print('Recall', recall_score(label_test, predictions,average="binary", pos_label = "Healthy"))
print('Precision', precision_score(label_test, predictions,average="binary", pos_label = "Healthy"))

Accuracy 0.8422087205146533
Recall 0.7043048694424842
Precision 0.9779519843214111
