## Import Relevant Libraries

In [None]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import string
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Import Dataset

In [None]:
# this cell downloads the depression dataset.... I couldn't figure out a more efficient method of downloading all data with one command
!wget https://zenodo.org/record/3941387/files/depression_2018_features_tfidf_256.csv?download=1 -O depression_2018.csv
#!wget https://zenodo.org/record/3941387/files/depression_2019_features_tfidf_256.csv?download=1 -O depression_2019.csv
#!wget https://zenodo.org/record/3941387/files/depression_post_features_tfidf_256.csv?download=1 -O depression_post.csv
#!wget https://zenodo.org/record/3941387/files/depression_pre_features_tfidf_256.csv?download=1 -O depression_pre.csv

--2022-05-21 05:42:58--  https://zenodo.org/record/3941387/files/depression_2018_features_tfidf_256.csv?download=1
Resolving zenodo.org (zenodo.org)... 137.138.76.77
Connecting to zenodo.org (zenodo.org)|137.138.76.77|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 74323253 (71M) [text/plain]
Saving to: ‘depression_2018.csv’


2022-05-21 05:43:03 (18.4 MB/s) - ‘depression_2018.csv’ saved [74323253/74323253]



In [None]:
# List the working directory to confirm the downloaded CSV is present.
!ls

depression_2018.csv  sample_data


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no cell visible in this notebook reads from /content/drive —
# confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Datasets to Dataframes

In [None]:
# Read the 2018 depression CSV from the working directory into a DataFrame.
depression_2018_df = pd.read_csv("depression_2018.csv")
# The other splits are not loaded yet.
#depression_2019_df = pd.read_csv("depression_2019.csv")
#depression_post_df = pd.read_csv("depression_post.csv")
#depression_pre_df = pd.read_csv("depression_pre.csv")


# Display the column labels of the loaded frame.
depression_2018_df.columns

Index(['subreddit', 'author', 'date', 'post', 'automated_readability_index',
       'coleman_liau_index', 'flesch_kincaid_grade_level',
       'flesch_reading_ease', 'gulpease_index', 'gunning_fog_index',
       ...
       'tfidf_wish', 'tfidf_without', 'tfidf_wonder', 'tfidf_work',
       'tfidf_worri', 'tfidf_wors', 'tfidf_would', 'tfidf_wrong',
       'tfidf_x200b', 'tfidf_year'],
      dtype='object', length=350)

In [None]:
# Preview the first rows of the loaded data. The bare name `depression` that
# was here is never defined, so it raised NameError on a fresh run.
depression_2018_df.head()

In [None]:
# List of column headers
#list(depression_2018_df.columns.values)

In [None]:
# Number of author entries (duplicates counted as individuals).
# Series.count() tallies the non-null values of the "author" column; the
# previous .apply(list) split every username into characters instead of counting.
num_author_entries = depression_2018_df["author"].count()
print(num_author_entries)

0                              [h, a, n, n, g, a, p, o, i]
1            [t, h, r, 0, w, 4, w, 4, y, 9, 8, 9, 8, 9, 7]
2                     [m, o, o, s, e, m, a, n, 1, 8, 0, 0]
3         [t, h, r, o, w, a, w, a, y, 1, 8, 0, 2, 3, 6, 0]
4                              [a, t, e, m, y, l, i, f, e]
                               ...                        
24530    [f, i, f, t, y, s, h, a, d, e, s, o, f, l, a, ...
24531                          [M, A, K, I, K, A, T, T, A]
24532                       [5, 6, 2, 0, 7, 0, 5, 8, 7, 0]
24533                       [m, x, q, u, e, e, n, 9, 7, 6]
24534                          [i, R, S, u, p, a, m, a, n]
Name: author, Length: 24535, dtype: object


In [None]:
# Number of unique authors (duplicates disregarded)
print(depression_2018_df['author'].nunique())
# Total number of rows, printed for comparison with the unique-author count
print(len(depression_2018_df))

24535
24535


## Example of a post from the depression dataset

In [None]:
# Fetch the "post" text stored under row label 1 in a single .loc lookup,
# then display it as the cell output.
depression_post = depression_2018_df.loc[1, "post"]
depression_post

"I give up. Failed I just want to wipe my hands clean of the bare minimum responsibility for my existence. It's so fucking simple. It's very much laughable. Take my organs! Please! Lol It's more than a fair deal! Consider the alternative! It's so stupid. I'm stupid. Society's stupid. \n\nI don't endorse people taking their lives. This eyeblink of existence is all we have. It's literally a miracle each one of us is conscious. But for the same reason I shouldn't become waste through inaction. It's the best I can do."

In [None]:
# All depression reddit posts as a plain Python list, in row order

depression_posts = list(depression_2018_df["post"])

## Next step is to find the most common words in all posts related to depression

Some Questions and Planning:

* How should we preprocess the data? 
* Clean the data into the desired format first, preferably merging all 15 mental health support groups into the same dataset
* then doing a train / test split
* perform the word search


* we need to establish DSM-5 criteria
* decide how k-NN will be performed for classification

* k-fold cross-validation to find the best k-NN setting

* as we learn SVM, perform cross validation on it and compare to k-NN classification performance

### Data Cleaning and Preprocessing

In [None]:
#functions to remove stopwords from posts

def remove_stops(text, stops):
  """Remove stop words, punctuation and digits from a single document.

  Stop words are matched case-insensitively, and a token is also compared
  with its leading/trailing punctuation removed, so "I" and "up." are
  dropped when "i" and "up" are in ``stops``.

  Args:
    text: The document as a string; it is split on whitespace.
    stops: Iterable of stop words to drop.

  Returns:
    The remaining words joined by single spaces, with every
    ``string.punctuation`` character and every digit removed and no leading
    or trailing space.
  """
  # A set gives O(1) membership tests; lowercasing makes matching
  # case-insensitive.
  stop_set = {stop.lower() for stop in stops}
  punctuation_table = str.maketrans("", "", string.punctuation)

  kept = []
  for word in text.split():
    lowered = word.lower()
    # Check the raw token (keeps contractions such as "don't" matchable) and
    # the token without surrounding punctuation (so "up." matches "up").
    if lowered in stop_set or lowered.strip(string.punctuation) in stop_set:
      continue
    cleaned = word.translate(punctuation_table)
    cleaned = "".join(ch for ch in cleaned if not ch.isdigit())
    # Tokens made only of punctuation/digits become empty; skipping them
    # avoids the double spaces the result would otherwise contain.
    if cleaned:
      kept.append(cleaned)
  return " ".join(kept)

def clean_docs(docs):
  """Clean every document in ``docs`` with ``remove_stops``.

  Args:
    docs: Iterable of document strings.

  Returns:
    A list with one cleaned string per input document, in the same order.
  """
  # Build the NLTK English stop-word collection once, as a set, so the
  # per-word membership tests in remove_stops are O(1).
  stops = set(stopwords.words("english"))
  # The extra list in /content/stop_words_english.txt is not applied here;
  # the TF-IDF cell passes it to the vectorizer instead.
  return [remove_stops(doc, stops) for doc in docs]

In [None]:
# Apply the stop-word, punctuation and digit cleaning to every post.
cleaned_depression_docs = clean_docs(depression_posts)

### compare clean vs unclean

In [None]:
# Post at index 1 after cleaning
cleaned_depression_docs[1]

'I give up Failed I want wipe hands clean bare minimum responsibility existence Its fucking simple Its much laughable Take organs Please Lol Its fair deal Consider alternative Its stupid Im stupid Societys stupid I endorse people taking lives This eyeblink existence have Its literally miracle one us conscious But reason I become waste inaction Its best I do'

In [None]:
# The same post (index 1) before cleaning
depression_posts[1]

"I give up. Failed I just want to wipe my hands clean of the bare minimum responsibility for my existence. It's so fucking simple. It's very much laughable. Take my organs! Please! Lol It's more than a fair deal! Consider the alternative! It's so stupid. I'm stupid. Society's stupid. \n\nI don't endorse people taking their lives. This eyeblink of existence is all we have. It's literally a miracle each one of us is conscious. But for the same reason I shouldn't become waste through inaction. It's the best I can do."

### TF-IDF attempt

In [None]:

from sklearn.feature_extraction import text

# Extend scikit-learn's built-in English stop words with a custom list read
# from a one-word-per-line file. The file must already exist at this path.
with open('/content/stop_words_english.txt', 'r') as f:
    # Lowercase to match the vectorizer's lowercasing of the documents, and
    # skip blank lines so the empty string is not added as a stop word.
    more_stop_words = [line.strip().lower() for line in f if line.strip()]
my_stop_words = text.ENGLISH_STOP_WORDS.union(more_stop_words)

# Recent scikit-learn releases validate stop_words as 'english', a list or
# None, so the frozenset is converted; sorting keeps the list deterministic.
vectorizer = TfidfVectorizer(
                                lowercase=True,
                                max_features=300,
                                stop_words=sorted(my_stop_words))

vectors = vectorizer.fit_transform(cleaned_depression_docs)
feature_names = vectorizer.get_feature_names_out()
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
dense = vectors.toarray()
# Nested-list copy of the scores, kept for the keyword-extraction cell.
denselist = dense.tolist()
# Build the frame from the array directly instead of the list of lists.
depression_tfidf = pd.DataFrame(dense, columns=feature_names)

  % sorted(inconsistent)


In [None]:
# Display the TF-IDF matrix: one row per post, one column per retained term.
depression_tfidf

Unnamed: 0,absolutely,advice,afraid,age,alive,angry,antidepressants,anxiety,anxious,asked,...,working,worse,worst,worth,worthless,write,writing,wrong,year,years
0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.152159,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.264361,0.114179
1,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000
2,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000
3,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000
4,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.757310,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24530,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.161897
24531,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.087108,0.0,0.0,...,0.0,0.086000,0.000000,0.0,0.0,0.1242,0.0,0.000000,0.000000,0.130731
24532,0.0,0.000000,0.0,0.0,0.413265,0.208784,0.0,0.000000,0.0,0.0,...,0.0,0.145713,0.188418,0.0,0.0,0.0000,0.0,0.000000,0.000000,0.000000
24533,0.0,0.107736,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.000000,0.116968,0.0,0.0,0.0000,0.0,0.102612,0.159185,0.137505


In [None]:
# For every post, keep only the vocabulary terms whose TF-IDF score is above zero

all_keywords = [
  [feature_names[col] for col, score in enumerate(row_scores) if score > 0]
  for row_scores in denselist
]

In [None]:
# Keywords (terms with a non-zero TF-IDF score) found in the post at index 1
print(all_keywords[1])

['deal', 'fucking', 'literally', 'lives', 'people', 'reason', 'stupid']


In [None]:
## Rank the features by their TF-IDF score summed over all posts and keep the n highest

top_n = 300
top_n_features = sorted(
    zip(feature_names, vectors.sum(0).getA1()),
    key=lambda name_and_score: name_and_score[1],
    reverse=True,
)[:top_n]

In [None]:
#top_n_features

In [None]:
# Print the (term, summed score) pair for the term 'suicide', if it was retained
for term, score in top_n_features:
  if term == 'suicide':
    print((term, score))

('suicide', 398.4865341244319)


In [None]:
# Seed words from the 2018 depression dataset: the terms of the ranked
# TF-IDF features, in ranking order
my_seed_words = [term_and_score[0] for term_and_score in top_n_features]
print(my_seed_words)

['feel', 'life', 'depression', 'ive', 'people', 'time', 'friends', 'don', 'day', 'years', 'depressed', 'feeling', 'good', 'work', 'talk', 've', 'school', 'happy', 'year', 'hate', 'job', 'bad', 'love', 'fucking', 'family', 'friend', 'person', 'shit', 'dont', 'days', 'hard', 'felt', 'thoughts', 'point', 'thought', 'long', 'months', 'today', 'lot', 'care', 'worse', 'live', 'anxiety', 'started', 'feels', 'parents', 'die', 'sad', 'told', 'tired', 'sleep', 'ill', 'lost', 'thinking', 'bed', 'wanted', 'college', 'fuck', 'start', 'week', 'kill', 'suicide', 'reason', 'pretty', 'mind', 'night', 'hope', 'times', 'living', 'suicidal', 'wrong', 'pain', 'left', 'place', 'post', 'head', 'advice', 'understand', 'talking', 'kind', 'weeks', 'relationship', 'mom', 'guess', 'social', 'mental', 'scared', 'making', 'high', 'lonely', 'finally', 'real', 'therapy', 'hours', 'normal', 'working', 'great', 'girl', 'money', 'problems', 'feelings', 'girlfriend', 'motivation', 'close', 'therapist', 'change', 'leave',

**TODO:** Go through all studies and create a comprehensive list of all keywords.

In [None]:
# TF-IDF seed words from existing study
# TODO(review): cite the study these ten words were taken from.
true_seed_words = ['myself', 'really', 'depression', 'hope', 'life', 'forever', 'pain', 'sad', 'live', 'mood']

In [None]:
# Calculate Cosine Similarity between the two seed word lists
from collections import Counter

# count word occurrences (every count is 1 when a list has no repeated words)
our_vals = Counter(my_seed_words)
true_vals = Counter(true_seed_words)

# convert to word-vectors over the combined vocabulary of both lists;
# the set union has no fixed order, which does not affect the cosine
words = list(our_vals.keys() | true_vals.keys())
our_vect = [our_vals.get(word, 0) for word in words]
true_vect = [true_vals.get(word, 0) for word in words]

# find cosine: dot product divided by the product of the vector lengths
len_our = sum(av*av for av in our_vect) ** 0.5
len_true = sum(bv*bv for bv in true_vect) ** 0.5
dot = sum(av*bv for av, bv in zip(our_vect, true_vect))
# an empty seed list has length zero; report 0.0 instead of dividing by zero
norm_product = len_our * len_true
cosine = dot / norm_product if norm_product else 0.0


In [None]:
# Cosine similarity between our seed words and the study's seed words
print(cosine)

0.12780193008453875
