In [2]:
import re
import string
import pickle
import unicodedata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [3]:
## sklearn
import sklearn.svm as svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

## Scipy
from scipy.sparse import csr_matrix

In [4]:
top_grams = pd.DataFrame()
metric = pd.DataFrame(index=["Precision","Recall","Accuracy", "F1"])

# Load Dataset

In [5]:
data_folder = '../Data/reddit/title/'
positive_file_names = ['depression', 'SuicideWatch', 'anxiety', 'ask_reddit', 'psychosis', 'stress']
negative_file_names = ['ask_reddit']
file_extension = '.txt'

# Load positive dataframe
pos_df = pd.read_csv(filepath_or_buffer=data_folder + positive_file_names[0] + file_extension, sep='❖', quotechar='⩐', header =None, names =['text'], error_bad_lines=False)
pos_df['source'] = positive_file_names[0]
pos_df['label'] = 1

neg_df = pd.read_csv(filepath_or_buffer=data_folder + negative_file_names[0] + file_extension, sep='❖', quotechar='⩐', header =None, names =['text'], error_bad_lines=False)
## Balance the positive and negative samples
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=1)
neg_df['source'] = negative_file_names[0]
neg_df['label'] = 0

df = pd.concat([pos_df, neg_df], ignore_index=True)
df

  return func(*args, **kwargs)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,text,source,label
0,Just my thoughts,depression,1
1,Why am I not happy although i’m getting some g...,depression,1
2,"I’m not ungrateful, but i’m not happy",depression,1
3,Sad music that will make you love yourself,depression,1
4,Everything I try to do I lose interest in imme...,depression,1
...,...,...,...
956313,Would you rather be stranded out in space or s...,ask_reddit,0
956314,What did you see on the internet recently that...,ask_reddit,0
956315,"If you were to die right now, what would be yo...",ask_reddit,0
956316,You can push two buttons. The first one will m...,ask_reddit,0


# Text preprocessing
- Tokenizes
- Make text lowercase
- Removes hyperlinks
- Remove punctuation
- Removes numbers
- Removes useless words "stopwords"
- Stemming/Lemmatization

In [6]:
stop_words = stopwords.words('english')
stemmer    = nltk.SnowballStemmer("english")

In [7]:
def clean_text(text):
    '''
        Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.
    '''
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def preprocess_data(text):
    stop_words = stopwords.words('english')
    stemmer    = nltk.SnowballStemmer("english")
    text = clean_text(text)                                                     # Clean puntuation, urls, and so on
    text = ' '.join(word for word in text.split() if word not in stop_words)    # Remove stopwords
    text = ' '.join(stemmer.stem(word) for word in text.split())                # Stemm all the words in the sentence
    return text

In [8]:
df['clean_text'] = df.text.astype(str).apply(preprocess_data)
df

Unnamed: 0,text,source,label,clean_text
0,Just my thoughts,depression,1,thought
1,Why am I not happy although i’m getting some g...,depression,1,happi although im get good news nnmi brain ful...
2,"I’m not ungrateful, but i’m not happy",depression,1,im ungrat im happi
3,Sad music that will make you love yourself,depression,1,sad music make love
4,Everything I try to do I lose interest in imme...,depression,1,everyth tri lose interest immedi passion energ...
...,...,...,...,...
956313,Would you rather be stranded out in space or s...,ask_reddit,0,would rather strand space strand deep depth ocean
956314,What did you see on the internet recently that...,ask_reddit,0,see internet recent made smile
956315,"If you were to die right now, what would be yo...",ask_reddit,0,die right would last wish
956316,You can push two buttons. The first one will m...,ask_reddit,0,push two button first one make everyon believ ...


# Top n-gram features

In [9]:
df['split'] = np.random.choice(["train", "val", "test"], size=df.shape[0], p=[.7, .15, .15])
x_train = df[df["split"] == "train"]
y_train = x_train["label"]
x_val = df[df["split"] == "val"]
y_val = x_val["label"]


In [10]:
## Training pipeline
tf_idf = Pipeline([
     ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
     ("classifier", svm.LinearSVC(C=1.0, class_weight="balanced"))
 ])

tf_idf.fit(x_train["clean_text"], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ('classifier', LinearSVC(class_weight='balanced'))])

In [11]:
## Confidence measure
f1_score(y_val, tf_idf.predict(x_val["clean_text"]))

0.9228514878988854

In [12]:
coefs = tf_idf.named_steps["classifier"].coef_
if type(coefs) == csr_matrix:
    coefs.toarray().tolist()[0]
else:
    coefs.tolist()
    
feature_names = tf_idf.named_steps["tfidf"].get_feature_names()
coefs_and_features = list(zip(coefs[0], feature_names))

In [13]:
top_grams = pd.DataFrame()
top_grams["anxity"] = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:20]
for x in top_grams["anxity"]:
    print(x[0], '\t',x[1])

7.361898069467513 	 im
6.992189685751805 	 depress
6.970554594977517 	 nni
6.551149157993388 	 ive
4.967404285735691 	 antidepress
4.3769586603159105 	 ni
4.310062279064543 	 cant
3.973800273169233 	 med
3.971081797385684 	 suicid
3.9580380571956173 	 therapist
3.8149908331642397 	 dont
3.7674543040642976 	 vent
3.7617511504073415 	 guess
3.745890237811832 	 therapi
3.613743650685313 	 wellbutrin
3.468218900891058 	 mayb
3.3954130842431933 	 numb
3.3556458220573484 	 id
3.3256383637940656 	 ill
3.305248631879399 	 zoloft


In [14]:
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]
vectorizer = CountVectorizer(vocabulary=features)

In [15]:
tmp = df.loc[:10]
X = vectorizer.fit_transform(tmp.clean_text)
print(X.toarray()) 

[[0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [2 0 0 ... 0 0 0]
 ...
 [3 2 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


# SVM Training

In [16]:
# %%timeit
# ## Build features for clean_text
# features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:100]]
# for feature in features:
#     df[feature] = df.clean_text.str.contains(feature).map(int)
# df.head()

In [17]:
## Build train & test set 
X = df.clean_text
Y = df.label
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, random_state=20)

In [18]:
## 5-fold cross validation
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]
clf = Pipeline([
    ('countvectorizer', CountVectorizer(vocabulary=features)),
    ('classifier', svm.LinearSVC(C=1.0, class_weight="balanced"))
])

In [19]:

cv_metrics = cross_validate(clf, X, Y, cv=5, scoring=['precision', 'recall', 'accuracy', 'f1'])



In [20]:

metric[positive_file_names[0] + "_CV"] = [cv_metrics['test_precision'].mean(),
                                            cv_metrics['test_recall'].mean(),
                                            cv_metrics['test_accuracy'].mean(),
                                            cv_metrics['test_f1'].mean()]

In [21]:
clf.fit(X_train, Y_train)
print("SVM claasifier F1 score: {0}".format(f1_score(Y_test, clf.predict(X_test))))

test_metrics = [precision_score(Y_test, clf.predict(X_test)),
                recall_score(Y_test, clf.predict(X_test)),
                accuracy_score(Y_test, clf.predict(X_test)),
                f1_score(Y_test, clf.predict(X_test))]
metric[positive_file_names[0] + "_test"] = test_metrics



SVM claasifier F1 score: 0.8105141777462871


In [22]:
## Save the model
with open(positive_file_names[0] + '.sav', 'wb') as sav:
    pickle.dump(clf, sav)
# loaded_model = pickle.load(open(positive_file_names[0] + '.sav', 'rb'))

In [24]:
metric

Unnamed: 0,depression_CV,depression_test
Precision,0.909011,0.911097
Recall,0.728049,0.729931
Accuracy,0.827587,0.828856
F1,0.808514,0.810514
