In [15]:
import re
import string
import pickle
import unicodedata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [16]:
## sklearn
import sklearn.svm as svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline

## Scipy
from scipy.sparse import csr_matrix

In [17]:
# Result containers filled in by later cells:
#   top_grams - highest-weighted n-grams of the fitted linear model
#   metric    - one row per score, one column per evaluation run
metric_rows = ["Precision", "Recall", "Accuracy", "F1"]
top_grams = pd.DataFrame()
metric = pd.DataFrame(index=metric_rows)

# Load Dataset

In [28]:
data_folder = '../Data/reddit/title/'
positive_file_names = ['stress', 'psychosis', 'anxiety', 'depression', 'SuicideWatch']
negative_file_names = ['ask_reddit']
file_extension = '.txt'


def load_titles(file_name):
    '''
        Read `data_folder + file_name + file_extension` into a DataFrame with a
        single 'text' column (one row per line of the file).

        - engine='python' is requested explicitly: the separator is a multi-byte
          character, so pandas would otherwise fall back to the python engine and
          emit a ParserWarning on every call.
        - on_bad_lines='warn' replaces the deprecated error_bad_lines=False
          (malformed lines are skipped and reported instead of raising).
    '''
    return pd.read_csv(
        filepath_or_buffer=data_folder + file_name + file_extension,
        sep='❖',
        quotechar='⩐',
        header=None,
        names=['text'],
        engine='python',
        on_bad_lines='warn',
    )


# Load positive dataframe (only the first positive source is used in this run)
pos_df = load_titles(positive_file_names[0])
pos_df['source'] = positive_file_names[0]
pos_df['label'] = 1

neg_df = load_titles(negative_file_names[0])
## Balance the positive and negative samples
# Draws exactly as many negatives as there are positives; the fixed seed keeps the
# sample reproducible. Raises ValueError if the negative file has fewer rows.
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=1)
neg_df['source'] = negative_file_names[0]
neg_df['label'] = 0

# Positives first, then negatives, re-indexed 0..n-1
df = pd.concat([pos_df, neg_df], ignore_index=True)
df

  return func(*args, **kwargs)


  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,text,source,label
0,How do I get rid of bad thoughts thoughts of w...,stress,1
1,Worry,stress,1
2,Idk how I can describe it aside from it being ...,stress,1
3,I interviewed for a position that would change...,stress,1
4,New product that can help anxiety/stress? Prod...,stress,1
...,...,...,...
8511,Can a European get an American credit card?,ask_reddit,0
8512,What are some conclusions humans made about an...,ask_reddit,0
8513,[SERIOUS] What’s the most disturbing thing you...,ask_reddit,0
8514,[serious] when do you brush your teeth?,ask_reddit,0


# Text preprocessing
- Tokenize (split on whitespace)
- Lowercase the text
- Remove hyperlinks
- Remove punctuation
- Remove numbers (any token containing a digit)
- Remove stopwords (common, uninformative words)
- Stemming (Snowball stemmer; no lemmatization is applied)

In [29]:
# English language resources from NLTK: a Snowball stemmer and the stop-word list
stemmer = nltk.SnowballStemmer("english")
stop_words = stopwords.words('english')

In [30]:
def clean_text(text):
    '''
        Normalise a raw piece of text before stop-word removal and stemming.

        Steps, in order: strip non-ASCII characters (after NFKD decomposition),
        lowercase, remove text in square brackets, remove links, remove HTML-like
        tags, remove punctuation, turn newlines into spaces and remove words
        containing numbers.

        Parameters
        ----------
        text : str
            Raw text. Any non-string value (e.g. None, or the float NaN pandas uses
            for a missing cell) is treated as empty.

        Returns
        -------
        str
            The cleaned text. Whitespace is not collapsed, so removed fragments can
            leave leading or doubled spaces behind.
    '''
    if not isinstance(text, str):
        # Missing values would otherwise raise TypeError inside unicodedata.normalize
        return ''
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # Raw strings below: '\[', '\S', '\w', '\d' are invalid escapes in normal literals
    text = re.sub(r'\[.*?\]', '', text)                              # remove [bracketed] text
    text = re.sub(r'https?://\S+|www\.\S+', '', text)                # remove urls
    text = re.sub(r'<.*?>+', '', text)                               # remove <tags>
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)  # remove punctuation
    # Replace (not delete) newlines so words on adjacent lines are not fused together
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)                             # remove words containing digits
    return text

# Lazily filled cache so the NLTK resources are built once, not once per row
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    '''Return the (stop-word set, stemmer) pair, building it on first use only.'''
    if not _PREPROCESS_RESOURCES:
        # frozenset: constant-time membership test instead of scanning the NLTK list per word
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['stemmer'] = nltk.SnowballStemmer("english")
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['stemmer']


def preprocess_data(text):
    '''
        Clean `text` with clean_text, drop English stop words and stem what is left.

        Parameters
        ----------
        text : str
            Raw text, passed unchanged to clean_text.

        Returns
        -------
        str
            The stemmed, stop-word-free tokens joined by single spaces.
    '''
    stop_word_set, stemmer = _get_preprocess_resources()
    tokens = clean_text(text).split()           # Clean punctuation, urls, and so on; split on whitespace
    # Single pass: skip stop words, stem every remaining word
    return ' '.join(stemmer.stem(word) for word in tokens if word not in stop_word_set)

In [31]:
# Run every raw title through preprocess_data and keep the result next to the original text
df['clean_text'] = df['text'].map(preprocess_data)
df

Unnamed: 0,text,source,label,clean_text
0,How do I get rid of bad thoughts thoughts of w...,stress,1,get rid bad thought thought worri peopl think ...
1,Worry,stress,1,worri
2,Idk how I can describe it aside from it being ...,stress,1,idk describ asid inform feel like got courtesi...
3,I interviewed for a position that would change...,stress,1,interview posit would chang life feel mani dif...
4,New product that can help anxiety/stress? Prod...,stress,1,new product help anxietystress product concept...
...,...,...,...,...
8511,Can a European get an American credit card?,ask_reddit,0,european get american credit card
8512,What are some conclusions humans made about an...,ask_reddit,0,conclus human made anim certain anim got compl...
8513,[SERIOUS] What’s the most disturbing thing you...,ask_reddit,0,what disturb thing youv wit
8514,[serious] when do you brush your teeth?,ask_reddit,0,brush teeth


# Top n-gram features

In [32]:
# Seeded generator: without it the train/val/test assignment (and therefore the fitted
# model, the selected n-grams and every score below) changes on each run.
split_rng = np.random.RandomState(1)
# Each row is independently tagged train / val / test with probability 0.70 / 0.15 / 0.15
df['split'] = split_rng.choice(["train", "val", "test"], size=df.shape[0], p=[.7, .15, .15])
x_train = df[df["split"] == "train"]
y_train = x_train["label"]
x_val = df[df["split"] == "val"]
y_val = x_val["label"]

In [33]:
## Training pipeline: TF-IDF over 1- to 3-grams feeding a class-balanced linear SVM
ngram_vectorizer = TfidfVectorizer(ngram_range=(1, 3))
linear_svm = svm.LinearSVC(C=1.0, class_weight="balanced")
tf_idf = Pipeline(steps=[("tfidf", ngram_vectorizer), ("classifier", linear_svm)])

# Fit on the training rows only; the fitted pipeline is the cell's displayed value
tf_idf.fit(x_train["clean_text"], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ('classifier', LinearSVC(class_weight='balanced'))])

In [34]:
## Confidence measure: F1 of the fitted pipeline on the validation rows
val_predictions = tf_idf.predict(x_val["clean_text"])
f1_score(y_val, val_predictions)

0.8833202819107282

In [35]:
# Weights learned by the linear SVM; row 0 is paired with the n-gram names below.
coefs = tf_idf.named_steps["classifier"].coef_
# Densify if the estimator returned a sparse matrix, then convert to plain nested lists.
# (The original branches computed these conversions but threw the results away.)
if hasattr(coefs, "toarray"):
    coefs = coefs.toarray()
coefs = np.asarray(coefs).tolist()

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement when present
# and fall back for older installations.
vectorizer = tf_idf.named_steps["tfidf"]
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

# (coefficient, n-gram) pairs, in vocabulary order
coefs_and_features = list(zip(coefs[0], feature_names))

In [36]:
# Keep the 20 (coefficient, n-gram) pairs with the largest positive weight, i.e. the
# n-grams that push a title hardest towards the positive class.
top_grams = pd.DataFrame()
# Column is named after the subreddit actually being modelled (same key the metric
# table uses) instead of the hard-coded, misspelled "anxity".
top_grams[positive_file_names[0]] = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:20]
for weight, ngram in top_grams[positive_file_names[0]]:
    print(weight, '\t', ngram)

8.5900657390351 	 stress
3.3798123029621068 	 im
2.2354184487075575 	 relax
2.1061811495190716 	 help
1.9464947869056883 	 anxieti
1.9177541433392646 	 calm
1.917550329897508 	 cant
1.9148900790670367 	 feel
1.6916868522532575 	 medit
1.615459216302344 	 health
1.5456801764434083 	 posit
1.5394039226498626 	 burnout
1.4808747400986733 	 heart
1.44782466362786 	 ive
1.4462379558993712 	 mental
1.3633081154808464 	 depress
1.3466256382239208 	 reddit work
1.3420296618715453 	 music
1.3295250372359524 	 symptom
1.3039327767059508 	 what wrong


# SVM Classifier

In [52]:
# %%timeit
# ## Build features for clean_text
# features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:100]]
# for feature in features:
#     df[feature] = df.clean_text.str.contains(feature).map(int)
# df.head()

In [13]:

## Build features for clean_text
# One boolean array per selected n-gram: True where the cleaned title contains it.
# NOTE(review): the following cell rebuilds `feature_arrays` from scratch (as integers),
# so this cell's result is overwritten when the notebook is run top to bottom.
# The 5000 highest-weighted n-grams of the TF-IDF/SVM pipeline are used as features.
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]

# regex=False: match each n-gram as a literal substring rather than compiling it as a
# regular expression. The comparison already yields booleans, so no int round-trip is needed.
# NOTE(review): this is substring matching, so a short n-gram such as "im" also matches
# inside longer words (e.g. "time") - confirm that is intended.
feature_arrays = [
    df.clean_text.str.contains(feature, regex=False).to_numpy(dtype=bool)
    for feature in features
]

In [14]:
## Build features for clean_text
# One 0/1 integer array per selected n-gram: 1 where the cleaned title contains it.
# The 5000 highest-weighted n-grams of the TF-IDF/SVM pipeline are used as features.
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]

# regex=False: match each n-gram as a literal substring rather than compiling it as a
# regular expression; astype replaces the element-by-element Python map(int).
# NOTE(review): this is substring matching, so a short n-gram such as "im" also matches
# inside longer words (e.g. "time") - confirm that is intended.
feature_arrays = [
    df.clean_text.str.contains(feature, regex=False).astype('int64').values
    for feature in features
]

In [54]:
## Build train & test set
# Each per-n-gram array becomes one column; rows line up with the rows of df
feature_matrix = np.column_stack(feature_arrays)
X = pd.DataFrame(feature_matrix, columns=features)
Y = df["label"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=20)

In [55]:
## 5-fold cross validation
clf = svm.LinearSVC()

# A single cross_validate call scores all four metrics on the same five fitted models.
# Calling cross_val_score once per metric refits the classifier 20 times and reports
# precision / recall / accuracy / F1 from different fits.
scoring_names = ['precision', 'recall', 'accuracy', 'f1']   # same order as the rows of `metric`
cv_results = cross_validate(clf, X, Y, cv=5, scoring=scoring_names)
cv_metrics = [cv_results['test_' + name].mean() for name in scoring_names]
metric[positive_file_names[0] + "_CV"] = cv_metrics

In [56]:
# Fit on the training split, then score the held-out test split
clf.fit(X_train, Y_train)
# Predict once and reuse the result; the predictions do not change between metric calls
Y_pred = clf.predict(X_test)
test_f1 = f1_score(Y_test, Y_pred)
print("SVM claasifier F1 score: {0}".format(test_f1))

# Same order as the rows of `metric`: Precision, Recall, Accuracy, F1
test_metrics = [precision_score(Y_test, Y_pred),
                recall_score(Y_test, Y_pred),
                accuracy_score(Y_test, Y_pred),
                test_f1]
metric[positive_file_names[0] + "_test"] = test_metrics

SVM claasifier F1 score: 0.869750283768445


In [57]:
## Save the model
# The file is named after the positive class being modelled, e.g. '<name>.sav'.
# NOTE(review): only the classifier is pickled; the `features` column list it was
# trained on is not written here.
model_path = positive_file_names[0] + '.sav'
with open(model_path, 'wb') as model_file:
    pickle.dump(clf, model_file)
# loaded_model = pickle.load(open(positive_file_names[0] + '.sav', 'rb'))

In [58]:
# Display the score table: rows are Precision / Recall / Accuracy / F1, and the
# cells above add one '<name>_CV' and one '<name>_test' column per run.
metric

Unnamed: 0,psychosis_CV,psychosis_test,stress_CV,stress_test
Precision,0.942569,0.940184,0.943771,0.950472
Recall,0.810164,0.809134,0.761881,0.750466
Accuracy,0.880374,0.877059,0.858271,0.854392
F1,0.871249,0.86975,0.842403,0.83871
