In [1]:
import re
import string
import pickle
import unicodedata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [2]:
## sklearn
import sklearn.svm as svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

## Scipy
from scipy.sparse import csr_matrix

In [3]:
top_grams = pd.DataFrame()
metric = pd.DataFrame(index=["Precision","Recall","Accuracy", "F1"])

# Load Dataset

In [11]:
data_folder = '../Data/reddit/title/'
positive_file_names = ['anxiety', 'ask_reddit', 'depression', 'psychosis', 'stress', 'SuicideWatch']
negative_file_names = ['ask_reddit']
file_extension = '.txt'

# Load positive dataframe
pos_df = pd.read_csv(filepath_or_buffer=data_folder + positive_file_names[0] + file_extension, sep='❖', quotechar='⩐', header =None, names =['text'], error_bad_lines=False)
pos_df['source'] = positive_file_names[0]
pos_df['label'] = 1

neg_df = pd.read_csv(filepath_or_buffer=data_folder + negative_file_names[0] + file_extension, sep='❖', quotechar='⩐', header =None, names =['text'], error_bad_lines=False)
## Balance the positive and negative samples
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=1)
neg_df['source'] = negative_file_names[0]
neg_df['label'] = 0

df = pd.concat([pos_df, neg_df], ignore_index=True)
df

  import sys
Skipping line 94941: Expected 1 fields in line 94941, saw 5
Skipping line 94942: Expected 1 fields in line 94942, saw 3
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,text,source,label
0,"Paranoia is something I struggle with a lot, p...",anxiety,1
1,What is happening to me?,anxiety,1
2,I believe anxiety started this year and I real...,anxiety,1
3,How does one start therapy?,anxiety,1
4,I heard a a lot of people experience less anxi...,anxiety,1
...,...,...,...
426491,"What is your own definition of happiness, or w...",ask_reddit,0
426492,how much do you earn per month? state your age...,ask_reddit,0
426493,I went to a theme park and rode multiple rolle...,ask_reddit,0
426494,"People do Reddit, what’s the most questionable...",ask_reddit,0


# Text preprocessing
- Tokenizes
- Make text lowercase
- Removes hyperlinks
- Remove punctuation
- Removes numbers
- Removes useless words "stopwords"
- Stemming/Lemmatization

In [12]:
stop_words = stopwords.words('english')
stemmer    = nltk.SnowballStemmer("english")

In [13]:
def clean_text(text):
    '''
        Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.
    '''
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def preprocess_data(text):
    stop_words = stopwords.words('english')
    stemmer    = nltk.SnowballStemmer("english")
    text = clean_text(text)                                                     # Clean puntuation, urls, and so on
    text = ' '.join(word for word in text.split() if word not in stop_words)    # Remove stopwords
    text = ' '.join(stemmer.stem(word) for word in text.split())                # Stemm all the words in the sentence
    return text

In [14]:
df['clean_text'] = df.text.apply(preprocess_data)
df

Unnamed: 0,text,source,label,clean_text
0,"Paranoia is something I struggle with a lot, p...",anxiety,1,paranoia someth struggl lot paranoia overthink...
1,What is happening to me?,anxiety,1,happen
2,I believe anxiety started this year and I real...,anxiety,1,believ anxieti start year realli dont anyon ta...
3,How does one start therapy?,anxiety,1,one start therapi
4,I heard a a lot of people experience less anxi...,anxiety,1,heard lot peopl experi less anxieti stress sto...
...,...,...,...,...
426491,"What is your own definition of happiness, or w...",ask_reddit,0,definit happi give happi life
426492,how much do you earn per month? state your age...,ask_reddit,0,much earn per month state age nation
426493,I went to a theme park and rode multiple rolle...,ask_reddit,0,went theme park rode multipl rollercoast went ...
426494,"People do Reddit, what’s the most questionable...",ask_reddit,0,peopl reddit what question thing that happen o...


# Top n-gram features

In [15]:
df['split'] = np.random.choice(["train", "val", "test"], size=df.shape[0], p=[.7, .15, .15])
x_train = df[df["split"] == "train"]
y_train = x_train["label"]
x_val = df[df["split"] == "val"]
y_val = x_val["label"]


In [16]:
## Training pipeline
tf_idf = Pipeline([
     ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
     ("classifier", svm.LinearSVC(C=1.0, class_weight="balanced"))
 ])

tf_idf.fit(x_train["clean_text"], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ('classifier', LinearSVC(class_weight='balanced'))])

In [17]:
## Confidence measure
f1_score(y_val, tf_idf.predict(x_val["clean_text"]))

0.9320573564135041

In [18]:
coefs = tf_idf.named_steps["classifier"].coef_
if type(coefs) == csr_matrix:
    coefs.toarray().tolist()[0]
else:
    coefs.tolist()
    
feature_names = tf_idf.named_steps["tfidf"].get_feature_names()
coefs_and_features = list(zip(coefs[0], feature_names))

In [24]:
top_grams = pd.DataFrame()
top_grams["anxity"] = sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:50]
top_grams.head()

Unnamed: 0,anxity
0,"(14.301703991879622, anxieti)"
1,"(9.585010495490478, anxious)"
2,"(7.674457306810314, im)"
3,"(5.785025610697328, panic)"
4,"(5.763425014243243, ive)"


# SVM Classifier

In [35]:
# %%timeit
# ## Build features for clean_text
# features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:100]]
# for feature in features:
#     df[feature] = df.clean_text.str.contains(feature).map(int)
# df.head()

41.9 s ± 522 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [None]:
%%time
## Build features for clean_text
feature_arrays = []
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]

for feature in features:
    feature_arrays.append(df.clean_text.str.contains(feature).map(int).values)
feature_df = pd.DataFrame(np.stack(feature_arrays, axis=1), columns=features)
pd.concat((df, feature_df), axis=1)
df.head()

In [None]:
%%time
## Build train & test set 
X = df.drop(columns=['text', 'source', 'label', 'clean_text', 'split'])
Y = df.label
X_train,X_test,Y_train,Y_test = train_test_split(X, Y, random_state=20)

In [None]:
%%time
## 5-fold cross validation
clf = svm.LinearSVC()

cv_metrics = [cross_val_score(clf, X, Y, cv=5, scoring='precision').mean(),
              cross_val_score(clf, X, Y, cv=5, scoring='recall').mean(),
              cross_val_score(clf, X, Y, cv=5, scoring='accuracy').mean(),
              cross_val_score(clf, X, Y, cv=5, scoring='f1').mean()]
metric[positive_file_names[0] + "_CV"] = cv_metrics

In [None]:
%%time
clf.fit(X_train, Y_train)
print("SVM claasifier F1 score: {0}".format(f1_score(Y_test, clf.predict(X_test))))

test_metrics = [precision_score(Y_test, clf.predict(X_test)),
                recall_score(Y_test, clf.predict(X_test)),
                accuracy_score(Y_test, clf.predict(X_test)),
                f1_score(Y_test, clf.predict(X_test))]
metric[positive_file_names[0] + "_test"] = cv_metrics

In [None]:
## Save the model
with open(positive_file_names[0] + '.sav', 'wb') as sav:
    pickle.dump(clf, sav)
# loaded_model = pickle.load(open(positive_file_names[0] + '.sav', 'rb'))