In [1]:
import re
import string
import pickle
import unicodedata
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns; sns.set()

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import ngrams

In [2]:
## sklearn
import sklearn.svm as svm
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score

## Scipy
from scipy.sparse import csr_matrix

# Load Dataset

In [3]:
data_folder = '../Data/reddit/title/'
positive_file_names = ['anxiety', 'ask_reddit', 'depression', 'psychosis', 'stress', 'SuicideWatch']
negative_file_names = ['ask_reddit']
file_extension = '.txt'

# Load positive dataframe
pos_df = pd.read_csv(filepath_or_buffer=data_folder + positive_file_names[0] + file_extension, sep='❖', header =None, names =['text'])
pos_df['source'] = positive_file_names[0]
pos_df['label'] = 1

neg_df = pd.read_csv(filepath_or_buffer=data_folder + negative_file_names[0] + file_extension, sep='❖', header =None, names =['text'])
## Balance the positive and negative samples
neg_df = neg_df.sample(n=pos_df.shape[0], random_state=1, ignore_index=True)
neg_df['source'] = negative_file_names[0]
neg_df['label'] = -1

df = pd.concat([pos_df, neg_df], ignore_index=True)
df

  return func(*args, **kwargs)


Unnamed: 0,text,source,label
0,I'm worried and angry about a lot of things rn...,anxiety,1
1,I've never been afraid of anything,anxiety,1
2,I'm afraid I am getting agoraphobia,anxiety,1
3,I think I'm a Hypochondriac,anxiety,1
4,I start school tomorrow and I haven't done any...,anxiety,1
...,...,...,...
33995,What's the craziest slow-burning prank with mu...,ask_reddit,-1
33996,What's a weird thing others say that you do / ...,ask_reddit,-1
33997,How does VPN work? Can the authorities track y...,ask_reddit,-1
33998,[Serious] We've known about the risk of global...,ask_reddit,-1


# Text preprocessing
- Tokenizes
- Make text lowercase
- Removes hyperlinks
- Remove punctuation
- Removes numbers
- Removes useless words "stopwords"
- Stemming/Lemmatization

In [4]:
stop_words = stopwords.words('english')
stemmer    = nltk.SnowballStemmer("english")

In [5]:
def clean_text(text):
    '''
        Make text lowercase, remove text in square brackets,remove links,remove punctuation
        and remove words containing numbers.
    '''
    text = (unicodedata.normalize('NFKD', text)
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
    .lower())
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text) # remove urls
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text) # remove punctuation
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def preprocess_data(text):
    stop_words = stopwords.words('english')
    stemmer    = nltk.SnowballStemmer("english")
    text = clean_text(text)                                                     # Clean puntuation, urls, and so on
    text = ' '.join(word for word in text.split() if word not in stop_words)    # Remove stopwords
    text = ' '.join(stemmer.stem(word) for word in text.split())                # Stemm all the words in the sentence
    return text

In [6]:
df['clean_text'] = df.text.apply(preprocess_data)
df

Unnamed: 0,text,source,label,clean_text
0,I'm worried and angry about a lot of things rn...,anxiety,1,im worri angri lot thing rn dont know even sor...
1,I've never been afraid of anything,anxiety,1,ive never afraid anyth
2,I'm afraid I am getting agoraphobia,anxiety,1,im afraid get agoraphobia
3,I think I'm a Hypochondriac,anxiety,1,think im hypochondriac
4,I start school tomorrow and I haven't done any...,anxiety,1,start school tomorrow havent done holiday work...
...,...,...,...,...
33995,What's the craziest slow-burning prank with mu...,ask_reddit,-1,what craziest slowburn prank multipl twist bui...
33996,What's a weird thing others say that you do / ...,ask_reddit,-1,what weird thing other say done asleep
33997,How does VPN work? Can the authorities track y...,ask_reddit,-1,vpn work author track even your use vpn peopl ...
33998,[Serious] We've known about the risk of global...,ask_reddit,-1,weve known risk global pandem decad obvious ca...


# Top n-gram features

In [7]:
df['split'] = np.random.choice(["train", "val", "test"], size=df.shape[0], p=[.7, .15, .15])
x_train = df[df["split"] == "train"]
y_train = x_train["label"]
x_val = df[df["split"] == "val"]
y_val = x_val["label"]


In [8]:
## Training pipeline
tf_idf = Pipeline([
     ('tfidf', TfidfVectorizer(ngram_range=(1,3))),
     ("classifier", svm.LinearSVC(C=1.0, class_weight="balanced"))
 ])

tf_idf.fit(x_train["clean_text"], y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(ngram_range=(1, 3))),
                ('classifier', LinearSVC(class_weight='balanced'))])

In [9]:
## Confidence measure
f1_score(y_val, tf_idf.predict(x_val["clean_text"]))

0.9264305177111716

In [10]:
coefs = tf_idf.named_steps["classifier"].coef_
if type(coefs) == csr_matrix:
    coefs.toarray().tolist()[0]
else:
    coefs.tolist()
    
feature_names = tf_idf.named_steps["tfidf"].get_feature_names()
coefs_and_features = list(zip(coefs[0], feature_names))

In [11]:
sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:50]

[(10.20860672237495, 'anxieti'),
 (7.9422261446705305, 'im'),
 (5.581288096920053, 'anxious'),
 (4.380216970539506, 'ive'),
 (4.186657298060825, 'cant'),
 (3.0547206139498173, 'feel'),
 (2.8603748824279878, 'dont know'),
 (2.6750162896951637, 'worri'),
 (2.649703556594423, 'help'),
 (2.557416206385601, 'panic'),
 (2.4991289933701735, 'what happen'),
 (2.4846470349412817, 'what wrong'),
 (2.446857727208464, 'trigger'),
 (2.431324140146172, 'fear'),
 (2.382970761909734, 'ssris'),
 (2.2742412227785387, 'what experi'),
 (2.252713728622761, 'therapi'),
 (2.206300612603103, 'scare'),
 (2.1702891924757375, 'ill'),
 (2.0931698521452824, 'symptom'),
 (2.0286588946483306, 'feel like'),
 (1.9956232773895684, 'freak'),
 (1.9720864531080824, 'dae'),
 (1.956815776921675, 'medic'),
 (1.9513894777338177, 'id'),
 (1.9352102089035823, 'attack'),
 (1.8918571132722908, 'overthink'),
 (1.8405474227632943, 'panic attack'),
 (1.8381951534720304, 'dont'),
 (1.7615551577137134, 'med'),
 (1.7563641876641705, 's

# SVM Classifier

In [12]:
## Build features for clean_text
features = [x[1] for x in sorted(coefs_and_features, key=lambda x: x[0], reverse=True)[:5000]]
for feature in features:
    df[feature] = df.clean_text.str.contains(feature).map(int)
df.head()

  df[feature] = df.clean_text.str.contains(feature).map(int)


Unnamed: 0,text,source,label,clean_text,split,anxieti,im,anxious,ive,cant,...,death dont think,dont think life,think life worth,lay bed,bound cultur,bound cultur parent,cultur parent,cultur parent dont,peer,cringey event
0,0,anxiety,1,im worri angri lot thing rn dont know even sor...,train,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,anxiety,1,ive never afraid anyth,train,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,anxiety,1,im afraid get agoraphobia,train,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,anxiety,1,think im hypochondriac,train,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,anxiety,1,start school tomorrow havent done holiday work...,train,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
## Build train & test set 
X = df.drop(columns=['text', 'source', 'label', 'clean_text', 'split'])
Y = df.label
# X_train,X_test,Y_train,Y_test = train_test_split(X, Y, random_state=20)

In [14]:
## 5-fold cross validation
clf = svm.LinearSVC()
scores = cross_val_score(clf, X, Y, cv=5, scoring='f1')
scores.mean()

0.8709743239011238

In [15]:
## Save the model
with open(positive_file_names[0] + '.sav', 'wb') as sav:
    pickle.dump(clf, sav)
# loaded_model = pickle.load(open(positive_file_names[0] + '.sav', 'rb'))