In [98]:
import pandas as pd
import numpy as np
import evaluate
df = pd.read_csv('data/data_after_preprocessing.csv')

Things to take note before using this modelling notebook

1. There are 2 approaches to using the text corpus  
    a. LDA: Hard to explain because clusters are not labelled but dimensionality has been reduced to 5 (based on grid)  
    b. TFIDF: 173 (based on vectorizer) tfidf float numbers exist per tweet. Easier explanability but high dimensionality)  
2. A new feature "day_after" has been added. Remember to include it in the modelling step if you wish to.  
3. Remember to do scaling on numerical features.

### Data Preprocessing

#### Text Processing

In [99]:
# Lemmatization
import nltk
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer
df['tweet'] = df['tweet'].apply(lambda x:' '.join(WordNetLemmatizer().lemmatize(i) for i in x.split(' ')))

# Remove punctuation
df['tweet'] = df['tweet'].str.replace('[^\w\s]','')

[nltk_data] Downloading package wordnet to /Users/jingxue/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [100]:
# Run tf vectorizer

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

# Adding a list of stop words to the wordlist
my_stop_words = text.ENGLISH_STOP_WORDS.union(['suicidal'])

count_vectorizer = CountVectorizer(min_df=0.01, stop_words=my_stop_words)

# Use vectors for LDA

x = count_vectorizer.fit_transform(df['tweet'])
lda_model = LatentDirichletAllocation(n_components=5, learning_decay=0.7, random_state=2021)
xtr = lda_model.fit_transform(x)

# Log Likelyhood: Higher the better
print("Log Likelihood: ", lda_model.score(x))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(x))

data_x = pd.DataFrame(xtr).add_prefix('topic_')

Log Likelihood:  -234533.78816095451
Perplexity:  146.7923650662626


In [101]:
pd.DataFrame(count_vectorizer.vocabulary_.items()).sort_values(1,ascending=False)[:10]

Unnamed: 0,0,1
112,youre,175
58,young,174
77,yes,173
15,year,172
162,yeah,171
66,wrong,170
109,worse,169
36,world,168
59,work,167
71,wont,166


In [102]:
from sklearn import preprocessing, model_selection

#binning
list_for_binning1 = ['nlikes', 'nreplies', 'nretweets', 'tweets', 'following', 'followers', 'likes', 'media']
new_column1 = ['nlikes_binning','nreplies_binning','nretweets_binning','tweets_binning','following_binning','followers_binning','likes_binning','media_binning']
for i in range(8):
    new_col = new_column1[i]
    col_for_binning = list_for_binning1[i]
    data_x[new_col] = df[col_for_binning].apply(lambda x:0 if x==0 else np.floor(np.log10(x)))
    
list_for_binning2 = ['reply_to','join_time','day_after','tweet_length']
new_column2 = ['reply_to_binning','join_time_binning','day_after_binning','tweet_length_binning']
for i in range(4):
    new_col = new_column2[i]
    col_for_binning = list_for_binning2[i]
    data_x[new_col] = df[col_for_binning].apply(lambda x:0 if x==0 else np.floor_divide(x,10))

list_for_binning3 = ['topic_0','topic_1','topic_2','topic_3','topic_4']
# new_column2 = ['topic_0_binning','topic_1_binning','topic_2_binning','topic_3_binning','topic_4_binning']
for i in range (5):
#     new_col = new_column3[i]
    col_for_binning = list_for_binning3[i]
    data_x[col_for_binning] = data_x[col_for_binning].apply(lambda x:0 if x==0 else np.floor_divide(100*x,10))
    
# 'day' take absolute values.
df['day'] = (df['day'] - 4).abs()

# Adding features
features_to_add = ['day','url','tweet_sentiment', 'bio_sentiment', 'first_person', 'second_person', 'third_person']
for feature in features_to_add:
    data_x[feature] = df[feature]
    
# scaler = preprocessing.MinMaxScaler()
# data_x.iloc[:, 5:] = scaler.fit_transform(data_x.iloc[:, 5:])
data_y = df['label']

In [105]:
# plt.figure(figsize=(25,25))
# lst = ['day','reply_to', 'url', 'day_after','tweet_length', 'tweet_sentiment']
# for i in range(7):
#     plt.subplot(3,3,i+1)
#     plt.hist(df[lst[i]],bins=10);
#     plt.xlabel(lst[i])
#     plt.ylabel('counts')

In [106]:
# train test split
x_train, x_test, y_train, y_test = model_selection.train_test_split(data_x, data_y, test_size = 0.2, random_state = 2021)

### Modelling with all features

##### Naive Bayes

In [108]:
import matplotlib.pyplot as plt
from sklearn import naive_bayes

In [113]:
estimator = naive_bayes.MultinomialNB(alpha = 1)
estimator.fit(x_train, y_train)
y_pred_proba = estimator.predict_proba(x_test)
dic = evaluate.threshold(y_pred_proba,y_test)
print('Selected threshold: ', dic['threshold'])
print(evaluate.performance(y_test, dic['y_pred'],y_pred_proba)['report'])

Selected threshold:  0.03

The evaluation report of classification is:
Confusion Matrix:
[[675 771]
 [ 42 360]]
Accuracy: 0.560064935064935
Precision: 0.3183023872679045
Recall: 0.8955223880597015
F2 Score: 0.6571741511500548
AUC Score: 0.7770672914817339



### Modelling with real time available features
data in this part will have no features in 'nlikes', 'nreplies', 'nretweets' and 'day_after' becasue these features are received a few days after the tweets were posted.

In [115]:
x_train_rt = x_train.drop(columns=['nlikes_binning', 'nreplies_binning', 'nretweets_binning', 'day_after_binning'])
x_test_rt = x_test.drop(columns=['nlikes_binning', 'nreplies_binning', 'nretweets_binning', 'day_after_binning'])

In [118]:
# Model implementation
estimator = naive_bayes.MultinomialNB(alpha = 1)
estimator.fit(x_train, y_train)
y_pred_proba = estimator.predict_proba(x_test)
dic = evaluate.threshold(y_pred_proba,y_test)
print('Selected threshold: ', dic['threshold'])
print(evaluate.performance(y_test, dic['y_pred'],y_pred_proba)['report'])

Selected threshold:  0.03

The evaluation report of classification is:
Confusion Matrix:
[[675 771]
 [ 42 360]]
Accuracy: 0.560064935064935
Precision: 0.3183023872679045
Recall: 0.8955223880597015
F2 Score: 0.6571741511500548
AUC Score: 0.7770672914817339



### Modelling using only tweet
Other than using a LSTM to implement the classifier solely on tweet, we also try a mechine learning approach on tweet.  
Using the count_vectorizer, we perform a naive bayes to classify.  

In [12]:
x_train_2, x_test_2, y_train_2, y_test_2 = model_selection.train_test_split(df['tweet'], df['label'], test_size = 0.2, random_state = 2021)
count = CountVectorizer(min_df=0.01, stop_words=my_stop_words)
x_train_2 = count.fit_transform(x_train_2)
x_test_2 = count.transform(x_test_2)

from sklearn import naive_bayes

mnb = naive_bayes.MultinomialNB()
mnb.fit(x_train_2, y_train_2)
y_pred_proba = mnb.predict_proba(x_test_2)
dic = evaluate.threshold(y_pred_proba,y_test)
print('Selected threshold: ', dic['threshold'])
print(evaluate.performance(y_test, dic['y_pred'],y_pred_proba)['report'])

Selected threshold:  0.14

The evaluation report of classification is:
Confusion Matrix:
[[692 754]
 [ 36 366]]
Accuracy: 0.5725108225108225
Precision: 0.3267857142857143
Recall: 0.9104477611940298
F2 Score: 0.6708211143695014
AUC Score: 0.7933732100218134

