In [1]:
import pandas as pd
import numpy as np
# Load the CSV at data/data_after_preprocessing.csv into `df`; the later cells
# read the tweet text, the feature columns and the 'label' column from this frame.
df = pd.read_csv('data/data_after_preprocessing.csv')

Things to take note of before using this modelling notebook

1. There are 2 approaches to using the text corpus  
    a. LDA: Harder to explain because the topics are not labelled, but dimensionality is reduced to 5 (chosen via grid search).  
    b. TF-IDF: each tweet is represented by 173 TF-IDF float values (vocabulary size from the vectorizer). Easier explainability, but high dimensionality.  
2. A new feature "day_after" has been added. Remember to include it in the modelling step if you wish to.  
3. Remember to do scaling on numerical features.

### Data Preprocessing

#### Text Processing

In [2]:
# Lemmatization
import nltk
nltk.download('wordnet')

from nltk.stem.wordnet import WordNetLemmatizer

# Build the lemmatizer once and reuse it for every tweet instead of
# constructing a new instance per row inside the lambda.
lemmatizer = WordNetLemmatizer()
df['tweet'] = df['tweet'].apply(lambda x: ' '.join(lemmatizer.lemmatize(i) for i in x.split(' ')))

# Remove punctuation
# regex=True is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would silently leave all punctuation in place.
# The raw string avoids the invalid "\w" / "\s" escape sequences.
df['tweet'] = df['tweet'].str.replace(r'[^\w\s]', '', regex=True)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\flyxs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  if __name__ == '__main__':


In [3]:
# Run tf vectorizer

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV

# Adding a list of stop words to the wordlist
# `my_stop_words` stays a frozenset (union of scikit-learn's English stop words
# and the extra term) so any later set operations on it keep working.
my_stop_words = text.ENGLISH_STOP_WORDS.union(['suicidal'])

# CountVectorizer validates `stop_words` as 'english', a list, or None in recent
# scikit-learn releases; a frozenset is rejected there, so pass a (sorted, hence
# deterministic) list. Older releases accept a list as well.
# min_df=0.01 drops terms that appear in fewer than 1% of the documents.
count_vectorizer = CountVectorizer(min_df=0.01, stop_words=sorted(my_stop_words))

In [4]:
# Use vectors for LDA

# Term-count matrix: one row per tweet, one column per vocabulary term kept by the vectorizer.
x = count_vectorizer.fit_transform(df['tweet'])
# NOTE(review): scikit-learn documents learning_decay as a setting of the online
# learning method; with the default learning_method it may have no effect -- confirm.
lda_model = LatentDirichletAllocation(n_components=5, learning_decay=0.7, random_state=2021)
# Per-tweet topic weights: one column per LDA component (5 here).
xtr = lda_model.fit_transform(x)

# Log Likelihood: Higher the better
# Note: scored on the same matrix the model was fitted on, so this is an in-sample figure.
print("Log Likelihood: ", lda_model.score(x))

# Perplexity: Lower the better. Perplexity = exp(-1. * log-likelihood per word)
print("Perplexity: ", lda_model.perplexity(x))

# Start the feature matrix from the topic weights; columns are named topic_0 ... topic_4.
data_x = pd.DataFrame(xtr).add_prefix('topic_')

Log Likelihood:  -234533.78816095446
Perplexity:  146.79236506626245


In [5]:
from sklearn import preprocessing ,model_selection

# Take log for some features and generate data_x
# log1p(v) = log(1 + v), so zero values stay finite.
features_to_log = ['nlikes', 'nreplies', 'nretweets', 'tweets', 'following', 'followers', 'likes', 'media']
for feature in features_to_log:
    data_x[feature] = np.log1p(df[feature])

# 'day' take absolute values (absolute distance from 4).
# NOTE(review): this overwrites df['day'] in place, so re-running this cell
# without reloading `df` applies the transform a second time.
# NOTE(review): the meaning of the constant 4 is not visible here -- confirm.
df['day'] = (df['day'] - 4).abs()

# Adding features
features_to_add = ['day','reply_to', 'url','join_time', 'day_after','tweet_length', 'tweet_sentiment', 'bio_sentiment', 'first_person', 'second_person', 'third_person']
for feature in features_to_add:
    data_x[feature] = df[feature]

# Min-max scale every feature added above, leaving the LDA topic columns as-is.
# Columns are selected by name rather than by the positional slice `iloc[:, 5:]`,
# so the scaling stays correct if the number of LDA topics changes. Assigning by
# column label also replaces the columns instead of writing floats into existing
# (possibly integer) columns in place.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set minima/maxima influence the training features; consider
# fitting it on the training split only.
scaler = preprocessing.MinMaxScaler()
columns_to_scale = features_to_log + features_to_add
data_x[columns_to_scale] = scaler.fit_transform(data_x[columns_to_scale])

# Target vector.
data_y = df['label']

In [6]:
# Split features and labels into train and test partitions: 20% of the rows are
# held out for testing, and the fixed random_state makes the shuffle repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=2021,
)

### Modelling