### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
# Read the CSV at data/data_after_preprocessing.csv into `df`, the frame that
# the feature-building cells below read from.
data_path = 'data/data_after_preprocessing.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

#### Term-frequency counts and topic model for tweets

In [2]:
# Lemmatization did not work well here, so the code below is left disabled.
# import nltk
# nltk.download('wordnet')
# from nltk.stem.wordnet import WordNetLemmatizer
# df['tweet'] = df['tweet'].apply(lambda x:' '.join(WordNetLemmatizer().lemmatize(i) for i in x.split(' ')))

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Extend scikit-learn's built-in English stop-word set with extra tokens
# (contraction fragments, filler words and 'suicidal') that are passed to the
# vectorizer as stop words.
my_stop_words = text.ENGLISH_STOP_WORDS.union([
    'suicidal', 'bc', 'did', 'didn', 't', 'does', 'doesn', 'don', 'dont',
    'doing', 'going', 'gonna', 'having', 'isn', 'll', 've', 'wanna', 'want',
    'wanted', 'wanting', 'wasn', 'went', 'yes', 'yeah',
])

# CountVectorizer documents `stop_words` as 'english', a list, or None, and
# recent scikit-learn releases reject a frozenset during parameter validation,
# so hand it a list. `my_stop_words` itself stays a frozenset. Sorting only
# makes the argument deterministic; the vectorizer treats it as a set.
# min_df=0.01 keeps only terms that occur in at least 1% of the tweets.
count_vectorizer = CountVectorizer(min_df=0.01, stop_words=sorted(my_stop_words))
x = count_vectorizer.fit_transform(df['tweet'])

n, m = x.shape  # n documents (tweets), m vocabulary terms
k = 20  # try for 20 topics
lda = LatentDirichletAllocation(n_components=k, random_state=2021)
xtr = lda.fit_transform(x)  # one row per tweet, one column per topic

# So xtr will be new features to replace 'tweet'.
# Reuse df's index so that columns assigned from df in later cells align
# row-for-row even when df does not carry the default RangeIndex.
data_x = pd.DataFrame(xtr, index=df.index).add_prefix('topic_')

#### Apply a log transform (`log1p`) to some features

In [3]:
# Columns that receive a log(1 + x) transform before joining the feature table.
features_to_log = [
    'nlikes', 'nreplies', 'nretweets',
    'join_time', 'tweets', 'following', 'followers', 'likes', 'media',
]

# Transform every listed column in one vectorised call, then copy each
# transformed column into data_x under its original name.
log_scaled = np.log1p(df[features_to_log])
for feature in features_to_log:
    data_x[feature] = log_scaled[feature]

Replace `day` with its absolute distance from 4, and scale `tweet_length` by dividing it by its maximum value.

In [4]:
# Replace 'day' with its absolute distance from 4. This overwrites the column
# in df, so running the cell a second time re-applies the transform to the
# already-transformed values.
df['day'] = df['day'].sub(4).abs()

# Divide 'tweet_length' by its largest value.
longest_tweet = df['tweet_length'].max()
df['tweet_length'] = df['tweet_length'].div(longest_tweet)

In [5]:
# Columns copied from df into the feature table without further changes in
# this cell.
features_to_add = [
    'day', 'url', 'tweet_length',
    'tweet_sentiment', 'bio_sentiment',
    'first_person', 'second_person', 'third_person',
]

# Walk the selected columns as (name, Series) pairs and attach each to data_x.
for feature, values in df[features_to_add].items():
    data_x[feature] = values

In [6]:
# Target vector: the 'label' column of df.
data_y = df.loc[:, 'label']

### Modelling

In [7]:
from sklearn import model_selection

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=2021,
)

#### Logistic Regression

In [8]:
from sklearn import linear_model, metrics

# One-vs-rest logistic regression with balanced class weights.
# NOTE(review): `multi_class` appears to be deprecated in recent scikit-learn
# releases - confirm against the version this notebook is pinned to.
estimator = linear_model.LogisticRegression(
    C=0.01,
    class_weight='balanced',
    max_iter=1000,
    multi_class='ovr',
)

# Fit on the training split and predict labels for the held-out split.
estimator.fit(x_train, y_train)
y_pred = estimator.predict(x_test)

# Compute the metrics first, then drop them into the report template.
ovr_confusion = metrics.confusion_matrix(y_test, y_pred)
ovr_accuracy = metrics.accuracy_score(y_test, y_pred)
report = """
The evaluation report of OVR is:
Confusion Matrix:
{}
Accuracy: {}
""".format(ovr_confusion, ovr_accuracy)
print(report)


The evaluation report of OVR is:
Confusion Matrix:
[[1022  181  254]
 [ 157  102  164]
 [  86   66  256]]
Accuracy: 0.6031468531468531



#### Naive Bayes

#### Stacking

#### Boosting

#### Stacking