### Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
# Read the input CSV into the working DataFrame `df`.
DATA_PATH = 'data/data_after_preprocessing.csv'
df = pd.read_csv(DATA_PATH)

###### Term-frequency counts and LDA topic model for tweets

In [2]:
# # The lemmatization does not work really well.
# import nltk
# nltk.download('wordnet')
# from nltk.stem.wordnet import WordNetLemmatizer
# df['tweet'] = df['tweet'].apply(lambda x:' '.join(WordNetLemmatizer().lemmatize(i) for i in x.split(' ')))

from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Extend scikit-learn's built-in English stop words with extra filler words and
# contraction fragments (e.g. "didn", "ll", "ve").
my_stop_words = text.ENGLISH_STOP_WORDS.union(['suicidal','bc','did','didn','t','does','doesn','don','dont','doing','going','gonna','having','isn','ll','ve','wanna','want','wanted','wanting','wasn','went','yes','yeah'])

# Recent scikit-learn releases validate `stop_words` and accept only 'english',
# a list, or None, so the frozenset above is passed as a list. Sorting only
# makes the order deterministic; the set of removed words is unchanged.
# min_df=0.01 ignores terms that appear in fewer than 1% of the documents.
count_vectorizer = CountVectorizer(min_df=0.01, stop_words=sorted(my_stop_words))
x = count_vectorizer.fit_transform(df['tweet'])

n, m = x.shape  # n rows (one per tweet), m vocabulary terms kept by the vectorizer
k = 20 # try for 20 topics
lda = LatentDirichletAllocation(n_components=k, random_state=2021)
xtr = lda.fit_transform(x)  # one row per tweet, one column per topic

# So xtr will be new features to replace 'tweet'
data_x = pd.DataFrame(xtr).add_prefix('topic_')

#### Log-transform some features

In [None]:
# features_to_log = ['nlikes', 'nreplies', 'nretweets','join_time', 'tweets', 'following', 'followers', 'likes', 'media']
# for feature in features_to_log:
#     data_x[feature] = np.log1p(df[feature])

In [3]:
# Apply log(1 + x) to the listed columns of `df` and attach the results to
# `data_x` under the same column names.
features_to_log = ['nlikes', 'nreplies', 'nretweets','join_time']
data_x = data_x.assign(**{name: np.log1p(df[name]) for name in features_to_log})

`day` is replaced by its absolute distance from 4 (`|day - 4|`), and `tweet_length` is scaled by dividing by its maximum value.

In [4]:
# NOTE: both statements overwrite columns of `df` in place, so re-running this
# cell without reloading `df` transforms the already-transformed values again.

# Replace `day` with its absolute distance from 4.
df['day'] = df['day'].sub(4).abs()

# Scale `tweet_length` by its largest value.
longest_tweet = df['tweet_length'].max()
df['tweet_length'] = df['tweet_length'].div(longest_tweet)

In [5]:
# Columns copied unchanged from `df` into the feature matrix `data_x`.
features_to_add = ['day','url','tweet_length', 'tweet_sentiment', 'bio_sentiment', 'first_person', 'second_person', 'third_person','tweets_binning','following_binning','followers_binning','likes_binning','media_binning']
data_x = data_x.assign(**{name: df[name] for name in features_to_add})

In [6]:
# Preview the first five rows of the assembled feature matrix.
data_x.head(n=5)

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,tweet_sentiment,bio_sentiment,first_person,second_person,third_person,tweets_binning,following_binning,followers_binning,likes_binning,media_binning
0,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,...,2,2,0,0,0,3.0,2.0,1.0,3.0,2.0
1,0.0125,0.0125,0.0125,0.0125,0.0125,0.0125,0.2625,0.0125,0.0125,0.0125,...,2,2,1,0,0,4.0,2.0,3.0,4.0,2.0
2,0.00625,0.131315,0.00625,0.131744,0.00625,0.00625,0.00625,0.50625,0.00625,0.00625,...,2,2,0,0,1,2.0,1.0,0.0,2.0,0.0
3,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,0.016667,...,0,2,1,0,0,3.0,2.0,2.0,4.0,2.0
4,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,...,2,2,0,0,0,4.0,3.0,3.0,2.0,1.0


In [7]:
# Target vector: the `label` column of `df`.
data_y = df.loc[:, 'label']

### Modelling

In [8]:
from sklearn import model_selection

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = model_selection.train_test_split(
    data_x,
    data_y,
    test_size=0.2,
    random_state=2021,
)

The classification is cost-sensitive: misclassifying label 2 as 0 is more costly than misclassifying label 1 as 0.  
So we can assign a weight to each class, increasing the penalty for misclassifying label 2.

#### Logistic Regression

In [9]:
from sklearn import linear_model, metrics

# One-vs-rest logistic regression with balanced class weights.
lr_params = dict(
    class_weight='balanced',
    multi_class='ovr',
    C=0.01,
    max_iter=1000,
)
estimator = linear_model.LogisticRegression(**lr_params)
estimator.fit(x_train, y_train)

# Score the held-out split.
y_pred = estimator.predict(x_test)
confusion = metrics.confusion_matrix(y_test, y_pred)
accuracy = metrics.accuracy_score(y_test, y_pred)

report = f"""
The evaluation report of OVR is:
Confusion Matrix:
{confusion}
Accuracy: {accuracy}
"""
print(report)


The evaluation report of OVR is:
Confusion Matrix:
[[1017  180  260]
 [ 154  110  159]
 [  83   61  264]]
Accuracy: 0.6079545454545454



#### Naive Bayes

In [18]:
# `metrics` is imported here as well so this cell does not depend on the
# logistic-regression cell having been run first.
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with smoothing parameter alpha=0.1.
clf = MultinomialNB(alpha=0.1)
clf.fit(x_train, y_train)

# Note: `y_pred` is reused, overwriting the predictions of the previous model.
y_pred = clf.predict(x_test)

# The header names this model; it previously said "OVR", copied from the
# logistic-regression cell.
report_NB = """
The evaluation report of Naive Bayes is:
Confusion Matrix:
{}
Accuracy: {}
""".format(metrics.confusion_matrix(y_test, y_pred),
           metrics.accuracy_score(y_test, y_pred))
print(report_NB)


The evaluation report of OVR is:
Confusion Matrix:
[[1451    0    6]
 [ 404    0   19]
 [ 355    0   53]]
Accuracy: 0.6573426573426573



#### Stacking

#### Boosting

#### Stacking