In [24]:
import pandas as pd

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

In [36]:
# Load the training split. The path is relative to the notebook's working
# directory (the same '../' convention used for the '../models/...' artifacts),
# so the notebook is not tied to one machine's home directory.
train_df = pd.read_csv('../data/train.csv')

# Keep only rows whose annotation is the string label '0' or '1'; rows with
# any other annotation value are dropped.
train_df = train_df[train_df['annotation'].isin(['0', '1'])]

train_df.head()

Unnamed: 0,idx,msg_txt,annotation
1,910,:face_with_symbols_on_mouth:Occupants plan t...,0
2,2063,"Ukraine is preparing for a counterattack, De...",1
3,1018,Information on the state of restoration of c...,0
4,2378,White House statement Russia will not succee...,0
6,1776,:enraged_face: Enemy's TG channels are sprea...,0


In [44]:
# Load the held-out test split. The path is relative to the notebook's working
# directory (the same '../' convention used for the '../models/...' artifacts),
# so the notebook is not tied to one machine's home directory.
test_df = pd.read_csv('../data/test.csv')

# Keep only rows whose annotation is the string label '0' or '1'; rows with
# any other annotation value are dropped.
test_df = test_df[test_df['annotation'].isin(['0', '1'])]

test_df.head()

Unnamed: 0,idx,msg_txt,annotation
0,2983,DIU Russians modernized X-22 and Oniks missi...,0
1,1612,Wagner currently controls 4.8 thousand km2 o...,0
2,35,At the positions of the artillery of the 32n...,0
4,2332,We receive reports that there are power outa...,0
5,330,The law on electronic subpoenas equates Russ...,0


First, I'll build a TF-IDF matrix for the messages.

In [45]:
# Text columns to vectorize.
train_messages = train_df['msg_txt']
test_messages = test_df['msg_txt']

# TF-IDF with sublinear term-frequency scaling. The vectorizer is fitted on the
# training messages only; the test messages are then transformed with that
# already-fitted vectorizer so both matrices share one feature space.
vectorizer = TfidfVectorizer(sublinear_tf=True)
freq_matrix_train = vectorizer.fit_transform(train_messages)
freq_matrix_test = vectorizer.transform(test_messages)

In [48]:
# Candidate values for each XGBoost hyperparameter explored by the search.
param_distributions = {
    'n_estimators': list(range(200, 2000, 200)),
    'min_child_weight': range(1, 6, 2),
    'gamma': [i/10.0 for i in range(0, 5)],
    'subsample': [i/10.0 for i in range(6, 10)],
    'colsample_bytree': [i/10.0 for i in range(6, 10)],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3],
}

# Randomized search over the space above. random_state fixes which candidate
# combinations get sampled, so re-running the notebook explores the same
# candidates instead of a different random subset each time.
search = RandomizedSearchCV(
    XGBClassifier(),
    param_distributions,
    random_state=42,
)


In [49]:
# Labels are stored as the strings '0'/'1', so cast them to integers before
# handing them to the estimator.
train_labels = train_df['annotation'].astype(int)

# Run the hyperparameter search on the TF-IDF training matrix.
search.fit(freq_matrix_train, train_labels)

# Parameter combination the search selected as best.
best = search.best_params_

In [50]:
# Show the hyperparameter combination selected by the search.
print(best)

{'subsample': 0.9, 'n_estimators': 1600, 'min_child_weight': 3, 'max_depth': 9, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.6}


In [51]:
# Build the final classifier from the hyperparameters the search selected.
# They are taken directly from `best` rather than hand-copied from printed
# output, so this cell cannot drift out of sync when the search is re-run.
classifier = XGBClassifier(**best)

# Annotations are the strings '0'/'1'; cast to int for training and scoring.
classifier.fit(freq_matrix_train, train_df['annotation'].astype(int))

# Predict on the held-out test matrix and report per-class metrics.
y_pred = classifier.predict(freq_matrix_test)

print(classification_report(test_df['annotation'].astype(int), y_pred))

              precision    recall  f1-score   support

           0       0.89      0.96      0.93       376
           1       0.85      0.66      0.74       127

    accuracy                           0.88       503
   macro avg       0.87      0.81      0.83       503
weighted avg       0.88      0.88      0.88       503



In [52]:
# Persist the trained classifier under ../models using XGBoost's save_model.
# NOTE(review): the filename has no extension; the on-disk format XGBoost
# chooses in that case may depend on the installed version — confirm.
classifier.save_model('../models/xgb_classifier')



In [53]:
import pickle

# Serialize the fitted TF-IDF vectorizer next to the model so that new
# messages can later be transformed with the exact same fitted state.
with open('../models/tfid-vectorizer.pickle', 'wb') as out_file:
    pickle.dump(vectorizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [54]:
import pickle

# Reload the persisted artifacts from the same relative '../models' locations
# they were written to, instead of a machine-specific absolute path.
classifier = XGBClassifier()
classifier.load_model('../models/xgb_classifier')

# NOTE: pickle.load can execute arbitrary code from the file; only load
# pickles that this project produced itself.
with open('../models/tfid-vectorizer.pickle', 'rb') as handle:
    vectorizer = pickle.load(handle)

# Smoke test: push one sample message through the reloaded vectorizer and
# classifier, keeping both the predicted label and the class probabilities.
message = 'This is a test'
vectorized = vectorizer.transform([message])
pred = classifier.predict(vectorized)
prob = classifier.predict_proba(vectorized)

In [55]:
# Show the predicted label followed by the per-class probabilities, one per line.
print(pred, prob, sep='\n')

[0]
[[0.93864924 0.06135079]]
