In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report

from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# load in data
df = pd.read_csv('/home/j/Documents/Projects/sentimental/data/annotated_1-12-23.csv')

df.head()

Unnamed: 0.1,Unnamed: 0,idx,msg_txt,annotation
0,0,0,"In Kyiv, fragments of a downed rocket damage...",1
1,1,1,"Kyiv region is attacked by drones again, air...",1
2,2,2,Explosions are also heard in the capital. Ai...,1
3,3,4,Kyiv region is attacked by drones - Kuleba ...,1
4,5,5,The rocket that fell in Shevchenkivskyi dist...,0


In [3]:
X_train, X_test , y_train, y_test = train_test_split(df['msg_txt'].values,df['annotation'].values)

First, I'll use get a TFID matrix for the messages.

In [4]:
vectorizer = TfidfVectorizer(sublinear_tf=True)

freq_matrix_train = vectorizer.fit_transform(X_train)
freq_matrix_test = vectorizer.transform(X_test)

In [5]:
classifier = RandomizedSearchCV(XGBClassifier(), {'n_estimators': [int(x) for x in range(200,2000,200)],
        'min_child_weight': range(1,6,2),
        'gamma': [i/10.0 for i in range(0,5)],
        'subsample': [i/10.0 for i in range(6,10)],
        'colsample_bytree': [i/10.0 for i in range(6,10)],
        'max_depth': [3, 6, 9],
        'opt':[1e-5, 1e-2, 0.1, 1, 100],
        'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]
        })

classifier.fit(freq_matrix_train, y_train)

y_pred = classifier.predict(freq_matrix_test)

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.92      0.88       171
           1       0.74      0.59      0.66        68

    accuracy                           0.82       239
   macro avg       0.79      0.75      0.77       239
weighted avg       0.82      0.82      0.82       239



In [6]:
classifier.save_model('../models/xgb_classifier')



In [7]:
""" import pickle
with open('../models/tfid-vectorizer.pickle', 'wb') as handle:
    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL) """

" import pickle\nwith open('../models/tfid-vectorizer.pickle', 'wb') as handle:\n    pickle.dump(vectorizer, handle, protocol=pickle.HIGHEST_PROTOCOL) "

In [9]:
import pickle
classifier = XGBClassifier()
classifier.load_model('/home/j/Documents/Projects/social-media-combat-detection/models/xgb_classifier')
with open('/home/j/Documents/Projects/social-media-combat-detection/models/tfid-vectorizer.pickle', 'rb') as handle:
        vectorizer = pickle.load(handle)
message = 'This is a test'
vectorized = vectorizer.transform([message])
pred = classifier.predict(vectorized)
prob = classifier.predict_proba(vectorized)

In [10]:
print(pred)
print(prob)

[0]
[[0.96803504 0.03196494]]
