In [1]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)
# Pickle package
import pickle

In [2]:
# import the data
data = pd.read_csv("clean_data.csv")
print(data.head())

   Unnamed: 0                                               body  \
0           0  يبغى التنبيه على ان السعوديه  تستخدم صواريخ جو...   
1           1  امريكا قتلت بالامس معوق رفض رفع يديه فماذا تري...   
2           2  هذا الشخص هو من كان مؤيد لاحتلال العراق وضرب ا...   
3           3  الى جمال ريان مذيع الجزيره  نحن من رعاك فى الم...   
4           4  خيبه  الامل ليست تشاؤما ولا تقولا    ٠   عزم ل...   

   languagecomment  
0               -1  
1               -1  
2               -1  
3               -1  
4                0  


In [3]:
# # Plot two ingredients
# sns.lmplot('body', 'languagecomment', data=data, hue='Type',
#            palette='Set1', fit_reg=False, scatter_kws={"s": 70});

In [4]:
label_data = data[['body', 'languagecomment']]
label_data[120:130]

Unnamed: 0,body,languagecomment
120,جيش الهر العلمانى يكاد يكون حبر على الورق ، ...,-1
121,شكرا الجزيره على دعمكم للشعب السورى والثوره ...,-1
122,ومن اعرض عن ذكرى فان له معيشه ضنكى . ...,0
123,للمره العاشره . . . القافله ...,0
124,التقئ,-1
125,لا جديد كذب فى كذبنفس تعليقات زعيم الجنوب اللب...,-1
126,ولما نتنياهو سيقسم الاقصى هل لكم خصائل الرجال,-1
127,اضرب يا سيسى بيد من حديد على الاخوان الارهابيي...,-1
128,كس امكم قناه حقيره ارهابيه اغونجيه وداعشيه...,-2
129,ذلك اليهودى يسعى لتوسيع حرب السوريين واين السن...,-1


In [5]:
from sklearn.model_selection import train_test_split
from scipy import stats

train, test = train_test_split(label_data, test_size=0.2)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn import svm
from sklearn.svm import SVC, LinearSVC # "Support vector classifier"
model = SVC(kernel='linear', C=1E10)
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score
from time import time

max_features = 5000

x = train['body']
y = train['languagecomment']


tfidf = TfidfVectorizer(max_features=max_features)

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('SVM', SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf', max_iter=-1, probability=False, random_state=None, shrinking=True, tol=0.001, verbose=False))
]) # other parameters:
    # SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)
    # svm.SVC(kernel='rbf')
    # svm.SVC(kernel='linear')
    
parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')   
}

grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x, y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
#refitting on entire training data using best settings
grid_search.refit

x_test = test['body']
y_test = test['languagecomment']

predicted = grid_search.predict(x_test)
print("accuracy=", np.mean(predicted == y_test))
print("recall=", recall_score(y_test, predicted, average='weighted') )
print("precision=", precision_score(y_test, predicted, average='weighted'))
print("weighted f-score", f1_score(y_test, predicted, average='weighted'))
print("Confusion matrix\n", confusion_matrix(y_test, predicted))

Performing grid search...
pipeline: ['tfidf', 'SVM']
parameters:
{'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2')}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 12.7min finished


done in 59504.734s

Best score: 0.819
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
accuracy= 0.8110112005048115
recall= 0.8110112005048115
precision= 0.8064850698839671
weighted f-score 0.7459707077424758
Confusion matrix
 [[  14   83    0]
 [   2 5021   30]
 [   0 1083  106]]
