In [7]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt

In [8]:
# import the data
data = pd.read_csv("segmented_dataset_M.csv")
print(data.head())

                                      segmented_body  languagecomment
0   ( قال و ل ل حرامي احلف ) و ال كذب من شيم ال م...               -1
1   ( وما رمي ت اذ رمي ت لكن الله رمى )  هؤلاء هم...                0
2  ‫ #  تصحيح _ أوضاع _ ال سوري ين _ يا _ سلمان _...                0
3              ‫ #  هل _ ت رى _ ل هم _ من _ باقي ة ‬               -1
4  ‫ # ايران‬ تدعم ‫ #  ال حوثي‬ ل إجتياح ال جنوب...                0


In [9]:
label_data = data[['segmented_body', 'languagecomment']]
label_data[:5]

Unnamed: 0,segmented_body,languagecomment
0,( قال و ل ل حرامي احلف ) و ال كذب من شيم ال م...,-1
1,( وما رمي ت اذ رمي ت لكن الله رمى ) هؤلاء هم...,0
2,‫ # تصحيح _ أوضاع _ ال سوري ين _ يا _ سلمان _...,0
3,‫ # هل _ ت رى _ ل هم _ من _ باقي ة ‬,-1
4,‫ # ايران‬ تدعم ‫ # ال حوثي‬ ل إجتياح ال جنوب...,0


In [10]:
from sklearn.model_selection import train_test_split
from scipy import stats

train, test = train_test_split(label_data, test_size=0.2)

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

from sklearn import svm
from sklearn.svm import SVC, LinearSVC # "Support vector classifier"
model = SVC(kernel='linear', C=1E10)
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score
from time import time

max_features = 5000

x = train['segmented_body']
y = train['languagecomment']


tfidf = TfidfVectorizer(max_features=max_features)

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('SVM', svm.SVC(kernel='linear')),
])

parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')   
}

grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x, y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
#refitting on entire training data using best settings
grid_search.refit

x_test = test['segmented_body']
y_test = test['languagecomment']

predicted = grid_search.predict(x_test)
print("accuracy=", np.mean(predicted == y_test))
print("recall=", recall_score(y_test, predicted, average='weighted') )
print("precision=", precision_score(y_test, predicted, average='weighted'))
print("weighted f-score", f1_score(y_test, predicted, average='weighted'))
print("Confusion matrix\n", confusion_matrix(y_test, predicted))

Performing grid search...
pipeline: ['tfidf', 'SVM']
parameters:
{'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2')}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 13.1min finished


done in 951.329s

Best score: 0.804
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
accuracy= 0.8065635847270433
recall= 0.8065635847270433
precision= 0.6507747528340496
weighted f-score 0.7203422897308948
Confusion matrix
 [[   0  106    0]
 [   0 5112    1]
 [   0 1119    0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
