In [8]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt

In [9]:
data = pd.read_csv('seg_clean_data.csv', error_bad_lines=False)
print (data.head())

                                      segmented_body  languagecomment
0  ‫ # ايران‬ تدعم ‫ # ال حوثي‬ ل اجتياح ال جنوبو...                0
1  ‫ # تصحيح _ اوضاع _ ال سوري ين _ يا _ سلمان _ ...                0
2               ‫ # هل _ ت رى _ ل هم _ من _ باقي ه ‬               -1
3  - ال انقلابي ين هم ال قتل ه و ال ان هم ي قتل و...               -1
4  - ف ماذا عن حال - ال ولاء ات / ال انبطح ات / ....               -1


In [10]:
data.columns = data.columns.to_series().apply(lambda x: x.strip())

In [11]:
label_data = data[['segmented_body', 'languagecomment']]
label_data[:5]

Unnamed: 0,segmented_body,languagecomment
0,‫ # ايران‬ تدعم ‫ # ال حوثي‬ ل اجتياح ال جنوبو...,0
1,‫ # تصحيح _ اوضاع _ ال سوري ين _ يا _ سلمان _ ...,0
2,‫ # هل _ ت رى _ ل هم _ من _ باقي ه ‬,-1
3,- ال انقلابي ين هم ال قتل ه و ال ان هم ي قتل و...,-1
4,- ف ماذا عن حال - ال ولاء ات / ال انبطح ات / ....,-1


In [12]:
# to remove blank rows though the code didn't help, so I remove them manually for the file itself:
# import re
# test_data = label_data[:100]
# for row in test_data:
#     if len(row.strip()) > 2:
#         test_data.drop([row], axis=0)
# #         pd.row.replace(row, '')
# print (test_data)
#         test_data.replace(row, '')
# f = test_data['segmented_body'].str.extract('([^أ-ي]+/g)', expand = False).dropna()
# df['col1'].str.extract('([A-Za-z]+\d+)', expand = False).dropna()


# for row in test_data:
#     if len(row.strip()) < 2:
#         row.replace(row, '')

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(label_data, test_size=0.2)

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score
from time import time

max_features = 5000

x = train['segmented_body']
y = train['languagecomment']


tfidf = TfidfVectorizer(max_features=max_features)

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('NB', MultinomialNB(alpha=1)),
])

parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')   
}

grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x, y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
#refitting on entire training data using best settings
grid_search.refit

x_test = test['segmented_body']
y_test = test['languagecomment']

predicted = grid_search.predict(x_test)
print("accuracy=", np.mean(predicted == y_test))
print("recall=", recall_score(y_test, predicted, average='weighted') )
print("precision=", precision_score(y_test, predicted, average='weighted'))
print("weighted f-score", f1_score(y_test, predicted, average='weighted'))
print("Confusion matrix\n", confusion_matrix(y_test, predicted))

Performing grid search...
pipeline: ['tfidf', 'NB']
parameters:
{'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2')}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   10.4s finished


done in 10.982s

Best score: 0.805
Best parameters set:
	tfidf__norm: 'l1'
	tfidf__use_idf: True
accuracy= 0.8041969075418113
recall= 0.8041969075418113
precision= 0.6467326660998126
weighted f-score 0.716920268953321
Confusion matrix
 [[   0   97    0]
 [   0 5097    0]
 [   0 1144    0]]


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
