In [38]:
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt

In [39]:
# import the data
data = pd.read_csv("seg_row_data_Alakrot.csv")
print (data.head())

                               segmented_commentText Label
0  : O كاظم ي سوا كم بس انتم لان مقام ات ه ت درس ...     P
1                                   : سخط على ليكي\n     N
2  ? . . مسائ كم سعيد يا شباب , ممكن حد ي قول لى ...     N
3  ؟ ؟ ؟ ؟ ؟ ؟ فهم و نا بس هو انشهر ب لبنان و لا ...     N
4                                              " .\n     N


In [40]:
label_data = data[['segmented_commentText', 'Label']]
label_data[:5]

Unnamed: 0,segmented_commentText,Label
0,: O كاظم ي سوا كم بس انتم لان مقام ات ه ت درس ...,P
1,: سخط على ليكي\n,N
2,"? . . مسائ كم سعيد يا شباب , ممكن حد ي قول لى ...",N
3,؟ ؟ ؟ ؟ ؟ ؟ فهم و نا بس هو انشهر ب لبنان و لا ...,N
4,""" .\n",N


In [41]:
def clean_text(x):
    return str(x)

label_data["segmented_commentText"] = label_data["segmented_commentText"].apply(lambda x: clean_text(x))

In [42]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(label_data, test_size=0.2)

In [43]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, recall_score, precision_score
from time import time

max_features = 9000

x = train['segmented_commentText']
y = train['Label']


tfidf = TfidfVectorizer(max_features=max_features)

pipeline = Pipeline([
    ('tfidf', tfidf),
    ('NB', MultinomialNB(alpha=1)),
])

parameters = {
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2')   
}

grid_search = GridSearchCV(pipeline, parameters, cv=5,
                               n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)
t0 = time()
grid_search.fit(x, y)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    
#refitting on entire training data using best settings
grid_search.refit

x_test = test['segmented_commentText']
y_test = test['Label']

predicted = grid_search.predict(x_test)
print("accuracy=", np.mean(predicted == y_test))
print("recall=", recall_score(y_test, predicted, average='weighted') )
print("precision=", precision_score(y_test, predicted, average='weighted'))
print("weighted f-score", f1_score(y_test, predicted, average='weighted'))
print("Confusion matrix\n", confusion_matrix(y_test, predicted))

Performing grid search...
pipeline: ['tfidf', 'NB']
parameters:
{'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2')}
Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    4.7s finished


done in 4.875s

Best score: 0.613
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: False
accuracy= 0.6187471834159531
recall= 0.6187471834159531
precision= 0.6438319930694468
weighted f-score 0.5267189547504336
Confusion matrix
 [[1265   49]
 [ 797  108]]
