In [1]:
import re
import string
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score




# Recuperation de data

In [2]:
dataframe = pd.read_csv("../DATA/cleaned_data1.csv", names=["tweet", "class"]).iloc[2:,:]
dataframe["class"] = np.where(dataframe["class"] == "Positive",1,0)
dataframe = dataframe.sample(frac = 1)




# TF-IDF

In [3]:
cv = TfidfVectorizer()
y = dataframe['class'].to_list()
x = cv.fit_transform(dataframe['tweet'].astype(str))
print(x.shape)



(49864, 174741)


# Split Data

In [4]:
x_train, x_test, y_train,y_test= train_test_split(x,y,test_size=0.25,random_state=42 , stratify=y) 
#same random state 
#Stratify les classes non équilibré 

In [14]:
classifiers = [KNeighborsClassifier(n_neighbors=9),
               SVC(C= 2, gamma='scale', kernel= 'rbf'),
               DecisionTreeClassifier(criterion='entropy', splitter='best', max_features='sqrt', random_state=42, max_depth=None, max_leaf_nodes=500),
               RandomForestClassifier(bootstrap= 'False', criterion= 'entropy',max_features='log2', min_samples_split= 10,n_estimators= 150),
               MultinomialNB(alpha=1.0,  fit_prior='True'),
               BernoulliNB(alpha=1.0,  fit_prior='True' ,binarize=0.1)

]

In [15]:
def evaluate_model(y_test,predictions, file ,cls ):
    file.write("\n Le model : " + str(cls)+"\n")  
    file.write("Confusion Matrix : \n" + str(confusion_matrix(y_test,predictions))+"\n")  
    file.write("Classification Report : \n" + str(classification_report(y_test,predictions))+"\n")  
    file.write("Accuracy score : \n"+str(accuracy_score(y_test, predictions))+"\n")
    file.write("Recall Score : \n" + str(recall_score(y_test,predictions))+"\n")
    file.write("F1-score : \n" + str(f1_score(y_test, predictions, zero_division=1))+"\n")

In [16]:
results = open("../CODE/Results/Resultat_tf_idf.txt",'w')

for cls in classifiers: 
    #write_pattern('cls', str(cls), results)
    classifier = cls
    print(str(cls)+" ----------------------------------------------")
    classifier.fit(x_train, y_train)
    predictions = classifier.predict(x_test)
    evaluate_model(y_test,predictions,results,cls)


results.close()
    


KNeighborsClassifier(n_neighbors=9) ----------------------------------------------
SVC(C=2) ----------------------------------------------
DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       max_leaf_nodes=500, random_state=42) ----------------------------------------------
RandomForestClassifier(bootstrap='False', criterion='entropy',
                       max_features='log2', min_samples_split=10,
                       n_estimators=150) ----------------------------------------------
MultinomialNB(fit_prior='True') ----------------------------------------------
BernoulliNB(binarize=0.1, fit_prior='True') ----------------------------------------------


In [7]:
neigh = KNeighborsClassifier(n_neighbors=9)
neigh.fit(x_train, y_train)

KNeighborsClassifier(n_neighbors=9)

In [8]:
y_predict = neigh.predict(x_test)

In [10]:
accuracy = accuracy_score(y_test, y_predict)

In [11]:
print(accuracy)

0.7168297769934221


In [35]:
SVC(C= 2, gamma='scale', kernel= 'rbf')

SVC(C=2)

In [36]:
sv = SVC(C= 2, gamma='scale', kernel= 'rbf')

In [42]:
sv.fit(x_train, y_train)

SVC(C=2)

In [43]:
y_predict = sv.predict(x_test)

In [46]:
accuracy = accuracy_score(y_test, y_predict)

In [47]:
print(accuracy)

0.8518369966308359


In [56]:

DecisionTreeClassifier(criterion='entropy', splitter='best', max_features='sqrt', random_state=42, max_depth=None, max_leaf_nodes=500)


DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       max_leaf_nodes=500, random_state=42)

In [57]:
Dc= DecisionTreeClassifier(criterion='entropy', splitter='best', max_features='sqrt', random_state=42, max_depth=None, max_leaf_nodes=500)
Dc.fit(x_train, y_train)


DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       max_leaf_nodes=500, random_state=42)

In [58]:
Dc.fit(x_train, y_train)


DecisionTreeClassifier(criterion='entropy', max_features='sqrt',
                       max_leaf_nodes=500, random_state=42)

In [59]:
y_predict = Dc.predict(x_test)

In [60]:
accuracy = accuracy_score(y_test, y_predict)

In [61]:
print(accuracy)

0.7189154500240654


In [63]:

Rfc = RandomForestClassifier(bootstrap= 'False', criterion= 'entropy',max_features='log2', min_samples_split= 10,n_estimators= 150)


In [64]:
Rfc.fit(x_train, y_train)


RandomForestClassifier(bootstrap='False', criterion='entropy',
                       max_features='log2', min_samples_split=10,
                       n_estimators=150)

In [65]:
y_predict = Rfc.predict(x_test)

In [66]:
accuracy = accuracy_score(y_test, y_predict)

In [67]:

print(accuracy)

0.824081501684582


In [70]:

Mnb = MultinomialNB(alpha=1.0,  fit_prior='True')

In [71]:
Mnb.fit(x_train, y_train)


MultinomialNB(fit_prior='True')

In [72]:
y_predict = Mnb.predict(x_test)

In [73]:
accuracy = accuracy_score(y_test, y_predict)

In [74]:
print(accuracy)

0.8066741536980587


In [75]:
Bnb = BernoulliNB(alpha=1.0,  fit_prior='True' ,binarize=0.1)


In [77]:
Bnb.fit(x_train, y_train)


BernoulliNB(binarize=0.1, fit_prior='True')

In [78]:
y_predict = Bnb.predict(x_test)

In [79]:
accuracy = accuracy_score(y_test, y_predict)

In [80]:
print(accuracy)

0.7729022942403337


In [98]:
from sklearn.naive_bayes import GaussianNB

Gnb = GaussianNB()
Gnb.fit(x_train, y_train)


TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [18]:
python --version


NameError: name 'python' is not defined

In [19]:
python -V

NameError: name 'python' is not defined