In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings, string
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib

In [2]:
df = pd.read_csv('../data/preprocessed_dataset.csv')

In [3]:
df.drop('Unnamed: 0',axis=1,inplace=True)

In [4]:
df.dropna(inplace=True)

In [5]:
def text_process(review):
    nopunc = [char for char in review if char not in string.punctuation]
    nopunc = ''.join(nopunc)
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [6]:
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer

In [7]:
bow_transformer.fit(df['text_'])
print("Total Vocabulary:",len(bow_transformer.vocabulary_))

Total Vocabulary: 34489


In [8]:
bow_reviews = bow_transformer.transform(df['text_'])

In [9]:
tfidf_transformer = TfidfTransformer().fit(bow_reviews)

In [10]:
tfidf_reviews = tfidf_transformer.transform(bow_reviews)

## Creating training and testing data

In [11]:
review_train, review_test, label_train, label_test = train_test_split(df['text_'],df['label'],test_size=0.2)

In [12]:
review_test.to_csv('../data/review_test.csv')



## Training and testing Multinomial Naive Bayes Algorithm on the preprocessed data

In [13]:
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_process)),
    ('tfidf',TfidfTransformer()),
    ('classifier',SVC())
])

In [14]:
pipeline.fit(review_train,label_train)

In [15]:
svc_pred = pipeline.predict(review_test)
svc_pred

array(['OR', 'CG', 'CG', ..., 'OR', 'OR', 'CG'], dtype=object)

In [16]:
print('Classification Report:',classification_report(label_test,svc_pred))
print('Confusion Matrix:',confusion_matrix(label_test,svc_pred))
print('Accuracy Score:',accuracy_score(label_test,svc_pred))
print('Model Prediction Accuracy:',str(np.round(accuracy_score(label_test,svc_pred)*100,2)) + '%')

Classification Report:               precision    recall  f1-score   support

          CG       0.91      0.87      0.89      4074
          OR       0.88      0.91      0.89      4013

    accuracy                           0.89      8087
   macro avg       0.89      0.89      0.89      8087
weighted avg       0.89      0.89      0.89      8087

Confusion Matrix: [[3561  513]
 [ 367 3646]]
Accuracy Score: 0.8911833807345122
Model Prediction Accuracy: 89.12%


In [17]:
joblib.dump(pipeline, '../data/modello_addestrato.pkl')

['../data/modello_addestrato.pkl']

In [18]:
# Carica il modello addestrato
model_path = '../data/modello_addestrato.pkl'  # Aggiorna il percorso se necessario
model = joblib.load(model_path)

value = 19

elemento_selezionato = review_test.iloc[value]

# Creiamo una nuova Series che contiene solo l'elemento selezionato
# Usiamo un dizionario per mantenere l'indice originale
nuova_serie = pd.Series({elemento_selezionato.index: elemento_selezionato})

print(nuova_serie)


prediction = model.predict([nuova_serie])

print(prediction)

<built-in method index of str object at 0x0000022BCC0BFBA0>    gun look great look real materi good qualiti d...
dtype: object
['CG']
