In [1]:
import pandas as pd

In [4]:
df = pd.read_csv("naive_bayes\\df_to_train.csv")
df['Total_Review'] = df['Total_Review'].fillna('')
df.review_type.value_counts()

review_type
positive    10728
neutral     10728
negative    10728
Name: count, dtype: int64

In [5]:
df.review_type = df['review_type'].map({'positive': 1, 'neutral': 0, 'negative': -1})
df.review_type.value_counts()

review_type
 1    10728
 0    10728
-1    10728
Name: count, dtype: int64

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

# cria objeto CountVectorizer. O parâmetro ngram_range=(1, 1) significa que o vetorizador 
# irá considerar apenas unigramas, cada token individual será considerado como uma unidade
vect = CountVectorizer(ngram_range=(1, 1))

# ajusta o vetorizador aos dados de texto fornecido
vect.fit(df.Total_Review)

with open('count_vectorizer.pkl', 'wb') as f:
    pickle.dump(vect, f)

# transforma os textos em uma representação de matriz de contagem de tokens.
text_vect = vect.transform(df.Total_Review)

In [7]:
from sklearn.model_selection import train_test_split

# divide os dados entre treino e teste
X_train,X_test,y_train,y_test = train_test_split(
    text_vect, 
    df.review_type,
    test_size = 0.3, 
    random_state = 42
)

In [8]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='newton-cg')
clf = clf.fit(X_train, y_train)

In [9]:
from sklearn.metrics import f1_score
y_prediction = clf.predict(X_test)

f1 = f1_score(y_prediction, y_test, average='weighted')

print(f1)

0.7641925174545442


In [10]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred = clf.predict(X_train)
        clf_report = pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("Train Result:\n================================================")
        print(f"Accuracy Score: {accuracy_score(y_train, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_train, pred)}\n")
        
    elif train==False:
        pred = clf.predict(X_test)
        clf_report = pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("Test Result:\n================================================")        
        print(f"Accuracy Score: {accuracy_score(y_test, pred) * 100:.2f}%")
        print("_______________________________________________")
        print(f"CLASSIFICATION REPORT:\n{clf_report}")
        print("_______________________________________________")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, pred)}\n")

In [11]:
from sklearn.model_selection import train_test_split

# divide os dados entre treino e teste
X_train,X_test,y_train,y_test = train_test_split(
    text_vect, 
    df.review_type,
    test_size = 0.2, 
    random_state = 42
)

In [12]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='liblinear')
clf = clf.fit(X_train, y_train)

print_score(clf, X_train, y_train, X_test, y_test, train=True)
print_score(clf, X_train, y_train, X_test, y_test, train=False)

Train Result:
Accuracy Score: 87.58%
_______________________________________________
CLASSIFICATION REPORT:
                    -1            0            1  accuracy     macro avg  \
precision     0.843244     0.912309     0.878059  0.875791      0.877871   
recall        0.882196     0.793840     0.951402  0.875791      0.875812   
f1-score      0.862280     0.848961     0.913260  0.875791      0.874834   
support    8616.000000  8571.000000  8560.000000  0.875791  25747.000000   

           weighted avg  
precision      0.877810  
recall         0.875791  
f1-score       0.874796  
support    25747.000000  
_______________________________________________
Confusion Matrix: 
 [[7601  460  555]
 [1191 6804  576]
 [ 222  194 8144]]

Test Result:
Accuracy Score: 75.89%
_______________________________________________
CLASSIFICATION REPORT:
                    -1            0            1  accuracy    macro avg  \
precision     0.704766     0.738095     0.825093  0.758894     0.755985   


In [13]:
test_score = accuracy_score(y_test, clf.predict(X_test)) * 100
train_score = accuracy_score(y_train, clf.predict(X_train)) * 100

results_df = pd.DataFrame(data=[["Logistic Regression", train_score, test_score]], 
                          columns=['Model', 'Training Accuracy %', 'Testing Accuracy %'])
results_df

Unnamed: 0,Model,Training Accuracy %,Testing Accuracy %
0,Logistic Regression,87.579135,75.889389


In [14]:
import pickle

# salvar o modelo em um arquivo
with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(clf, f)