# Imports

In [109]:
import os
from pathlib import Path
import re
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline


# Ao definirmos os valores para essas variáveis de ambiente,
#   conseguimos acessar os dados guardados no servidor remoto!
os.environ['MLFLOW_TRACKING_URI']= "http://34.171.98.242:8080"
os.environ['MLFLOW_TRACKING_USERNAME']= "mlflow"
os.environ['MLFLOW_TRACKING_PASSWORD']= "aibox-workshop-2023"

# Parâmetros


In [110]:
df = pd.read_csv("../../data/sentiment_tweets3.csv", encoding='latin')
columns = {
    'message to examine': 'msg',
    'label (depression result)': 'label'
}
df.rename(columns=columns, inplace=True)
df.head()

Unnamed: 0,Index,msg,label
0,106,just had a real good moment. i missssssssss hi...,0
1,217,is reading manga http://plurk.com/p/mzp1e,0
2,220,@comeagainjen http://twitpic.com/2y2lx - http:...,0
3,288,@lapcat Need to send 'em to my accountant tomo...,0
4,540,ADD ME ON MYSPACE!!! myspace.com/LookThunder,0


# DataSet


In [111]:
XTrain, XTest, ytrain, ytest = train_test_split(df['msg'], df['label'], test_size=0.2, random_state=102)

#TF-IDF
tfidf_vectorizer = TfidfVectorizer(lowercase=True, stop_words='english')
X_train_tfidf = tfidf_vectorizer.fit_transform(XTrain)
X_test_tfidf = tfidf_vectorizer.transform(XTest)

# Modelo


In [112]:
# model = svm.SVC()
# model.fit(X_train_tfidf, ytrain)


svm_pipeline = Pipeline([
    ("tfidf_svm", TfidfVectorizer(stop_words="english", lowercase=True)),
    ("classifier", svm.SVC())
])

svm_pipeline.fit(XTrain,ytrain)


# Experimento

In [113]:
y_pred = svm_pipeline.predict(XTest)
y_pred = y_pred >= 0.5
report = classification_report(ytest,y_pred)
print(report)

print("Acuracia", accuracy_score(ytest, y_pred))
print("\n Relatorio de Classificação: \n", classification_report(ytest, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1603
           1       1.00      0.98      0.99       460

    accuracy                           1.00      2063
   macro avg       1.00      0.99      0.99      2063
weighted avg       1.00      1.00      1.00      2063

A frase foi classificada como: [False False  True ... False False  True]
Acuracia 0.9951526902569074

 Relatorio de Classificação: 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1603
           1       1.00      0.98      0.99       460

    accuracy                           1.00      2063
   macro avg       1.00      0.99      0.99      2063
weighted avg       1.00      1.00      1.00      2063



In [130]:
text_SVM = df[df['label'] == 0].iloc[1].to_string()
text_SVM
svm_pipeline.predict([text_SVM])

array([0])

# Classifier Random Forest Classifier


In [121]:
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline(
    [
        ('tfidf', TfidfVectorizer(stop_words="english", lowercase=True)),
        ('classifier', RandomForestClassifier(n_estimators=200))
    ]
)

rf_pipeline.fit(XTrain, ytrain)
y_pred = rf_pipeline.predict(XTest)
y_pred = y_pred >= 0.5
report = classification_report(ytest, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1603
           1       1.00      0.99      0.99       460

    accuracy                           1.00      2063
   macro avg       1.00      1.00      1.00      2063
weighted avg       1.00      1.00      1.00      2063



# Teste

In [128]:
text = df[df['label'] == 0].iloc[0].to_string()
text

'Index                                                  106\nmsg      just had a real good moment. i missssssssss hi...\nlabel                                                    0'

In [129]:
rf_pipeline.predict([text])

array([0])

In [122]:
import numpy as np

frase_teste = "I am sad!"

# frase_teste_processada = pd.Series([frase_teste]).apply(lambda x: x.lower())
frase_test_idf = tfidf_vectorizer.transform([frase_teste])
np.array(frase_test_idf)

resultado = model.predict(frase_test_idf)

print(f"A frase foi classificada como: {resultado}")

A frase foi classificada como: [0]
