# EVALUATION

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl  
import plotly.express as px

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from other import tools

In [4]:
# Normalise a raw tweet into a single string of cleaned, lower-cased tokens.

def clean_tweet(tweet):
  """Return `tweet` lower-cased, with mapped substrings replaced and stop words removed.

  The tweet is split on whitespace. Each token is lower-cased and then passed
  through three substring-replacement tables from `tools`, in this order:
  `replace_numbers_dict`, `replace_punctuation_dict`, `replace_accent_dict`
  (each maps a substring to its replacement). Tokens that end up empty, or
  that appear in `tools.model_stop_wprds`, are dropped.

  Args:
    tweet: Raw tweet text. Any non-string value (for example the float NaN
      that pandas uses for a missing text cell) is treated as an empty tweet.

  Returns:
    The surviving tokens joined by single spaces; '' when nothing survives.
  """
  # A missing cell arrives as float NaN, which has no .split(); treat it as empty
  # instead of raising AttributeError in the middle of a Series.apply().
  if not isinstance(tweet, str):
    return ''

  # Order matters only if the tables interact; keep the original sequence.
  replacement_tables = (
    tools.replace_numbers_dict,
    tools.replace_punctuation_dict,
    tools.replace_accent_dict,
  )
  # NOTE(review): 'model_stop_wprds' looks like a typo for 'model_stop_words',
  # but the name lives in `tools`, so it is referenced as-is here.
  stop_words = tools.model_stop_wprds

  kept_tokens = []
  for word in tweet.split():
    token = word.lower()
    for table in replacement_tables:
      for old, new in table.items():
        token = token.replace(old, new)

    # Skip tokens the replacements reduced to nothing (e.g. a bare number),
    # otherwise they would show up as doubled spaces in the result.
    if token and token not in stop_words:
      kept_tokens.append(token)

  return ' '.join(kept_tokens)

In [5]:
# Tweets with a 'sentiment' label, read from a local file; used to fit the model.
rating_csv = 'data/sentiment_1_0.csv'
df_rating = pd.read_csv(rating_csv)

# Tweets to be scored by the fitted model, read from a remote file.
sent_csv = 'https://raw.githubusercontent.com/GDLPLearning/Sentiment-Analysis-for-the-MDP/master/notebooks/Exploratory/data/tweets_interim.csv'
df_sent = pd.read_csv(sent_csv)

In [6]:
# Give both frames a 'tweet' column holding the cleaned form of 'full_text'.
for frame in (df_rating, df_sent):
  frame['tweet'] = frame['full_text'].apply(clean_tweet)

In [7]:
# Hold out 20% of the labelled rows; the fixed random_state makes the split repeatable.
tweets_train, tweets_test = train_test_split(
  df_rating,
  test_size=0.2,
  random_state=0,
)

In [8]:
# Target labels for the training rows.
y_train_bow = tweets_train['sentiment']

# Learn the vocabulary from the training tweets and build their bag-of-words matrix.
train_texts = tweets_train['tweet']
vectorizer = CountVectorizer()
x_train_bow = vectorizer.fit_transform(train_texts)

In [9]:
# Fit a logistic-regression classifier on the bag-of-words training matrix.
lr_settings = {"C": 1, "solver": "liblinear"}
lr_model_all = LogisticRegression(**lr_settings)
lr_model_all.fit(x_train_bow, y_train_bow)

LogisticRegression(C=1, solver='liblinear')

In [10]:
# Build the feature matrix for the tweets to score, using the vectorizer that
# was already fitted on the training tweets before the model was trained.
# Creating and fitting a second CountVectorizer here was redundant, and it only
# stayed aligned with the model's feature columns because it happened to be
# refitted on exactly the same data; reusing the fitted one removes that risk.
x_predic_bow = vectorizer.transform(df_sent['tweet'])

In [11]:
# Score the vectorised tweets: one predicted label per row, plus the
# per-class probability estimates the labels are based on.
hist_pred_lr_all = lr_model_all.predict(x_predic_bow)
hist_pred_lr_prob = lr_model_all.predict_proba(x_predic_bow)

In [12]:
# Keep only the second probability column, as a plain list.
# NOTE(review): column 1 corresponds to lr_model_all.classes_[1]; presumably the
# sentiment == 1 class given 0/1 labels — confirm against classes_.
probabilities = list(hist_pred_lr_prob[:, 1])

In [13]:
# Attach the predicted label and its probability to the scored tweets.
new_columns = {
  'predicted_sentiment': hist_pred_lr_all,
  'Predicted_probability': probabilities,
}
for column, values in new_columns.items():
  df_sent[column] = values

In [14]:
# Show the scored frame; the rendered output is abbreviated to its first and last rows.
df_sent

Unnamed: 0,full_text,key_word,hour,day_of_week,month,year,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,22,4,1,2020,acorde colombianos ex trabajo gustaba medellin...,1,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,22,4,1,2020,josefbarriosg vladdo uber apoyar uber disminuy...,1,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,19,4,1,2020,tierragro sabemos labor campo implica tener bu...,0,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,19,4,1,2020,seguridadmed nadie secreto general mas facil c...,0,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,18,4,1,2020,kmilohurtado saquelargowin tengo casi decena a...,0,0.398427
...,...,...,...,...,...,...,...,...,...
244497,@lumalo El desorden en todo lo relacionado a ...,movilidad,15,5,5,2022,lumalo desorden relacionado movilidad transito...,0,0.412423
244498,#Movilidad ¡Taxista! Así rotará el pico y pla...,movilidad,15,5,5,2022,movilidad taxista rotara pico placa medellin s...,0,0.330331
244499,Faltan días para dar inicio a nuestras giras ...,movilidad,1,5,5,2022,faltan dar inicio nuestras giras presenciales ...,1,0.501993
244500,@sttmed señores secretaria de la movilidad de...,movilidad,0,5,5,2022,sttmed señores movilidad ciudad medellin pregu...,0,0.418485
