# EVALUATION

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl  
import plotly.express as px

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
from other import tools

In [26]:
# clean the tweet and save it on a new column

def clean_tweet(tweet):
  """Return a normalised, space-separated version of ``tweet``.

  Each whitespace-separated word is lower-cased and then rewritten using the
  replacement tables in ``tools`` (numbers, punctuation, accents, applied in
  that order). Words found in ``tools.model_stop_wprds`` are dropped, and the
  remaining words are joined with single spaces.

  Parameters
  ----------
  tweet : str
      Raw tweet text. Any non-string value (for example the NaN pandas uses
      for a missing cell) is treated as an empty tweet.

  Returns
  -------
  str
      The cleaned tweet; ``''`` when nothing is left or the input is not text.
  """
  # Missing values reach this function as float NaN / None when it is used
  # with ``Series.apply``; they have no ``split`` method.
  if not isinstance(tweet, str):
    return ''

  kept_words = []
  for w in tweet.split():
    wl = w.lower()  # word in lower case

    # replace numbers
    for number, new_value in tools.replace_numbers_dict.items():
      wl = wl.replace(number, new_value)

    # replace some punctuation marks to keep only words
    for punctuation, new_value in tools.replace_punctuation_dict.items():
      wl = wl.replace(punctuation, new_value)

    # replace accent marks
    for accent, new_value in tools.replace_accent_dict.items():
      wl = wl.replace(accent, new_value)

    # Skip words that the replacements reduced to nothing (they would only
    # contribute stray spaces) as well as stop words.
    # NOTE(review): ``model_stop_wprds`` is the attribute name used by this
    # notebook; it looks like a typo of "words" inside ``tools`` — confirm.
    if wl and wl not in tools.model_stop_wprds:
      kept_words.append(wl)

  return ' '.join(kept_words)

In [27]:
# Labelled tweets (local file): used below for training via the `full_text` and `sentiment` columns.
df_rating = pd.read_csv('data/sentiment_1_0.csv')

# Unlabelled tweets (fetched from GitHub): these are the ones the trained model scores.
df_sent = pd.read_csv(
    'https://raw.githubusercontent.com/GDLPLearning/Sentiment-Analysis-for-the-MDP/master/notebooks/Exploratory/data/tweets_interim.csv'
)

In [28]:
# Add a cleaned-text `tweet` column to both the labelled and the unlabelled frame.
for tweets_frame in (df_rating, df_sent):
    tweets_frame['tweet'] = tweets_frame['full_text'].apply(clean_tweet)

In [29]:
# Hold out 20% of the labelled tweets as test data; the fixed seed keeps the split reproducible.
tweets_train, tweets_test = train_test_split(
    df_rating,
    test_size=0.2,
    random_state=0,
)

In [30]:
# Bag-of-words features: the vocabulary is learned from the training tweets only.
vectorizer = CountVectorizer()
y_train_bow = tweets_train['sentiment']
x_train_bow = vectorizer.fit_transform(tweets_train['tweet'])

In [31]:
# Fit a logistic-regression classifier on the bag-of-words training matrix.
lr_model_all = LogisticRegression(solver="liblinear", C=1)
lr_model_all.fit(X=x_train_bow, y=y_train_bow)

LogisticRegression(C=1, solver='liblinear')

In [32]:
# Encode the unlabelled tweets with the vectorizer that was already fitted on the
# training tweets. The model was trained on that exact feature space, so the fitted
# vocabulary must be reused (transform only) rather than re-created and refitted here.
x_predic_bow = vectorizer.transform(df_sent['tweet'])

In [33]:
# Score every unlabelled tweet: hard class labels and per-class probabilities.
hist_pred_lr_all = lr_model_all.predict(x_predic_bow)
hist_pred_lr_prob = lr_model_all.predict_proba(x_predic_bow)

In [34]:
# Keep only the second probability column, i.e. the class at `lr_model_all.classes_[1]`
# (presumably the label 1 / positive class — confirm against the training labels).
probabilities = list(hist_pred_lr_prob[:, 1])

In [35]:
# Attach the predicted label and the second-class probability to each tweet.
df_sent = df_sent.assign(
    predicted_sentiment=hist_pred_lr_all,
    Predicted_probability=probabilities,
)

In [36]:
# Inspect the tweets together with their predicted label and probability.
df_sent

Unnamed: 0,full_text,key_word,hour,day_of_week,month,year,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,22,4,1,2020,acorde colombianos ex trabajo gustaba medellin...,1,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,22,4,1,2020,josefbarriosg vladdo uber apoyar uber disminuy...,1,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,19,4,1,2020,tierragro sabemos labor campo implica tener bu...,0,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,19,4,1,2020,seguridadmed nadie secreto general mas facil c...,0,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,18,4,1,2020,kmilohurtado saquelargowin tengo casi decena a...,0,0.398427
...,...,...,...,...,...,...,...,...,...
244497,@lumalo El desorden en todo lo relacionado a ...,movilidad,15,5,5,2022,lumalo desorden relacionado movilidad transito...,0,0.412423
244498,#Movilidad ¡Taxista! Así rotará el pico y pla...,movilidad,15,5,5,2022,movilidad taxista rotara pico placa medellin s...,0,0.330331
244499,Faltan días para dar inicio a nuestras giras ...,movilidad,1,5,5,2022,faltan dar inicio nuestras giras presenciales ...,1,0.501993
244500,@sttmed señores secretaria de la movilidad de...,movilidad,0,5,5,2022,sttmed señores movilidad ciudad medellin pregu...,0,0.418485


In [43]:
# Turn the numeric class labels into readable names (0 -> Negative, 1 -> Positive).
df_sent['predicted_sentiment'] = df_sent['predicted_sentiment'].replace(
    {0: 'Negative', 1: 'Positive'}
)

In [44]:
# Preview the first five rows after relabelling.
df_sent.head(n=5)

Unnamed: 0,full_text,key_word,hour,day_of_week,month,year,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,22,4,1,2020,acorde colombianos ex trabajo gustaba medellin...,Positive,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,22,4,1,2020,josefbarriosg vladdo uber apoyar uber disminuy...,Positive,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,19,4,1,2020,tierragro sabemos labor campo implica tener bu...,Negative,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,19,4,1,2020,seguridadmed nadie secreto general mas facil c...,Negative,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,18,4,1,2020,kmilohurtado saquelargowin tengo casi decena a...,Negative,0.398427


In [48]:
# Count tweets per (month, year, keyword, predicted sentiment) combination.
(
    df_sent
    .groupby(['month', 'year', 'key_word', 'predicted_sentiment'])
    .size()
    .reset_index(name='frequency')
)

Unnamed: 0,month,year,key_word,predicted_sentiment,frequency
0,1,2019,cultura,Negative,189
1,1,2019,cultura,Positive,308
2,1,2019,empresa,Negative,273
3,1,2019,empresa,Positive,35
4,1,2019,jovenes,Negative,131
...,...,...,...,...,...
719,12,2021,tecnologia,Positive,82
720,12,2021,trabajo,Negative,386
721,12,2021,trabajo,Positive,574
722,12,2021,vida,Negative,568


In [74]:
# Preview the first rows again.
# NOTE(review): the saved output of this cell (execution count 74) shows a `month-year`
# column that no cell visible in this notebook creates — presumably it came from a
# deleted or out-of-order cell. Confirm and restore that step so that
# Restart & Run All reproduces this output.
df_sent.head()

Unnamed: 0,full_text,key_word,hour,day_of_week,month,year,tweet,predicted_sentiment,Predicted_probability,month-year
0,Me acordé que a los colombianos de mi ex trab...,trabajo,22,4,1,2020,acorde colombianos ex trabajo gustaba medellin...,Positive,0.552759,2020-01-01
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,22,4,1,2020,josefbarriosg vladdo uber apoyar uber disminuy...,Positive,0.577848,2020-01-01
2,En Tierragro sabemos que la labor en el campo...,trabajo,19,4,1,2020,tierragro sabemos labor campo implica tener bu...,Negative,0.491671,2020-01-01
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,19,4,1,2020,seguridadmed nadie secreto general mas facil c...,Negative,0.353633,2020-01-01
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,18,4,1,2020,kmilohurtado saquelargowin tengo casi decena a...,Negative,0.398427,2020-01-01


In [57]:
# Negative/Positive tweet counts per keyword, restricted to tweets from 2022.
is_2022 = df_sent['year'] == 2022
pd.crosstab(
    df_sent.loc[is_2022, 'key_word'],
    df_sent.loc[is_2022, 'predicted_sentiment'],
).reset_index()

predicted_sentiment,key_word,Negative,Positive
0,cultura,1074,1680
1,empresa,1919,176
2,jovenes,1511,1424
3,metro,4686,516
4,movilidad,1721,452
5,seguridad,4372,896
6,tecnologia,211,524
7,trabajo,2392,3052
8,vida,2711,3648


In [67]:
# Share of tweets predicted 'Positive' for every (year, keyword) pair.
#
# One small frame is built per year and all of them are concatenated once at the end:
# `DataFrame.append` no longer exists in pandas 2.x, and growing a frame inside a loop
# copies it on every iteration.
yearly_shares = []
for year in df_sent['year'].unique():
    tweets_year = df_sent[df_sent['year'] == year]
    counts = pd.crosstab(tweets_year['key_word'], tweets_year['predicted_sentiment'])

    # Row totals are taken while `key_word` is still the index, so only the numeric
    # sentiment-count columns are summed (never the keyword strings).
    share = (counts['Positive'] / counts.sum(axis=1)).rename('freq_pos').reset_index()
    share.insert(0, 'year', year)
    yearly_shares.append(share)

df_sent_freq = (
    pd.concat(yearly_shares, ignore_index=True)
    .sort_values(by=['year', 'key_word'], ascending=True)
)
df_sent_freq.head(50)
    

predicted_sentiment,year,key_word,freq_pos
18,2019,cultura,0.636192
19,2019,empresa,0.155399
20,2019,jovenes,0.634869
21,2019,metro,0.116343
22,2019,movilidad,0.333698
23,2019,seguridad,0.314616
24,2019,tecnologia,0.67429
25,2019,trabajo,0.71868
26,2019,vida,0.687711
0,2020,cultura,0.626825


In [69]:
# Line chart, one line per keyword, of the share of tweets predicted positive per year.
# The title and axis labels describe what `freq_pos` actually holds (a positive share,
# not a keyword frequency).
px.line(
    df_sent_freq,
    x='year',
    y='freq_pos',
    color='key_word',
    title='Share of Positive Tweets per Keyword by Year',
    labels={'year': 'Year', 'freq_pos': 'Share of positive tweets', 'key_word': 'Keyword'},
    width=500,
)