# EVALUATION

In [1]:
# import the necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl  
import plotly.express as px

In [2]:
# import the necessary packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
from other import tools # tools.py

In [4]:
# Helper that normalizes the text of a single tweet; it is applied to the dataframes further down to build their `tweet` column

def clean_tweet(tweet):
    """Normalize a raw tweet into a space-separated string of kept tokens.

    The text is split on whitespace. Each token is lower-cased and then run
    through the substitution tables read from ``tools`` in this order:
    ``replace_numbers_dict``, ``replace_punctuation_dict``,
    ``replace_accent_dict``. A token is dropped when it is listed in
    ``tools.model_stop_wprds`` (that spelling is the attribute name this code
    reads) or when the substitutions leave it empty.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (for example the NaN that pandas
        uses for a missing cell) is treated as an empty tweet instead of
        raising ``AttributeError`` on ``.split()``.

    Returns
    -------
    str
        The kept tokens joined by one space each; ``''`` when nothing is kept.
    """
    if not isinstance(tweet, str):
        return ''

    kept_tokens = []
    for w in tweet.split():
        wl = w.lower()  # word in lower case

        # replace numbers
        for number, new_value in tools.replace_numbers_dict.items():
            wl = wl.replace(number, new_value)

        # replace some punctuation marks to keep only words
        for punctuation, new_value in tools.replace_punctuation_dict.items():
            wl = wl.replace(punctuation, new_value)

        # replace accent marks
        for accent, new_value in tools.replace_accent_dict.items():
            wl = wl.replace(accent, new_value)

        # Skip tokens emptied by the substitutions so they do not leave
        # stray separators in the result.
        if wl and wl not in tools.model_stop_wprds:
            kept_tokens.append(wl)

    return ' '.join(kept_tokens)

In [5]:
# Tweets that carry a `sentiment` label (used further down to train the classifier)
rating_csv_url = 'https://raw.githubusercontent.com/GDLPLearning/Sentiment-Analysis-for-the-MDP/master/notebooks/Exploratory/data/sentiment.csv'
# Keyword tweets that are scored by the classifier further down
tweets_csv_url = 'https://raw.githubusercontent.com/GDLPLearning/Sentiment-Analysis-for-the-MDP/master/notebooks/Exploratory/data/tweets_keywords_2019_2022_interim.csv'

df_rating = pd.read_csv(rating_csv_url)
df = pd.read_csv(tweets_csv_url)

In [6]:
# Add a `tweet` column holding the cleaned version of `full_text` to both frames
for frame in (df_rating, df):
    frame['tweet'] = frame['full_text'].apply(clean_tweet)

In [7]:
# Hold out 20% of the labelled tweets; the fixed random_state keeps the split repeatable
tweets_train, tweets_test = train_test_split(
    df_rating,
    test_size=0.2,
    random_state=0,
)

In [8]:
# Target: the sentiment label of each training tweet
y_train_bow = tweets_train['sentiment']

# Features: bag-of-words counts, with the vocabulary learned from the training tweets
vectorizer = CountVectorizer()
x_train_bow = vectorizer.fit_transform(tweets_train['tweet'])

In [9]:
# Fit a logistic regression on the bag-of-words training features
lr_model_all = LogisticRegression(solver="liblinear", C=1)
lr_model_all.fit(x_train_bow, y_train_bow)

LogisticRegression(C=1, solver='liblinear')

In [10]:
# Encode the unlabelled keyword tweets with the vectorizer that was already
# fitted on the training tweets. Reusing it (rather than fitting a second
# CountVectorizer) guarantees the feature columns are the ones lr_model_all
# was trained on.
x_predic_bow = vectorizer.transform(df['tweet'])

In [11]:
# Score every keyword tweet with the trained model
# Hard class label per tweet
hist_pred_lr_all = lr_model_all.predict(x_predic_bow)
# Class probabilities per tweet (one column per class)
hist_pred_lr_prob = lr_model_all.predict_proba(x_predic_bow)

In [12]:
# Keep the column at index 1 of predict_proba for every tweet.
# NOTE(review): presumably the probability of the positive class (label 1) - confirm against lr_model_all.classes_
probabilities = list(hist_pred_lr_prob[:, 1])

In [13]:
# Attach the model output to the tweets. `Predicted_probability` is always the
# index-1 column of predict_proba, not the probability of the predicted label.
df = df.assign(
    predicted_sentiment=hist_pred_lr_all,
    Predicted_probability=probabilities,
)

In [14]:
# Display the scored frame, now including `predicted_sentiment` and `Predicted_probability`
df

Unnamed: 0,full_text,key_word,date,hour,day,day_of_week,month,year,num_words,num_char,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,2020-01-10 22:51:09+00:00,22,10,4,1,2020,16,89,acorde colombianos ex trabajo gustaba medellin...,1,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,2020-01-10 22:06:30+00:00,22,10,4,1,2020,24,138,josefbarriosg vladdo uber apoyar uber disminuy...,1,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,2020-01-10 19:59:40+00:00,19,10,4,1,2020,44,269,tierragro sabemos labor campo implica tener bu...,0,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,2020-01-10 19:17:54+00:00,19,10,4,1,2020,42,257,seguridadmed nadie secreto general mas facil c...,0,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,2020-01-10 18:09:24+00:00,18,10,4,1,2020,35,229,kmilohurtado saquelargowin tengo casi decena a...,0,0.398427
...,...,...,...,...,...,...,...,...,...,...,...,...,...
244497,@lumalo El desorden en todo lo relacionado a ...,movilidad,2022-05-21 15:39:13+00:00,15,21,5,5,2022,15,93,lumalo desorden relacionado movilidad transito...,0,0.412423
244498,#Movilidad ¡Taxista! Así rotará el pico y pla...,movilidad,2022-05-21 15:13:26+00:00,15,21,5,5,2022,22,141,movilidad taxista rotara pico placa medellin s...,0,0.330331
244499,Faltan días para dar inicio a nuestras giras ...,movilidad,2022-05-21 01:29:49+00:00,1,21,5,5,2022,33,212,faltan dar inicio nuestras giras presenciales ...,1,0.501993
244500,@sttmed señores secretaria de la movilidad de...,movilidad,2022-05-21 00:25:54+00:00,0,21,5,5,2022,50,248,sttmed señores movilidad ciudad medellin pregu...,0,0.418485


In [15]:
#df_sent.replace({'predicted_sentiment': {0: 'Negative', 1: 'Positive'}}, inplace=True)

In [16]:
# Preview the first rows of the scored tweets
df.head() 

Unnamed: 0,full_text,key_word,date,hour,day,day_of_week,month,year,num_words,num_char,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,2020-01-10 22:51:09+00:00,22,10,4,1,2020,16,89,acorde colombianos ex trabajo gustaba medellin...,1,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,2020-01-10 22:06:30+00:00,22,10,4,1,2020,24,138,josefbarriosg vladdo uber apoyar uber disminuy...,1,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,2020-01-10 19:59:40+00:00,19,10,4,1,2020,44,269,tierragro sabemos labor campo implica tener bu...,0,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,2020-01-10 19:17:54+00:00,19,10,4,1,2020,42,257,seguridadmed nadie secreto general mas facil c...,0,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,2020-01-10 18:09:24+00:00,18,10,4,1,2020,35,229,kmilohurtado saquelargowin tengo casi decena a...,0,0.398427


In [17]:
# Number of tweets for every month / year / keyword / predicted label combination
count_keys = ['month', 'year', 'key_word', 'predicted_sentiment']
df.groupby(count_keys).size().reset_index(name='frequency')

Unnamed: 0,month,year,key_word,predicted_sentiment,frequency
0,1,2019,cultura,0,189
1,1,2019,cultura,1,308
2,1,2019,empresa,0,273
3,1,2019,empresa,1,35
4,1,2019,jovenes,0,131
...,...,...,...,...,...
719,12,2021,tecnologia,1,82
720,12,2021,trabajo,0,386
721,12,2021,trabajo,1,574
722,12,2021,vida,0,568


In [18]:
# NOTE(review): same preview as the earlier df.head() cell; the groupby cell in between does not assign to df
df.head()

Unnamed: 0,full_text,key_word,date,hour,day,day_of_week,month,year,num_words,num_char,tweet,predicted_sentiment,Predicted_probability
0,Me acordé que a los colombianos de mi ex trab...,trabajo,2020-01-10 22:51:09+00:00,22,10,4,1,2020,16,89,acorde colombianos ex trabajo gustaba medellin...,1,0.552759
1,@JoseFBarriosG @VLADDO @Uber Como van a apoya...,trabajo,2020-01-10 22:06:30+00:00,22,10,4,1,2020,24,138,josefbarriosg vladdo uber apoyar uber disminuy...,1,0.577848
2,En Tierragro sabemos que la labor en el campo...,trabajo,2020-01-10 19:59:40+00:00,19,10,4,1,2020,44,269,tierragro sabemos labor campo implica tener bu...,0,0.491671
3,@seguridadmed PARA NADIE ES UN SECRETO GENERA...,trabajo,2020-01-10 19:17:54+00:00,19,10,4,1,2020,42,257,seguridadmed nadie secreto general mas facil c...,0,0.353633
4,@kmilohurtado_81 @SaqueLargoWin Tengo casi un...,trabajo,2020-01-10 18:09:24+00:00,18,10,4,1,2020,35,229,kmilohurtado saquelargowin tengo casi decena a...,0,0.398427


In [19]:
# Share of tweets predicted positive (label 1) for every (year, keyword) pair.
# One groupby replaces the per-year crosstab loop. It does not need
# DataFrame.append (removed in pandas 2.0), never sums the string `key_word`
# column, and yields 0.0 instead of a KeyError when a year has no positive
# prediction.
df_sent_freq = (
    df.assign(porc_pos=df['predicted_sentiment'].eq(1).astype(int))  # 1 = predicted positive
    .groupby(['year', 'key_word'], as_index=False)['porc_pos']
    .mean()  # mean of a 0/1 flag = fraction of positive tweets in the group
)
df_sent_freq = df_sent_freq.sort_values(by=['year','key_word'],ascending=True)
df_sent_freq = df_sent_freq.reset_index(drop=True)
# NOTE(review): the y axis is the positive share (`porc_pos`), while the title still says "Frequency of Keywords"
px.line(df_sent_freq, x='year', y='porc_pos',color='key_word' ,title='Frequency of Keywords in the Tweets',width=500)
    

In [34]:
def plotly_month_keyword(year,keyword):
    """Line chart of the monthly share of positive tweets for selected keywords.

    Reads the notebook-level ``df`` and keeps the rows whose ``year`` equals
    ``year``. For each (month, key_word) pair it computes ``porc_pos``, the
    fraction of rows whose ``predicted_sentiment`` is 1.

    The table is built with one groupby rather than a crosstab per month.
    This avoids summing the string ``key_word`` column, avoids the
    ``KeyError`` raised when a month has no positive prediction (the share is
    0.0 instead), and avoids the ``KeyError`` raised by the sort when the
    year has no rows at all.

    Parameters
    ----------
    year
        Value compared with ``==`` against ``df['year']``.
    keyword : list-like
        Keywords to plot; passed to ``isin`` on the ``key_word`` column.

    Returns
    -------
    The figure created by ``px.line``.
    """
    df_plot = df[df['year'] == year]

    df_sent_freq = (
        df_plot.assign(porc_pos=df_plot['predicted_sentiment'].eq(1).astype(int))  # 1 = predicted positive
        .groupby(['month', 'key_word'], as_index=False)['porc_pos']
        .mean()  # mean of a 0/1 flag = fraction of positive tweets in the group
        .sort_values(by=['month', 'key_word'], ascending=True)
        .reset_index(drop=True)
    )

    # NOTE(review): `porc_pos` is a fraction between 0 and 1 although the axis label reads '%'
    fig=px.line(df_sent_freq[df_sent_freq['key_word'].isin(keyword)], 
        x='month', 
        y='porc_pos',
        color='key_word',
        markers=True,
        title='Frequency of Keywords in the Tweets',
        width=1000,
        height=600,
        color_discrete_sequence=px.colors.sequential.Turbo,
        labels={'porc_pos': '% Rate Positive Tweets', 'month': 'Months'},)
    # White page and plot background
    fig.layout.paper_bgcolor = '#FFFFFF'
    fig.layout.plot_bgcolor = '#FFFFFF'
    fig.update_layout(title_font_size=15)     
    return fig 

In [35]:
# Monthly positive share during 2022 for three keywords
plotly_month_keyword(year=2022, keyword=['cultura','vida','metro'])

In [38]:
def freq_year(year,keyword):
    """Grouped bar chart of tweet counts per keyword, split by predicted sentiment.

    Reads the notebook-level ``df``.

    Parameters
    ----------
    year : list-like
        Years to keep; passed to ``isin`` on the ``year`` column. Its repr is
        also interpolated into the chart title.
    keyword : list-like
        Keywords to show; passed to ``isin`` on the ``key_word`` column.

    Returns
    -------
    The figure created by ``px.bar``.
    """
    sentiment_names = {0: 'Negative', 1: 'Positive'}

    # Count tweets per (keyword, predicted label) within the requested years,
    # then swap the numeric labels for readable names.
    selected_years = df[df['year'].isin(year)]
    counts = (
        selected_years.groupby(['key_word','predicted_sentiment'])
        .size()
        .reset_index()
        .rename(columns={0: 'frequency'})
        .replace({'predicted_sentiment': sentiment_names})
    )
    counts = counts[counts['key_word'].isin(keyword)]

    bar_colors = {'Positive':'#5BC0BE', 'Negative':'#1C2541'}
    fig = px.bar(
        counts,
        x='key_word',
        y='frequency',
        color='predicted_sentiment',
        title=f'Frequency of tweets by sentiment {year}',
        barmode='group',
        text_auto='.2s',
        color_discrete_map=bar_colors,
        width=1000,
        height=600,
        labels={'key_word':'Keywords'},
    )
    fig.update_traces(marker_line_width=1.5, opacity=0.7)
    # Title size plus white page and plot background
    fig.update_layout(
        title_font_size=15,
        paper_bgcolor='#FFFFFF',
        plot_bgcolor='#FFFFFF',
    )
    return fig

In [40]:
# Tweet counts by predicted sentiment over all four years for three keywords
freq_year(year=[2019,2020,2021,2022], keyword=['vida','cultura','trabajo'])

In [42]:
#df[['full_text','key_word','date','month','year','predicted_sentiment','Predicted_probability']].to_csv('data/dataset_report.csv',index=False)