In [None]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
# Global plot theme: seaborn "whitegrid" with black text, axis labels, axis
# edges and ticks, white axes and a light grey-blue figure background.
estilo_rc={
    clave: "black"
    for clave in ("text.color","axes.labelcolor","axes.edgecolor","xtick.color","ytick.color")
}
estilo_rc.update({"axes.facecolor": "#FFFFFF", "figure.facecolor": "#F5F8FA"})
sns.set(style="whitegrid", rc=estilo_rc)

In [None]:
# Load the tweets DataFrame from disk and renumber its rows 0..n-1, discarding
# whatever index was stored in the pickle.
# NOTE(review): unpickling can execute arbitrary code — only load
# "final_df.pkl" if it was produced by a trusted source.
df=pd.read_pickle("final_df.pkl").reset_index(drop=True)

In [None]:
# Summary statistics (pandas describe) for the tweet, user and device columns.
df[["tweet","usuario","dispositivo"]].describe()

In [None]:
# Mean number of hashtags per tweet, rounded to zero decimals.
hashtags_por_tweet=df["hashtags"].map(len)
round(hashtags_por_tweet.mean(),0)

In [None]:
# Mean number of mentioned users per tweet, rounded to zero decimals.
menciones_por_tweet=df["users"].map(len)
round(menciones_por_tweet.mean(),0)

In [None]:
# The ten most frequent values of the "dispositivo" column with their counts.
df["dispositivo"].value_counts().head(10)

In [None]:
# Flatten the per-tweet hashtag collections into one flat list, in tweet order.
hashtags=[etiqueta for lista in df["hashtags"] for etiqueta in lista]


In [None]:
# Concatenate every hashtag into a single comma-separated, lower-cased string.
join_hash=(",").join(hashtags).lower()

In [None]:
# Delete every occurrence of the text "#amlo," from the joined string, then
# split it back into a list on commas.
# Because the pattern includes the comma, an "#amlo" that is the very last
# hashtag in the string is not removed by this step.
reg_amlo=re.compile(r"#amlo,")
clean_hash=reg_amlo.sub("",join_hash).split(",")

In [None]:
# Drop any "#amlo" entry still present in the list. The regex that built
# clean_hash only matches "#amlo," (with the comma), so an "#amlo" at the very
# end of the joined string survives it.
# This replaces a hard-coded `clean_hash.pop(67417)`, which depended on the
# exact dataset and removed a further element every time the cell was re-run.
# NOTE(review): presumably index 67417 was that trailing "#amlo" — confirm.
clean_hash=[etiqueta for etiqueta in clean_hash if etiqueta!="#amlo"]

In [None]:
# Wrap the cleaned hashtag list in a Series so pandas counting methods apply.
hashtag_series=pd.Series(clean_hash)

In [None]:
# Number of distinct hashtags.
hashtag_series.nunique()

In [None]:
# The 30 most frequent hashtags (index) with their occurrence counts (values).
top_hash=hashtag_series.value_counts().head(30)

In [None]:
# Horizontal bar chart of the 30 most used hashtags: counts on x, hashtags on y.
estilo_ticks={"size":20,"weight":"light"}
plt.figure(figsize=(20,10))
sns.barplot(
    y=top_hash.index,
    x=top_hash.values,
    palette="GnBu_r",
    edgecolor="black",
    lw=1,
)
plt.xticks(**estilo_ticks)
plt.yticks(**estilo_ticks)
plt.title("Hashtags más usados",size=25,weight="bold",pad=15)
plt.xlabel("Número de tweets",size=22,weight="bold",labelpad=20)
plt.tight_layout()

In [None]:
# Flatten the per-tweet lists of mentioned users into one flat list, in tweet order.
mentions=[usuario for lista in df["users"] for usuario in lista]

In [None]:
# Wrap the mentions list in a Series so pandas counting methods apply.
mentions_series=pd.Series(mentions)

In [None]:
# Number of distinct mentioned users.
mentions_series.nunique()

In [None]:
# The 30 most mentioned users (index) with their mention counts (values).
top_mentions=mentions_series.value_counts().head(30)

In [None]:
# Horizontal bar chart of the 30 most mentioned users: counts on x, users on y.
# The x ticks are fixed at 0, 500, ..., 5500 rather than derived from the data.
estilo_ticks={"size":20,"weight":"light"}
plt.figure(figsize=(20,10))
sns.barplot(
    y=top_mentions.index,
    x=top_mentions.values,
    palette="GnBu_r",
    edgecolor="black",
    lw=1,
)
plt.xticks(ticks=np.arange(0,6000,500),**estilo_ticks)
plt.yticks(**estilo_ticks)
plt.title("Usuarios más mencionados",size=25,weight="bold",pad=15)
plt.xlabel("Número de tweets",size=22,weight="bold",labelpad=20)
plt.tight_layout()

In [None]:
from nltk.corpus import stopwords
# Spanish stop-word list from NLTK.
# NOTE(review): this needs the NLTK "stopwords" corpus to be installed locally
# (e.g. via nltk.download("stopwords")) — confirm the environment provides it.
stop=stopwords.words('spanish')

In [None]:
def quitar_acentos(texto):
    """Return the words of ``texto`` with lowercase accented vowels unaccented.

    ``texto`` is an iterable of strings. The strings are joined with single
    spaces, the characters á, é, í, ó, ú are replaced by a, e, i, o, u, and the
    result is split on whitespace. As a consequence an entry containing
    whitespace yields several words and empty entries disappear. Uppercase
    accented vowels and other diacritics (ñ, ü) are left untouched.
    """
    sin_acentos=str.maketrans("áéíóú","aeiou")
    return (" ").join(texto).translate(sin_acentos).split()
    

In [None]:
# Replace the stop-word list with its accent-stripped version.
# NOTE(review): presumably the tweet text was accent-stripped upstream, so the
# stop words must match that form — confirm.
stop=quitar_acentos(stop)

In [None]:
def texto_a_palabras(texto):
    """Split ``texto`` on whitespace and return the words not in the global ``stop`` list.

    Word order and duplicates are preserved; matching is exact (case-sensitive).
    """
    palabras_utiles=[]
    for palabra in texto.split():
        if palabra in stop:
            continue
        palabras_utiles.append(palabra)
    return palabras_utiles

In [None]:
# Tokenise every tweet: whitespace split with stop words removed.
df["tokens"]=df["tweet"].apply(texto_a_palabras)

In [None]:
# Count how many times each token occurs across all tweets (word -> count).
conteo={}
for tokens_tweet in df["tokens"]:
    for palabra in tokens_tweet:
        conteo[palabra]=conteo.get(palabra,0)+1

In [None]:
# Turn the word -> count dict into a Series indexed by word.
conteo_palabras=pd.Series(conteo)

In [None]:
# The 30 most frequent words, highest count first.
conteo_palabras.sort_values(ascending=False).head(30)

In [None]:
# Flatten the per-tweet token lists into one flat list, in tweet order.
lista_tokens=[palabra for tokens_tweet in df["tokens"] for palabra in tokens_tweet]

In [None]:
# One space-separated string containing every token of every tweet.
palabras=(" ").join(lista_tokens)

In [None]:
# Word cloud of all tokens: 5500x3000 canvas, white background, at most 500 words.
generador=WordCloud(
    width=5500,
    height=3000,
    max_words=500,
    background_color="white",
)
wordcloud = generador.generate(palabras)

# Show the cloud as an image with the axes hidden.
plt.figure(figsize=(25,15))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

In [None]:
def _leer_lineas(ruta):
    """Return the lines of the UTF-8 text file at ``ruta`` without line endings."""
    with open(ruta, "r", encoding="utf-8") as archivo:
        return archivo.read().splitlines()


# One lexicon entry per line in each file.
palabras_positivas=_leer_lineas("positive_words_es.txt")
palabras_negativas=_leer_lineas("negative_words_es.txt")
groserias=_leer_lineas("groserías.txt")

In [None]:
# Strip accents from both sentiment lexicons with the helper used for the stop words.
# quitar_acentos joins the entries and re-splits on whitespace, so a line that
# contains several words becomes several single-word entries and blank lines
# are dropped.
# NOTE(review): groserias is not passed through quitar_acentos; if that list
# contains accented words while the tokens do not, those entries can never
# match — confirm whether it should be normalised too.
palabras_positivas=quitar_acentos(palabras_positivas)
palabras_negativas=quitar_acentos(palabras_negativas)

In [None]:
# Number of entries in the positive lexicon (duplicates included).
len(palabras_positivas)

In [None]:
# Number of entries in the negative lexicon (duplicates included).
len(palabras_negativas)

In [None]:
# Number of lines read from the swear-word file.
len(groserias)

In [None]:
# Number of non-null entries in conteo_palabras, i.e. how many distinct words were counted.
conteo_palabras.count()

In [None]:
# Tweets per day: non-null "tweet" values for each (mes, dia) pair.
# groupby sorts the result by its keys, so rows are ordered by mes, then dia.
mes_contador=df.groupby(["mes","dia"])["tweet"].count()

In [None]:
# Average number of tweets per (mes, dia) group.
mes_contador.mean()

In [None]:
# Tweets per value of "hora", pooled over all days.
hora_contador=df.groupby("hora")["tweet"].count()

In [None]:
# Two bar charts sharing the y axis: tweets per day (left) and tweets per hour
# pooled over all days (right).
#
# Day labels are looked up per (mes, dia) group and aligned to mes_contador's
# index. df["fecha_str"].unique() returns values in order of first appearance,
# which matches the key-sorted groupby result only when df happens to be in
# chronological order; otherwise bars get the wrong dates.
etiquetas_dia=df.groupby(["mes","dia"])["fecha_str"].first().reindex(mes_contador.index)

figure=plt.figure(figsize=(20,10))
ax1= figure.add_subplot(1,2,1)
ax2=figure.add_subplot(1,2,2,sharey = ax1)

# Left panel: tweets per day.
sns.barplot(x=etiquetas_dia.values,color="#1DA1F2",y=mes_contador.values,lw=1,edgecolor="black",orient="v",ax=ax1)
ax1.tick_params(axis="x", labelsize=20,labelrotation=90)
ax1.tick_params(axis="y", labelsize=20)
ax1.set_title("Tweets por día",fontsize=25,fontweight="bold",pad=20)
ax1.set_xlabel("Fecha",fontsize=20,fontweight="bold",labelpad=20)
ax1.set_ylabel("Número de tweets",fontsize=20,fontweight="bold",labelpad=20)

# Right panel: tweets per hour; its y tick labels are hidden because the axis
# is shared with the left panel.
sns.barplot(x=hora_contador.index,color="#1DA1F2",y=hora_contador.values,lw=1,edgecolor="black",orient="v",ax=ax2)
plt.setp(ax2.get_yticklabels(), visible=False)
ax2.tick_params(axis="x", labelsize=20)
ax2.tick_params(axis="y", labelsize=20)
ax2.set_title("Tweets acumulados por hora",fontsize=25,fontweight="bold",pad=20)
ax2.set_xlabel("Tiempo del centro, horario de verano (UTC -5)",fontsize=20,fontweight="bold",labelpad=20)

figure.tight_layout()
plt.subplots_adjust(wspace = 0.05)


In [None]:
def puntaje(texto):
    """Return the sentiment score of a token sequence.

    Each token found in the global ``palabras_positivas`` adds one point; a
    token not found there but found in ``palabras_negativas`` subtracts one.
    Tokens in neither list do not affect the score.
    """
    positivas=negativas=0
    for palabra in texto:
        if palabra in palabras_positivas:
            positivas+=1
        elif palabra in palabras_negativas:
            negativas+=1
    return positivas-negativas

In [None]:
def contador_groserias(texto):
    """Return how many tokens of ``texto`` appear in the global ``groserias`` list.

    Repeated tokens are counted every time they occur.
    """
    return sum(1 for palabra in texto if palabra in groserias)

In [None]:
# Sentiment score of every tweet, computed from its tokens.
df["puntaje"]=df["tokens"].apply(puntaje)

In [None]:
# Number of swear words in every tweet, computed from its tokens.
df["conteo_groserias"]=df["tokens"].apply(contador_groserias)

In [None]:
def multiplicador(punt,gros):
    """Amplify a sentiment score according to the number of swear words.

    Returns ``punt * 3`` when ``gros`` is greater than 1, ``punt * 2`` when
    ``gros`` equals 1, and ``punt`` unchanged otherwise.
    """
    if gros>1:
        return punt*3
    if gros==1:
        return punt*2
    return punt
    

In [None]:
# Final score per tweet: the sentiment score amplified by its swear-word count.
df["puntaje_final"]=[
    multiplicador(punt,gros)
    for punt,gros in zip(df["puntaje"],df["conteo_groserias"])
]

In [None]:
# Keep only the tweets whose final score is non-zero.
# .copy() makes df_puntaje an independent frame, so adding columns to it later
# does not raise SettingWithCopyWarning or depend on whether pandas returned a
# view or a copy of df.
df_puntaje=df[df["puntaje_final"]!=0].copy()

In [None]:
# Label every scored tweet by the sign of its final score: "positivo" when it
# is greater than zero, "negativo" otherwise.
# assign() builds a new frame instead of writing a column into a filtered slice
# of df (which raises SettingWithCopyWarning), and the score is read from
# df_puntaje itself rather than relying on index alignment with df.
df_puntaje=df_puntaje.assign(
    sentimiento=np.where(df_puntaje["puntaje_final"]>0,"positivo","negativo")
)

In [None]:
# Number of tweets per sentiment label.
df_puntaje["sentimiento"].value_counts()

In [None]:
# Tweets per (mes, dia, sentimiento): non-null "tweet" values in each group.
claves_sent=["mes","dia","sentimiento"]
df_sent=df_puntaje.groupby(claves_sent)["tweet"].count()

In [None]:
# Daily number of positive vs negative tweets as two dashed lines.
#
# unstack gives one row per (mes, dia) and one column per sentiment, with 0 for
# a day that has no tweets of a given sentiment. Both lines therefore share the
# same x positions; taking each sentiment with xs() separately shifts one line
# against the other whenever a day is missing from one of them.
sent_por_dia=df_sent.unstack(level=2,fill_value=0)

# Day labels are looked up per (mes, dia) group and aligned to the plotted
# index, instead of relying on fecha_str.unique() (order of appearance)
# matching the key-sorted groupby order.
etiquetas_dia=df_puntaje.groupby(["mes","dia"])["fecha_str"].first().reindex(sent_por_dia.index)

sent_por_dia["positivo"].plot(lw=2,ls="--",color="green",figsize=(20,10),label="Positivo")
sent_por_dia["negativo"].plot(lw=2,ls="--",color="red",figsize=(20,10),label="Negativo")
plt.xticks(ticks=np.arange(len(sent_por_dia)),labels=etiquetas_dia.values,size=20,rotation=90)
# y ticks are fixed at 0, 50, ..., 850 rather than derived from the data.
plt.yticks(np.arange(0, 851 , 50),size=20)
plt.title("Sentimiento de los tweets",weight="bold",pad=15,size=22)
plt.xlabel("Fecha",weight="bold",labelpad=20,size=18)
plt.ylabel("Número de tweets",weight="bold",labelpad=20,size=18)
plt.legend(loc='center left', bbox_to_anchor=(1, 0.5),fontsize="x-large")
plt.tight_layout()

In [None]:
# Sum of the tweet scores for each (mes, dia) pair.
# NOTE(review): this sums "puntaje" (the score before the swear-word
# multiplier), not "puntaje_final" — confirm that is intended.
df_suma=df_puntaje.groupby(["mes","dia"])["puntaje"].sum()

In [None]:
def indice(x,maximo=None,minimo=None):
    """Scale a daily score sum into the range [-1, 1].

    Positive values are divided by the largest daily sum and negative values by
    the magnitude of the smallest one, so the best day maps to 1 and the worst
    day to -1.

    Parameters
    ----------
    x : number
        Daily score sum to scale.
    maximo, minimo : number, optional
        Reference maximum and minimum. When omitted they are read from the
        global ``df_suma`` (``df_suma.max()`` / ``df_suma.min()``), and only
        for the branch that needs them.

    Returns
    -------
    float
        ``x / maximo`` for ``x > 0``, ``-(x / minimo)`` for ``x < 0`` and
        ``0.0`` otherwise. The original left the result unassigned for
        ``x == 0`` and raised UnboundLocalError.
    """
    if x>0:
        if maximo is None:
            maximo=df_suma.max()
        return x/maximo
    if x<0:
        if minimo is None:
            minimo=df_suma.min()
        # x and minimo are both negative: the ratio is positive, so negate it.
        return -(x/minimo)
    return 0.0

In [None]:
# Scale every daily score sum with indice().
df_indice=df_suma.apply(indice)

In [None]:
# Daily sentiment index as a dashed line.
# Day labels are looked up per (mes, dia) group and aligned to df_indice's
# index; fecha_str.unique() follows order of appearance and only matches the
# key-sorted groupby order when the frame is already chronological.
etiquetas_dia=df_puntaje.groupby(["mes","dia"])["fecha_str"].first().reindex(df_indice.index)
df_indice.plot(lw=2,ls="--",color="red",figsize=(20,10))
plt.xticks(ticks=np.arange(len(df_indice)),labels=etiquetas_dia.values,size=15,rotation=90)
plt.title("Sentimiento general diario de los tweets",weight="bold",pad=15,size=22)
plt.xlabel("Fecha",weight="bold",labelpad=20,size=18)
plt.yticks(size=15)
plt.ylabel("Índice",weight="bold",labelpad=20,size=18)