In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns

### Functions

In [None]:
def contains_only_digits(s):
    """
    Check whether a string consists solely of digits, using a regular expression.

    Args:
    s (str): The string to evaluate.

    Returns:
    bool: True if the string is non-empty and every character is a digit,
    False otherwise (including for the empty string).
    """
    # fullmatch() only succeeds when the *entire* string matches the pattern.
    # The earlier re.match(r'^\d+$', s) also accepted a trailing newline
    # (e.g. "123\n") because `$` matches just before a final "\n".
    # Note: on str patterns `\d` matches any Unicode decimal digit.
    return re.fullmatch(r'\d+', s) is not None

### Load data

In [None]:
# Load the preprocessed dataset from a pickle file into a DataFrame.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
# NOTE(review): the file name suggests stemming/lemmatization/stop-word preprocessing — confirm upstream.
df = pd.read_pickle("../data/preprocessed/stemm_lemm_stop_words.pkl")

In [None]:
# Display the first row to inspect the available columns and how values are formatted.
df.head(1)

In [None]:
# Summary statistics for each individual interaction column.
df[['Likes','Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care']].describe()

It seems that **"Love", "Wow", "Haha", "Sad", "Angry", "Care"** carry very little information: for each of these interactions, only about 25% of the rows have data (i.e. a non-zero value).

- People do not use those reaction buttons!

# Total Interactions

- **¿?** `Total Interactions` = `Likes` + `Comments` + `Shares` + `Love` + `Wow` + `Haha` + `Sad` + `Angry` + `Care`

In [None]:
# Hypothesis check: for every row whose first selected column is a digit-only
# string, compare that value (as an int) with the sum of the remaining
# selected columns. Rows whose first value is not digit-only are skipped.
# NOTE(review): assumes columns 2..11 are the reported total followed by the
# individual interaction counts — confirm against df.columns.
tot_inter_hipotesis = [
    int(row.values[0]) == sum(row.values[1:])
    for _, row in df.iloc[:, 2:12].iterrows()
    if contains_only_digits(row.values[0])
]

In [None]:
# True only if every checked row satisfied the equality.
# Caveat: all([]) is also True, so this passes vacuously if no row was checked.
all(tot_inter_hipotesis)

# Correlation between interactions

In [None]:
# List the column names available in the DataFrame.
df.columns

In [None]:
# Pairwise correlation between the interaction columns (DataFrame.corr() defaults to Pearson).
corr_matrix = df[['Likes','Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care']].corr()

In [None]:
# Annotated heatmap of the interaction correlation matrix,
# drawn on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlación interacciones')
plt.show()

There seems to be no correlation between the variables. Why?
- When was the data collected? When were the "Love", "Wow", "Haha", "Sad", "Angry", "Care" interactions introduced?
- Were those interactions measured only after they were created? I mean, are we measuring "Angry" at a time when that option did not exist yet?

In [None]:
# Scatter plot of "Likes" against "Angry" with both axes on a log scale.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="Likes", y="Angry", ax=ax)
ax.set_yscale('log')
ax.set_xscale('log')
plt.show()

In [None]:
# One box plot per interaction type, each facet with its own log-scaled x axis.
interaction_cols = ['Likes','Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry', 'Care']
# Reshape to long format: pd.melt yields a 'variable' column (the original
# column name) and a 'value' column (the count).
interactions_long = pd.melt(df[interaction_cols])
g = sns.FacetGrid(
    interactions_long,
    col='variable',
    col_wrap=3,
    sharex=False,
    sharey=False,
)
g.map(sns.boxplot, 'value')
g.set(xscale='log')
g.set_titles('{col_name}')

Because **"Love", "Wow", "Haha", "Sad", "Angry", "Care"** carry minimal information, I will analyze **"Likes"**.

# Likes

In [None]:
# Histogram of the "Likes" column; the y axis (bin counts) is log-scaled.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df["Likes"], bins=20, color='skyblue', edgecolor='black')
ax.set_title('Histograma de la columna Likes')
ax.set_xlabel("Likes")
ax.set_ylabel('log Frequency')
ax.set_yscale('log')
ax.grid(True)
plt.show()