In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import sys
import seaborn as sns
# Make the project's `src` directory importable so that `utils.utils` resolves.
# The location is resolved from the notebook's working directory instead of a
# user-specific absolute path, so the notebook also runs on other machines.
# NOTE(review): assumes the notebook is executed from a direct sub-folder of
# `src` (consistent with the '../utils/spanish_stopwords.txt' path read further
# down) — confirm against the repository layout.
from pathlib import Path
sys.path.insert(0, str(Path('..').resolve())) # Local
from utils.utils import formato_abreviado, generate_N_grams, clean_text, get_corpus_N_gram
from matplotlib.ticker import FuncFormatter
from collections import Counter

# Exploratory Data Analysis (EDA)

## Read and preprocess data

In [None]:
# Load the raw Facebook pages export; every later frame derives from `data`.
data = pd.read_csv(
    filepath_or_buffer='../../data/raw/facebook_pages/CC_FBpages_2017.csv'
)

In [None]:
# Columns kept for the analysis: page/post metadata, interaction counters,
# the text fields and the overperforming score.
selected_columns = [
    'Page Name',
    'Facebook Id',
    'Likes at Posting',
    'Post Created',
    'Type',
    'Total Interactions',
    'Likes',
    'Comments',
    'Shares',
    'Love',
    'Wow',
    'Haha',
    'Sad',
    'Angry',
    'Care',
    'Message',
    'Image Text',
    'Link Text',
    'Description',
    'Overperforming Score (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )',
]
df_selected = data[selected_columns]

In [None]:
# Preview the first five rows of the selected columns.
df_selected.head(n=5)

In [None]:
# (rows, columns) of the selected frame, before any row is dropped.
df_selected.shape

Drop the rows whose `Message` column is *NaN*

In [None]:
# Count the missing messages once with the vectorised pandas API and reuse the
# value for both the absolute and the relative report.
n_missing = int(df_selected['Message'].isna().sum())
print(f"Cantidad de NAN: {n_missing}")
print(f"Cantidad de NAN (%): {round(n_missing/df_selected.shape[0]*100, 5)}%")
# `.copy()` makes `df_clean` an independent frame: columns are added to it in
# later cells, and writing into a filtered slice raises SettingWithCopyWarning.
# The original index labels are kept on purpose (no reset_index).
df_clean = df_selected.dropna(subset=['Message']).copy()

Show some messages

In [None]:
# Print a reproducible random sample of messages.
random.seed(14112023)
# The sampled integers are row *positions* (0 .. n_rows - 1), so they must be
# resolved with `.iloc`. Label-based `[...]` lookup fails with KeyError for any
# label removed by `dropna`, and can never reach labels >= n_rows.
n_rows = len(df_clean)
for position in random.sample(range(n_rows), min(10, n_rows)):
    print(df_clean['Message'].iloc[position])
    print('\n')

### Length of Messages

In [None]:
# Per-message length features.
# - messageChar_length: number of characters in the raw message.
# - messageWords_length: number of whitespace-separated tokens. `.str.split()`
#   without an argument splits on any run of whitespace, so newlines, tabs and
#   repeated spaces neither merge words nor create empty "words" the way
#   `split(' ')` does.
# Missing values (if any) propagate as NaN through the `.str` accessors.
# `assign` returns a new frame, so nothing is written into a filtered slice
# (no SettingWithCopyWarning) and the cell can be re-run safely.
df_clean = df_clean.assign(
    messageChar_length=df_clean['Message'].str.len(),
    messageWords_length=df_clean['Message'].str.split().str.len(),
)

In [None]:
# Distribution of message length in characters (150 bins).
df_clean['messageChar_length'].plot.hist(bins=150)

It is curious that there are "a lot" of messages with a length of approximately $5,000$ characters.

In [None]:
# Summary statistics of the character-length column.
df_clean.loc[:, 'messageChar_length'].describe()

In [None]:
# Show one message whose length is exactly 4999 characters; 89 is an index
# label (not a position) of such a row.
df_clean.loc[df_clean['messageChar_length'] == 4999, 'Message'].loc[89]

In [None]:
# Print the last 20 characters of every 4999-character message to see how
# those messages end.
is_max_length = df_clean['messageChar_length'] == 4999
for message_tail in df_clean.loc[is_max_length, 'Message'].str[-20:]:
    print(message_tail)

It seems that Facebook messages/posts are truncated at $4,999$ characters!

In [None]:
#plt.hist(df_clean['Message'].to_numpy(), bins=150, color='#E24A33', alpha=0.7)
#plt.hist(df_clean['Message'])

In [None]:
#sns.histplot(data=df_selected, x='Message', bins=150, kde=False, color='#E24A33')
#sns.histplot(data=df_selected, x='Message')

In [None]:
# Distribution of message length in words (100 bins).
df_clean['messageWords_length'].plot.hist(bins=100)

## N-grams

### Uni-gram

#### Raw

First, let's analyze the **raw** text (without any changes).

In [None]:
# Uni-grams over the untouched messages, with no stop-word list.
# NOTE(review): `get_corpus_N_gram` is a project helper; presumably it returns
# the n-gram corpus and draws a plot when show_plot=True — confirm in utils.
corpus = get_corpus_N_gram(
    list_text=df_clean['Message'],
    stop_words=None,
    ngram=1,
    show_plot=True,
)

These are meaningless words; all of them (and similar ones) will be added to the **stop-words**, so they won't be used in future analysis.

#### With stop-words

In [None]:
# Load the Spanish stop-word list, one word per line.
# The encoding is stated explicitly: the list contains accented characters and
# the platform default encoding is not UTF-8 everywhere (e.g. on Windows).
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('../utils/spanish_stopwords.txt', 'r', encoding='utf-8') as archivo:
    stop_words = [linea.strip() for linea in archivo]
len(stop_words)

In [None]:
# Peek at the first ten stop-words.
stop_words[0:10]

In [None]:
# Uni-grams over the raw messages, this time passing the stop-word list.
corpus = get_corpus_N_gram(
    list_text=df_clean['Message'],
    stop_words=stop_words,
    ngram=1,
    show_plot=True,
)

- Case differences must be removed (text converted to lowercase), so that "*Cambio*" and "*cambio*" are counted as the same word.
- The author suggests removing accents.
- Do not include punctuation signs: **, ; . : '**
- Do not include URLs inside the text.

#### Stop-words, text to lowercase, no URLs: clean text

In [None]:
# Add the cleaned version of every message, produced by the project helper
# `clean_text` (passed directly; no lambda wrapper is needed).
# NOTE(review): per the section title, `clean_text` presumably lowercases the
# text and strips URLs — confirm in utils.
# `assign` returns a new frame, so nothing is written into a filtered slice
# (no SettingWithCopyWarning) and the cell can be re-run safely.
df_clean = df_clean.assign(Message_clean=df_clean['Message'].apply(clean_text))

In [None]:
# Uni-grams over the cleaned messages, with the stop-word list.
corpus = get_corpus_N_gram(
    list_text=df_clean['Message_clean'],
    stop_words=stop_words,
    ngram=1,
    show_plot=True,
)

### Bi-gram

In [None]:
# Bi-grams over the cleaned messages, with the stop-word list.
corpus = get_corpus_N_gram(
    list_text=df_clean['Message_clean'],
    stop_words=stop_words,
    ngram=2,
    show_plot=True,
)

### Tri-gram

In [None]:
# Tri-grams over the cleaned messages, with the stop-word list.
# The text is passed as `list_text=` like every other call in this notebook,
# so the call does not depend on the helper's positional parameter order.
corpus = get_corpus_N_gram(
    list_text=df_clean['Message_clean'],
    stop_words=stop_words,
    ngram=3,
    show_plot=True,
)

In [None]:
# Persist the frame (original columns plus the columns added in this notebook)
# for later stages.
df_clean.to_pickle(
    path='../../data/preprocessed/clean_text_nanook.pkl'
)