# Preparing the data

In [3]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.probability import FreqDist
import pandas as pd
import ast
import fs

In [17]:
PROCESSED_FILES_DIR = fs.open_fs("MCD-NLP-HPT/data/processed")
TRAIN_DIR = PROCESSED_FILES_DIR.getsyspath("train.csv")
TEST_DIR = PROCESSED_FILES_DIR.getsyspath("test.csv")
VAL_DIR = PROCESSED_FILES_DIR.getsyspath("validation.csv")

INTERIM_COMMENTS_CLEANED_ONLY_SW_DIR = INTERIM_FILES_DIR.getsyspath("Comments_cleaned_only_sw.csv")
TRAIN_ONLY_SW_DIR = PROCESSED_FILES_DIR.getsyspath("train_only_sw.csv")
TEST_ONLY_SW_DIR = PROCESSED_FILES_DIR.getsyspath("test_only_sw.csv")
VAL_ONLY_SW_DIR = PROCESSED_FILES_DIR.getsyspath("validation_only_sw.csv")

### Clase principal para el algoritmo

Recuerda que la clase más probable viene dada por (en espacio de cómputo logarítmico): 


$$\hat{c} = {\arg \max}_{(c)}\log{P(c)}
 +\sum_{i=1}^n
\log{ P(f_i \vert c)}
$$

Donde, para evitar casos atípicos, usaremos el suavizado de Laplace así:

$$
P(f_i \vert c) = \frac{C(f_i, c)+1}{C(c) + \vert V \vert}
$$

siendo $\vert V \vert$ la longitud del vocabulario de nuestro conjunto de entrenamiento. 

In [18]:
import math
from collections import defaultdict
from nltk.probability import FreqDist
import numpy as np


class NaiveBayesClassifier:
    def __init__(self):
        self.unique_classes = set()
        self.vocab = set()
        self.class_count = {}  # C(c)
        self.log_class_prior_prob = {}  # P(c)
        self.word_conditional_counts = defaultdict(lambda: defaultdict(float))  # C(w|c)

    def fit(self, x_data, y_data):
        """
        Train the Naive Bayes Classifier on the given data.
        :param x_data: List of tokenized texts.
        :param y_data: List of class labels corresponding to x_data.
        """
        num_samples = len(x_data)
        self.unique_classes = set(y_data)

        # Count occurrences of each class
        for cls in y_data:
            self.class_count[cls] = self.class_count.get(cls, 0) + 1

        # Compute log prior probabilities for each class
        for cls, count in self.class_count.items():
            self.log_class_prior_prob[cls] = math.log(count / num_samples)

        # Count word occurrences per class
        for tokens, cls in zip(x_data, y_data):
            counts = FreqDist(tokens)
            for word, count in counts.items():
                self.vocab.add(word)
                self.word_conditional_counts[cls][word] += count

    def predict(self, data):
        """
        Predict class labels for the given data.
        :param data: List of tokenized texts.
        :return: List of predicted class labels.
        """
        results = []

        for text in data:
            words = set(FreqDist(text))  # Unique words in the text
            score_prob = {}

            for cls in self.unique_classes:
                # Initialize with prior probability
                score_prob[cls] = self.log_class_prior_prob[cls]

                for word in words:
                    if word in self.vocab:
                        # Laplace smoothing
                        word_count = self.word_conditional_counts[cls].get(word, 0.0)
                        log_word_prob = math.log(
                            (word_count + 1) / (self.class_count[cls] + len(self.vocab))
                        )
                        score_prob[cls] += log_word_prob

            # Find the class with the highest score
            predicted_class = max(score_prob, key=score_prob.get)
            results.append(predicted_class)

        return results

In [7]:
df_train = pd.read_csv(TRAIN_DIR)
df_test = pd.read_csv(TEST_DIR)
df_val = pd.read_csv(VAL_DIR)

In [19]:
df_train_2 = pd.read_csv(TRAIN_ONLY_SW_DIR)
df_test_2 = pd.read_csv(TEST_ONLY_SW_DIR)
df_val_2 = pd.read_csv(VAL_ONLY_SW_DIR)

In [8]:
df_train

Unnamed: 0,id,videoId,textOriginal,authorDisplayName,likeCount,publishedAt,category_id,category_description,Tokens_full,Tokens,Tokens_without_stopwords,Tokens_without_stopwords_stemmed,Tokens_lemmatized,Tokens_without_stopwords_lemmatized
0,UgwtT3n4ZD6aX2c3NrF4AaABAg,FXxcNjTZ4qo,"No es un mito, Paraguay 🇵🇾 si existe.....",@rosaelenarolonespinosa4666,1,2024-04-18T13:32:34Z,7,Comentarios generales,"['No', 'es', 'un', 'mito', ',', 'Paraguay', 's...","['No', 'es', 'un', 'mito', 'Paraguay', 'si', '...","['No', 'mito', 'Paraguay', 'si', 'existe', '...']","['no', 'mit', 'paraguay', 'si', 'exist', '...']","['no', 'ser', 'uno', 'mito', ',', 'paraguay', ...","['no', 'mito', 'paraguay', 'si', 'existir', '...."
1,Ugzlbev9x6glV6YNpyN4AaABAg,CPCN1Lqzc2U,"Un podcast sobre la radio con DIANA URIBE, la ...",@KarimVG,0,2024-06-26T05:30:57Z,5,Felicitaciones y agradecimientos,"['Un', 'podcast', 'sobre', 'la', 'radio', 'con...","['Un', 'podcast', 'sobre', 'la', 'radio', 'con...","['Un', 'podcast', 'radio', 'DIANA', 'URIBE', '...","['un', 'podcast', 'radi', 'dian', 'urib', 'mej...","['uno', 'podcast', 'sobre', 'el', 'radio', 'co...","['uno', 'podcast', 'radio', 'diana', 'URIBE', ..."
2,UgzIKrDRdcGk13aMtul4AaABAg,ShW6FY-vbmo,"Cada vez que los escucho, muero de risa y me d...",@ceciliamendiola8646,2,2024-06-18T21:15:49Z,6,Comentarios humorísticos o memes,"['Cada', 'vez', 'que', 'los', 'escucho', ',', ...","['Cada', 'vez', 'que', 'los', 'escucho', 'muer...","['Cada', 'vez', 'escucho', 'muero', 'risa', 'd...","['cad', 'vez', 'escuch', 'muer', 'ris', 'dan',...","['cada', 'vez', 'que', 'él', 'escuchar', ',', ...","['cada', 'vez', 'escuchar', 'muero', 'risa', '..."
3,Ugz5Q5R5ls4pXTXSi294AaABAg,HxMXLWqe9HQ,"Que buen capitulo, son geniales! El cristo del...",@anaberthalyochoaporras8262,0,2024-09-19T04:47:09Z,5,Felicitaciones y agradecimientos,"['Que', 'buen', 'capitulo', ',', 'son', 'genia...","['Que', 'buen', 'capitulo', 'son', 'geniales',...","['Que', 'buen', 'capitulo', 'geniales', 'El', ...","['que', 'buen', 'capitul', 'genial', 'el', 'cr...","['que', 'buen', 'capitulo', ',', 'ser', 'genia...","['que', 'buen', 'capitulo', 'genial', 'el', 'c..."
4,UgwKlLDQnWVL_LOpE1d4AaABAg,wCm4FNSnDPs,"Gracias por el podcast de la semana, mis amore...",@LadySkywalkerW,0,2024-10-11T03:38:25Z,5,Felicitaciones y agradecimientos,"['Gracias', 'por', 'el', 'podcast', 'de', 'la'...","['Gracias', 'por', 'el', 'podcast', 'de', 'la'...","['Gracias', 'podcast', 'semana', 'amores', 'Lo...","['graci', 'podcast', 'seman', 'amor', 'los', '...","['gracia', 'por', 'el', 'podcast', 'de', 'el',...","['gracia', 'podcast', 'semana', 'amor', 'el', ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,Ugx0EIHPfWZ1EV9dG4p4AaABAg,yFHCam_Q9j4,Me encantó todo. Gran episodio 🎉! ¡Felices 100...,@Cynthia_Gutierrez,0,2024-04-28T21:23:19Z,5,Felicitaciones y agradecimientos,"['Me', 'encantó', 'todo', '.', 'Gran', 'episod...","['Me', 'encantó', 'todo', 'Gran', 'episodio', ...","['Me', 'encantó', 'Gran', 'episodio', 'Felices...","['me', 'encant', 'gran', 'episodi', 'felic', '...","['yo', 'encantar', 'todo', '.', 'Gran', 'episo...","['yo', 'encantar', 'gran', 'episodio', 'Felice..."
918,UgwSryWi0MPvIPVHDop4AaABAg,7Jo3NR5lgd8,"Voy a investigar el conflicto de myanmar, ahor...",@antoniodejesus8241,15,2024-02-08T01:48:46Z,3,Experiencias personales,"['Voy', 'a', 'investigar', 'el', 'conflicto', ...","['Voy', 'a', 'investigar', 'el', 'conflicto', ...","['Voy', 'investigar', 'conflicto', 'myanmar', ...","['voy', 'investig', 'conflict', 'myanm', 'ahor...","['ir', 'a', 'investigar', 'el', 'conflicto', '...","['ir', 'investigar', 'conflicto', 'myanmar', '..."
919,Ugyn1Vg1Epb_ZoYbspB4AaABAg,FOFeh_vfcD8,A ver a qué hora sacan otro podcast ? Ya pasar...,@YamilOrtega-rl9vb,1,2024-07-15T09:50:33Z,1,Quejas o sugerencias de mejora,"['A', 'ver', 'a', 'qué', 'hora', 'sacan', 'otr...","['A', 'ver', 'a', 'qué', 'hora', 'sacan', 'otr...","['A', 'ver', 'hora', 'sacan', 'podcast', 'Ya',...","['a', 'ver', 'hor', 'sac', 'podcast', 'ya', 'p...","['a', 'ver', 'a', 'qué', 'hora', 'sacar', 'otr...","['a', 'ver', 'hora', 'sacar', 'podcast', 'ya',..."
920,Ugzu_MhkRKhOWF0SIk14AaABAg,yFHCam_Q9j4,Me encantó ver las fotos en el video de YouTub...,@veroaranda9618,0,2024-04-27T02:51:54Z,5,Felicitaciones y agradecimientos,"['Me', 'encantó', 'ver', 'las', 'fotos', 'en',...","['Me', 'encantó', 'ver', 'las', 'fotos', 'en',...","['Me', 'encantó', 'ver', 'fotos', 'video', 'Yo...","['me', 'encant', 'ver', 'fot', 'vide', 'youtub']","['yo', 'encantar', 'ver', 'el', 'foto', 'en', ...","['yo', 'encantar', 'ver', 'foto', 'vídeo', 'yo..."


In [23]:
df_train.columns

Index(['id', 'videoId', 'textOriginal', 'authorDisplayName', 'likeCount',
       'publishedAt', 'category_id', 'category_description', 'Tokens_full',
       'Tokens', 'Tokens_without_stopwords',
       'Tokens_without_stopwords_stemmed', 'Tokens_lemmatized',
       'Tokens_without_stopwords_lemmatized'],
      dtype='object')

Create dataframe using only the `Tokens_without_stopwords_lemmatized` column from the `df_train`, `df_test` and `df_val` dataframe.

In [14]:
import ast
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Función para evaluar de manera segura
def safe_eval(value):
    """
    Convierte un valor de texto en su equivalente literal de Python, si es necesario.
    Si ya es una lista, lo devuelve tal cual.
    """
    try:
        if isinstance(value, str):
            return ast.literal_eval(value)
        return value  # Si no es str, asume que ya es una lista
    except (ValueError, SyntaxError):
        print(f"Error parsing: {value}")
        return []  # Devuelve una lista vacía en caso de error

# Función para filtrar filas con listas no vacías
def filter_non_empty_rows(df, column):
    """
    Filtra un DataFrame para que solo contenga filas donde la columna especificada
    tiene listas no vacías.
    """
    return df[df[column].apply(lambda x: len(x) > 0)]

# Función para generar reportes detallados
def generate_classification_report(y_true, y_pred, dataset_name):
    """
    Genera y muestra un reporte detallado de métricas de clasificación.
    """
    print(f"\n=== Classification Report for {dataset_name} ===")
    print(classification_report(y_true, y_pred))
    print(f"Confusion Matrix for {dataset_name}:\n{confusion_matrix(y_true, y_pred)}")

# Función principal para procesar una columna y generar reportes
def process_column_with_reports(df_train, df_test, df_val, column, classifier_class):
    """
    Procesa una columna aplicando limpieza, filtrado y entrenamiento del modelo,
    y genera reportes detallados de clasificación para train, test y val.
    """
    # Limpieza
    df_train[column] = df_train[column].apply(safe_eval)
    df_test[column] = df_test[column].apply(safe_eval)
    df_val[column] = df_val[column].apply(safe_eval)
    
    # Filtrar filas con listas no vacías
    filtered_train_df = filter_non_empty_rows(df_train, column)
    filtered_test_df = filter_non_empty_rows(df_test, column)
    filtered_val_df = filter_non_empty_rows(df_val, column)
    
    # Dividir en X (features) e Y (target)
    x_train = filtered_train_df[column]
    y_train = filtered_train_df['category_id']
    x_test = filtered_test_df[column]
    y_test = filtered_test_df['category_id']
    x_val = filtered_val_df[column]
    y_val = filtered_val_df['category_id']
    
    # Crear y entrenar el clasificador
    classifier = classifier_class()
    classifier.fit(x_train, y_train)
    
    # Predecir para train, test y val
    y_train_pred = classifier.predict(x_train)
    y_test_pred = classifier.predict(x_test)
    y_val_pred = classifier.predict(x_val)
    
    # Generar reportes detallados
    generate_classification_report(y_train, y_train_pred, "Train")
    generate_classification_report(y_test, y_test_pred, "Test")
    generate_classification_report(y_val, y_val_pred, "Validation")
    
    return {
        "train_accuracy": (y_train == y_train_pred).mean(),
        "test_accuracy": (y_test == y_test_pred).mean(),
        "val_accuracy": (y_val == y_val_pred).mean(),
    }


In [24]:
cols = ['Tokens_full', 'Tokens', 'Tokens_without_stopwords', 'Tokens_without_stopwords_stemmed', 'Tokens_lemmatized',
       'Tokens_without_stopwords_lemmatized']

for col in cols:
    print(f"Processing column: {col}")
    accuracies = process_column_with_reports(df_train, df_test, df_val, col, NaiveBayesClassifier)
    print(f"Accuracy Summary for {col}: {accuracies}")


Processing column: Tokens_full

=== Classification Report for Train ===
              precision    recall  f1-score   support

           1       0.92      0.28      0.43        82
           2       0.87      0.89      0.88       151
           3       1.00      0.13      0.23        62
           4       0.52      0.99      0.68       115
           5       0.58      0.98      0.73       191
           6       0.93      0.76      0.84       206
           7       1.00      0.17      0.28       109

    accuracy                           0.70       916
   macro avg       0.83      0.60      0.58       916
weighted avg       0.81      0.70      0.66       916

Confusion Matrix for Train:
[[ 23   6   0  27  26   0   0]
 [  0 134   0   6  11   0   0]
 [  1   1   8  14  38   0   0]
 [  0   1   0 114   0   0   0]
 [  0   2   0   1 188   0   0]
 [  0   5   0  17  27 157   0]
 [  1   5   0  41  32  12  18]]

=== Classification Report for Test ===
              precision    recall  f1-score  

In [21]:
cols = ['Tokens_without_stopwords']

for col in cols:
    print(f"Processing column: {col}")
    accuracies = process_column_with_reports(df_train_2, df_test_2, df_val_2, col, NaiveBayesClassifier)
    print(f"Accuracy Summary for {col}: {accuracies}")


Processing column: Tokens_without_stopwords

=== Classification Report for Train ===
              precision    recall  f1-score   support

           1       0.87      0.41      0.55        81
           2       0.63      0.71      0.67       145
           3       0.90      0.31      0.46        61
           4       0.64      0.67      0.65       111
           5       0.50      0.95      0.65       195
           6       0.77      0.55      0.64       179
           7       1.00      0.19      0.32        80

    accuracy                           0.62       852
   macro avg       0.76      0.54      0.56       852
weighted avg       0.71      0.62      0.60       852

Confusion Matrix for Train:
[[ 33  15   0   3  26   4   0]
 [  0 103   1   7  31   3   0]
 [  2   4  19   4  30   2   0]
 [  0   9   0  74  21   7   0]
 [  0   7   0   1 185   2   0]
 [  1  14   0  12  54  98   0]
 [  2  12   1  14  24  12  15]]

=== Classification Report for Test ===
              precision    recal

In [26]:
def collect_accuracies(df_train, df_test, df_val, cols, classifier_class):
    """
    Procesa cada columna, entrena y evalúa el modelo, y almacena las precisiones en un DataFrame.
    
    Parameters:
        df_train, df_test, df_val: DataFrames de entrenamiento, prueba y validación.
        cols: Lista de columnas a procesar.
        classifier_class: Clase del modelo de clasificación a usar.
        
    Returns:
        results_df: DataFrame con las precisiones para cada columna.
    """
    results = []

    for col in cols:
        print(f"Processing column: {col}")
        accuracies = process_column_with_reports(df_train, df_test, df_val, col, classifier_class)
        results.append({
            "Column": col,
            "Train Accuracy": accuracies["train_accuracy"],
            "Test Accuracy": accuracies["test_accuracy"],
            "Validation Accuracy": accuracies["val_accuracy"]
        })
    
    results_df = pd.DataFrame(results)
    return results_df

In [27]:
results_df = collect_accuracies(df_train, df_test, df_val, cols, NaiveBayesClassifier)
results_df

Processing column: Tokens_full

=== Classification Report for Train ===
              precision    recall  f1-score   support

           1       0.92      0.28      0.43        82
           2       0.87      0.89      0.88       151
           3       1.00      0.13      0.23        62
           4       0.52      0.99      0.68       115
           5       0.58      0.98      0.73       191
           6       0.93      0.76      0.84       206
           7       1.00      0.17      0.28       109

    accuracy                           0.70       916
   macro avg       0.83      0.60      0.58       916
weighted avg       0.81      0.70      0.66       916

Confusion Matrix for Train:
[[ 23   6   0  27  26   0   0]
 [  0 134   0   6  11   0   0]
 [  1   1   8  14  38   0   0]
 [  0   1   0 114   0   0   0]
 [  0   2   0   1 188   0   0]
 [  0   5   0  17  27 157   0]
 [  1   5   0  41  32  12  18]]

=== Classification Report for Test ===
              precision    recall  f1-score  

Unnamed: 0,Column,Train Accuracy,Test Accuracy,Validation Accuracy
0,Tokens_full,0.700873,0.48913,0.272727
1,Tokens,0.729258,0.478261,0.272727
2,Tokens_without_stopwords,0.884279,0.5,0.363636
3,Tokens_without_stopwords_stemmed,0.760917,0.543478,0.454545
4,Tokens_lemmatized,0.633406,0.445652,0.363636
5,Tokens_without_stopwords_lemmatized,0.786026,0.51087,0.363636


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=2820e488-6f1b-466d-af14-a66826f012e3' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>