# PART C

In [10]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log
from sklearn.model_selection import StratifiedKFold

def cargar_dataset(ruta_csv, nrows=None):
    """Load the semicolon-separated tweet dataset into a clean DataFrame.

    The first line of the file is skipped and the columns are named
    ``id``, ``text``, ``date`` and ``label``. Rows with a missing text or a
    label that cannot be parsed as a number are dropped, and the index is
    renumbered from zero.

    Args:
        ruta_csv: Path to the CSV file.
        nrows: Maximum number of data rows to read. ``None`` (the default)
            reads the whole file.

    Returns:
        DataFrame whose ``text`` values are strings and whose ``label``
        column is of integer dtype.
    """
    df = pd.read_csv(
        ruta_csv,
        sep=';',
        skiprows=1,
        header=None,
        names=["id", "text", "date", "label"],
        # Keep tweets as strings even when every one of them looks numeric
        # (e.g. "2015"); the classifiers tokenize them with str.split().
        dtype={"text": str},
        low_memory=False,
        nrows=nrows,
    )
    # Unparseable labels become NaN so they are removed by dropna below.
    df["label"] = pd.to_numeric(df["label"], errors='coerce')
    df = df.dropna(subset=["text", "label"]).reset_index(drop=True)
    df["label"] = df["label"].astype(int)
    return df

def construir_diccionario(train_df, min_freq=1):
    """Collect the per-class word statistics used by the Naive Bayes models.

    Tweets are tokenized on whitespace. Rows labelled 1 feed the positive
    counts and rows labelled 0 feed the negative counts.

    Args:
        train_df: DataFrame with a ``text`` column of strings and a
            ``label`` column.
        min_freq: When greater than 1, words seen fewer than ``min_freq``
            times within a class are removed from that class's counts.

    Returns:
        dict with the keys ``word_counts_pos``/``word_counts_neg``
        (Counters), ``p_pos``/``p_neg`` (class share of the training rows),
        ``vocab_size`` (distinct words kept in either class),
        ``total_words_pos``/``total_words_neg`` (sum of the kept counts) and
        ``log_prob_pos_dict``/``log_prob_neg_dict`` (unsmoothed
        ``log(count / total)`` for every kept word).
    """
    def _tally(texts):
        # Word frequencies over one class, pruned by min_freq if requested.
        tally = Counter()
        for doc in texts:
            tally.update(doc.split())
        if min_freq > 1:
            tally = Counter({tok: n for tok, n in tally.items() if n >= min_freq})
        return tally

    labels = train_df['label']
    positives = train_df[labels == 1]['text']
    negatives = train_df[labels == 0]['text']

    counts_pos = _tally(positives)
    counts_neg = _tally(negatives)

    n_tweets = len(train_df)
    n_words_pos = sum(counts_pos.values())
    n_words_neg = sum(counts_neg.values())

    return {
        'word_counts_pos': counts_pos,
        'word_counts_neg': counts_neg,
        'p_pos': len(positives) / n_tweets,
        'p_neg': len(negatives) / n_tweets,
        'vocab_size': len(counts_pos.keys() | counts_neg.keys()),
        'total_words_pos': n_words_pos,
        'total_words_neg': n_words_neg,
        # Precomputed so prediction is a plain dictionary lookup per word.
        'log_prob_pos_dict': {tok: log(n / n_words_pos) for tok, n in counts_pos.items()},
        'log_prob_neg_dict': {tok: log(n / n_words_neg) for tok, n in counts_neg.items()},
    }

def predecir_tweet_optimizado(tweet, stats):
    """Classify a tweet with the unsmoothed Naive Bayes model.

    Each class score is the log prior plus the precomputed log probability
    of every whitespace-separated word. A word missing from a class's table
    makes that class impossible (score ``-inf``).

    Args:
        tweet: Text to classify.
        stats: dict providing ``p_pos``, ``p_neg``, ``log_prob_pos_dict``
            and ``log_prob_neg_dict``.

    Returns:
        1 when the positive score is strictly greater than the negative
        score, otherwise 0 (ties, including both classes being impossible,
        go to 0).
    """
    neg_inf = float('-inf')

    # A class with prior 0 (e.g. absent from the training data) is treated
    # as impossible instead of letting log(0) raise ValueError.
    score_pos = log(stats['p_pos']) if stats['p_pos'] > 0 else neg_inf
    score_neg = log(stats['p_neg']) if stats['p_neg'] > 0 else neg_inf

    pos_table = stats['log_prob_pos_dict']
    neg_table = stats['log_prob_neg_dict']

    for word in tweet.split():
        # finite + (-inf) == -inf and (-inf) + (-inf) == -inf, so a single
        # unseen word permanently rules the class out with no special casing.
        score_pos += pos_table.get(word, neg_inf)
        score_neg += neg_table.get(word, neg_inf)

        # Nothing can change once both classes are impossible.
        if score_pos == neg_inf and score_neg == neg_inf:
            break

    return 1 if score_pos > score_neg else 0

def evaluar_modelo_optimizado(test_df, stats):
    """Return the accuracy of the unsmoothed classifier on ``test_df``.

    Every value of the ``text`` column is classified with
    ``predecir_tweet_optimizado`` and compared with the ``label`` column.
    The result is the fraction of rows where both agree.
    """
    def _classify(tweet):
        return predecir_tweet_optimizado(tweet, stats)

    predicted = test_df['text'].apply(_classify)
    expected = test_df['label'].values
    hits = predicted == expected
    return np.mean(hits)

def k_fold_evaluation(df, k=5):
    """Evaluate the unsmoothed classifier with stratified k-fold validation.

    For each fold the statistics are rebuilt from the training rows with
    ``construir_diccionario`` and scored on the held-out rows with
    ``evaluar_modelo_optimizado``. Per-fold accuracy, the best fold and the
    mean accuracy are printed.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        k: Number of folds.

    Returns:
        Tuple ``(best_fold, best_accuracy, fold_results)``, where
        ``best_fold`` is the 1-based number of the first fold reaching the
        highest accuracy and ``fold_results`` lists every fold's accuracy.
    """
    texts = df['text'].values
    labels = df['label'].values

    # Shuffled with a fixed seed so the folds are the same on every run.
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)

    accuracies = []
    top_acc, top_fold = -1, None

    for fold_idx, (train_rows, val_rows) in enumerate(splitter.split(texts, labels), start=1):
        fold_train = df.iloc[train_rows].reset_index(drop=True)
        fold_val = df.iloc[val_rows].reset_index(drop=True)

        model = construir_diccionario(fold_train)
        fold_acc = evaluar_modelo_optimizado(fold_val, model)

        accuracies.append(fold_acc)
        print(f"Fold {fold_idx}, Accuracy: {fold_acc:.4f}")

        # Strict comparison keeps the earliest fold when accuracies tie.
        if fold_acc > top_acc:
            top_acc, top_fold = fold_acc, fold_idx

    print(f"\nResultados K-Fold ({k} folds):")
    print(f"Mejor fold: {top_fold} con accuracy: {top_acc:.4f}")
    print(f"Accuracy promedio: {np.mean(accuracies):.4f}")

    return top_fold, top_acc, accuracies

# Main
if __name__ == "__main__":
    # Part C entry point: load the dataset and cross-validate the
    # unsmoothed Naive Bayes classifier.
    ruta_csv = "./CRI_Practica3/FinalStemmedSentimentAnalysisDataset.csv"
    # Passing None as nrows places no limit on the number of rows read.
    df = cargar_dataset(ruta_csv, None)
    print("Ejemplo de datos:\n", df.head())

    # Use k-fold cross-validation instead of a single split. Because the dataset is very large, a generous value of k is reasonable (it can be varied).
    k = 10
    best_fold, best_accuracy, fold_results = k_fold_evaluation(df, k=k)


Ejemplo de datos:
    id                                               text        date  label
0   1                       is so sad for my apl friend   04/03/2015      0
1   2                         i miss the new moon trail   06/10/2015      0
2   3                              omg it already 730 o   03/04/2015      1
3   4   omgag im sooo im gunn cry i've been at thi de...  13/11/2015      0
4   5                     i think mi bf is che on me tt   10/08/2015      0
Fold 1, Accuracy: 0.6642
Fold 2, Accuracy: 0.6662
Fold 3, Accuracy: 0.6645
Fold 4, Accuracy: 0.6657
Fold 5, Accuracy: 0.6655
Fold 6, Accuracy: 0.6647
Fold 7, Accuracy: 0.6653
Fold 8, Accuracy: 0.6669
Fold 9, Accuracy: 0.6648
Fold 10, Accuracy: 0.6645

Resultados K-Fold (10 folds):
Mejor fold: 8 con accuracy: 0.6669
Accuracy promedio: 0.6652


# PART B

In [33]:
def dividir_train_test(df, test_size=0.2, seed=42):
    """Split ``df`` into train and test DataFrames, stratified by ``label``.

    Args:
        df: DataFrame containing a ``label`` column.
        test_size: Passed to ``train_test_split`` as ``test_size``.
        seed: Passed to ``train_test_split`` as ``random_state``.

    Returns:
        Tuple ``(train_df, test_df)``, each with its index reset to start
        at zero.
    """
    from sklearn.model_selection import train_test_split

    parts = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df['label'],
    )
    train_part, test_part = (part.reset_index(drop=True) for part in parts)
    return train_part, test_part

# Estrategia 1: Aumentar el tamaño del conjunto de entrenamiento (sin Laplace en este ejemplo)
def estrategia_1(df, incrementos):
    """Strategy 1: grow the data used for training, rebuilding the dictionary.

    For every ``n`` in ``incrementos`` the first ``n`` rows of ``df`` are
    split into train/test with ``dividir_train_test``, the statistics are
    rebuilt from that train part and the unsmoothed classifier is scored on
    the test part. Note that the printed size is ``n`` (the slice before
    the split), not the number of training rows.

    Returns:
        List of ``(n, accuracy)`` tuples in the order of ``incrementos``.
    """
    print("\nEstrategia 1: Aumentar tamaño de entrenamiento (diccionario variable)")
    history = []
    for n_rows in incrementos:
        train_part, test_part = dividir_train_test(df[:n_rows])
        model = construir_diccionario(train_part)
        score = evaluar_modelo_optimizado(test_part, model)
        print(f"Tamaño Train: {n_rows}, accuracy: {score:.4f}")
        history.append((n_rows, score))
    return history

# Estrategia 2: Modificar tamaño del diccionario manteniendo el train fijo
def estrategia_2(df, min_freqs):
    """Strategy 2: vary the dictionary size on a fixed train/test split.

    ``df`` is split once with ``dividir_train_test``. For every value in
    ``min_freqs`` the statistics are rebuilt from the same training part
    with that ``min_freq`` and the unsmoothed classifier is scored on the
    same test part.

    Returns:
        List of ``(min_freq, accuracy)`` tuples in the order of
        ``min_freqs``.
    """
    print("\nEstrategia 2: Modificar tamaño del diccionario")
    train_part, test_part = dividir_train_test(df)
    history = []
    for threshold in min_freqs:
        model = construir_diccionario(train_part, min_freq=threshold)
        score = evaluar_modelo_optimizado(test_part, model)
        print(f"Frecuencia mínima: {threshold}, accuracy: {score:.4f}")
        history.append((threshold, score))
    return history

# Estrategia 3: Mantener el diccionario fijo y variar el tamaño del conjunto de entrenamiento
def estrategia_3(df, train_sizes):
    """Strategy 3: fixed dictionary, varying number of training rows.

    ``df`` is split once (20% test). The vocabulary is taken from the full
    training part. For every ``size`` in ``train_sizes`` the word counts
    are recomputed from the first ``size`` training rows, restricted to
    that vocabulary, and the unsmoothed classifier is scored on the fixed
    test part.

    Args:
        df: DataFrame with ``text`` and ``label`` columns.
        train_sizes: Iterable of row counts taken from the start of the
            training part.

    Returns:
        List of ``(size, accuracy)`` tuples in the order of ``train_sizes``.
    """
    # Imported once here instead of on every loop iteration.
    from collections import Counter

    def _count_words(texts, vocab):
        # Frequencies of the words of `texts` that belong to `vocab`.
        counts = Counter()
        for text in texts:
            counts.update(w for w in text.split() if w in vocab)
        return counts

    def _log_probs(counts):
        # Unsmoothed log(count / total). Every stored count is >= 1, and an
        # empty Counter produces an empty dict, so no division by zero can
        # occur and words absent from the dict are read as -inf downstream.
        total = sum(counts.values())
        return {w: np.log(c / total) for w, c in counts.items()}

    print("\nEstrategia 3: Modificar tamaño del conjunto de entrenamiento (diccionario fijo)")
    train_df, test_df = dividir_train_test(df, test_size=0.2)

    # Fixed dictionary: built from the complete training part.
    stats_fijo = construir_diccionario(train_df)
    full_vocab = set(stats_fijo['log_prob_pos_dict']) | set(stats_fijo['log_prob_neg_dict'])
    vocab_size = stats_fijo['vocab_size']

    resultados = []
    for size in train_sizes:
        train_subset = train_df.iloc[:size].reset_index(drop=True)

        pos_texts = train_subset[train_subset['label'] == 1]['text']
        neg_texts = train_subset[train_subset['label'] == 0]['text']

        word_counts_pos = _count_words(pos_texts, full_vocab)
        word_counts_neg = _count_words(neg_texts, full_vocab)

        total_tweets = len(train_subset)
        p_pos = len(pos_texts) / total_tweets if total_tweets > 0 else 0
        p_neg = len(neg_texts) / total_tweets if total_tweets > 0 else 0

        stats_subset = {
            'word_counts_pos': word_counts_pos,
            'word_counts_neg': word_counts_neg,
            'p_pos': p_pos,
            'p_neg': p_neg,
            'vocab_size': vocab_size,
            'log_prob_pos_dict': _log_probs(word_counts_pos),
            'log_prob_neg_dict': _log_probs(word_counts_neg),
        }

        acc = evaluar_modelo_optimizado(test_df, stats_subset)
        resultados.append((size, acc))
        print(f"Tamaño Train: {size}, accuracy: {acc:.4f}")
    return resultados

# Main (Parte B)
if __name__ == "__main__":
    # Part B entry point: run the three strategies without Laplace smoothing.
    # Reuse cargar_dataset from Part C
    df = cargar_dataset("./CRI_Practica3/FinalStemmedSentimentAnalysisDataset.csv", None)

    # From here on we only call the strategies, which rely on the Part C functions
    incrementos = [20000, 40000, 60000, 80000]
    estrategia_1(df, incrementos)

    min_freqs = [1, 3, 5, 10]
    estrategia_2(df, min_freqs)

    train_sizes = [20000, 40000, 60000, 80000]
    estrategia_3(df, train_sizes)



Estrategia 1: Aumentar tamaño de entrenamiento (diccionario variable)
Tamaño Train: 20000, accuracy: 0.5915
Tamaño Train: 40000, accuracy: 0.5641
Tamaño Train: 60000, accuracy: 0.5598
Tamaño Train: 80000, accuracy: 0.5528

Estrategia 2: Modificar tamaño del diccionario
Frecuencia mínima: 1, accuracy: 0.6626
Frecuencia mínima: 3, accuracy: 0.6238
Frecuencia mínima: 5, accuracy: 0.6075
Frecuencia mínima: 10, accuracy: 0.5873

Estrategia 3: Modificar tamaño del conjunto de entrenamiento (diccionario fijo)
Tamaño Train: 20000, accuracy: 0.5473
Tamaño Train: 40000, accuracy: 0.5631
Tamaño Train: 60000, accuracy: 0.5724
Tamaño Train: 80000, accuracy: 0.5797


# PART A

In [36]:
import pandas as pd
import numpy as np
from collections import Counter
from math import log

def predecir_tweet(tweet, stats, alpha=1):
    """Classify a tweet with Naive Bayes using additive (Laplace) smoothing.

    Each word contributes
    ``log((count + alpha) / (total_words + alpha * vocab_size))`` to its
    class score, on top of the class log prior.

    Args:
        tweet: Text to classify; tokenized on whitespace.
        stats: dict providing ``p_pos``, ``p_neg``, ``vocab_size``,
            ``word_counts_pos`` and ``word_counts_neg``. When
            ``total_words_pos``/``total_words_neg`` are present they are
            used as the class word totals (they must equal the sum of the
            corresponding counts); otherwise the totals are summed here.
        alpha: Smoothing constant.

    Returns:
        1 when the positive score is strictly greater than the negative
        score, otherwise 0.
    """
    neg_inf = float('-inf')

    def _safe_log(p):
        # log(0) would raise ValueError; a zero probability simply makes
        # the class impossible.
        return log(p) if p > 0 else neg_inf

    words_pos = stats['word_counts_pos']
    words_neg = stats['word_counts_neg']
    vocab_size = stats['vocab_size']

    # Reuse the precomputed totals when available: summing every count on
    # each call costs O(vocabulary) per tweet.
    total_words_pos = stats.get('total_words_pos')
    if total_words_pos is None:
        total_words_pos = sum(words_pos.values())
    total_words_neg = stats.get('total_words_neg')
    if total_words_neg is None:
        total_words_neg = sum(words_neg.values())

    # The denominators do not depend on the word, so compute them once.
    denom_pos = total_words_pos + alpha * vocab_size
    denom_neg = total_words_neg + alpha * vocab_size

    log_prob_pos = _safe_log(stats['p_pos'])
    log_prob_neg = _safe_log(stats['p_neg'])

    for word in tweet.split():
        # A zero denominator (alpha == 0 and a class with no words) means
        # the class cannot generate any word.
        p_w_pos = (words_pos.get(word, 0) + alpha) / denom_pos if denom_pos > 0 else 0.0
        p_w_neg = (words_neg.get(word, 0) + alpha) / denom_neg if denom_neg > 0 else 0.0
        log_prob_pos += _safe_log(p_w_pos)
        log_prob_neg += _safe_log(p_w_neg)

    return 1 if log_prob_pos > log_prob_neg else 0

def evaluar_modelo(test_df, stats, alpha=1):
    """Return the accuracy of the Laplace-smoothed classifier on ``test_df``.

    Every value of the ``text`` column is classified with ``predecir_tweet``
    using the given ``alpha``. The result is the fraction of rows whose
    prediction equals the ``label`` column.
    """
    def _classify(tweet):
        return predecir_tweet(tweet, stats, alpha=alpha)

    predicted = test_df['text'].apply(_classify)
    expected = test_df['label'].values
    hits = predicted == expected
    return np.mean(hits)

# Estrategia 1 con Laplace Smoothing: Aumentar el tamaño de entrenamiento
def estrategia_1_laplace(df, incrementos, alpha=1):
    """Strategy 1 with Laplace smoothing: grow the data used for training.

    For every ``n`` in ``incrementos`` the first ``n`` rows of ``df`` are
    split with ``dividir_train_test``, the statistics are rebuilt from the
    train part and the smoothed classifier is scored on the test part. The
    accuracy is printed; nothing is returned.
    """
    print("\n[Estrategia 1 - Parte A]: Aumentar tamaño de entrenamiento con Laplace Smoothing")
    for n_rows in incrementos:
        subset = df[:n_rows]
        train_part, test_part = dividir_train_test(subset)
        model = construir_diccionario(train_part)
        score = evaluar_modelo(test_part, model, alpha=alpha)
        print(f"Tamaño Train: {n_rows}, accuracy: {score:.4f}")

# Estrategia 2 con Laplace Smoothing: Fijar train, variar tamaño del diccionario
def estrategia_2_laplace(df, min_freqs, alpha=1):
    """Strategy 2 with Laplace smoothing: vary the dictionary, fixed split.

    ``df`` is split once with ``dividir_train_test``. For every threshold in
    ``min_freqs`` the statistics are rebuilt from the same training part and
    the smoothed classifier is scored on the same test part. The accuracy is
    printed; nothing is returned.
    """
    print("\n[Estrategia 2 - Parte A]: Variar tamaño del diccionario con Laplace Smoothing (train fijo)")
    train_part, test_part = dividir_train_test(df)
    for threshold in min_freqs:
        model = construir_diccionario(train_part, min_freq=threshold)
        score = evaluar_modelo(test_part, model, alpha=alpha)
        print(f"Frecuencia mínima: {threshold}, accuracy: {score:.4f}")

# Estrategia 3 con Laplace Smoothing: Fijar diccionario, variar tamaño del entrenamiento
def estrategia_3_laplace(df, train_sizes, alpha=1):
    """Strategy 3 with Laplace smoothing: fixed dictionary, varying train size.

    ``df`` is split once with ``dividir_train_test``. The vocabulary and its
    size come from the full training part. For every ``size`` in
    ``train_sizes`` the word counts are recomputed from the first ``size``
    training rows (restricted to that vocabulary) and the smoothed
    classifier is scored on the fixed test part. The accuracy is printed;
    nothing is returned.
    """
    print("\n[Estrategia 3 - Parte A]: Mantener diccionario fijo y variar tamaño de entrenamiento con Laplace Smoothing")

    # One fixed train/test split; the full train part defines the dictionary.
    train_full_df, test_df = dividir_train_test(df)
    stats_full = construir_diccionario(train_full_df)
    full_vocab = set(stats_full['word_counts_pos']) | set(stats_full['word_counts_neg'])
    vocab_size = stats_full['vocab_size']

    def _count_in_vocab(texts):
        # Frequencies of the in-vocabulary words across the given tweets.
        return Counter(w for text in texts for w in text.split() if w in full_vocab)

    for size in train_sizes:
        train_subset = train_full_df[:size]
        is_positive = train_subset['label'] == 1
        is_negative = train_subset['label'] == 0
        pos_texts = train_subset[is_positive]['text']
        neg_texts = train_subset[is_negative]['text']

        # vocab_size stays that of the full dictionary for every subset.
        stats_subset = {
            'word_counts_pos': _count_in_vocab(pos_texts),
            'word_counts_neg': _count_in_vocab(neg_texts),
            'p_pos': len(pos_texts) / len(train_subset),
            'p_neg': len(neg_texts) / len(train_subset),
            'vocab_size': vocab_size,
        }

        score = evaluar_modelo(test_df, stats_subset, alpha=alpha)
        print(f"Tamaño Train: {size}, accuracy: {score:.4f}")

# Main para la Parte A
if __name__ == "__main__":
    # Part A entry point: run the three strategies with Laplace smoothing.
    # Load the dataset (cargar_dataset, dividir_train_test and construir_diccionario must already be defined)
    ruta_csv = "./CRI_Practica3/FinalStemmedSentimentAnalysisDataset.csv"
    df = cargar_dataset(ruta_csv, nrows=100000)
    # Parameters
    alpha = 1
    # Strategy 1 with Laplace smoothing
    estrategia_1_laplace(df, incrementos = [20000, 40000, 60000, 80000], alpha=alpha)

    # Strategy 2 with Laplace smoothing
    estrategia_2_laplace(df,  min_freqs = [1, 3, 5], alpha=alpha)

    # Strategy 3 with Laplace smoothing
    estrategia_3_laplace(df, train_sizes = [20000, 40000, 60000, 80000], alpha=alpha)



[Estrategia 1 - Parte A]: Aumentar tamaño de entrenamiento con Laplace Smoothing
Tamaño Train: 20000, accuracy: 0.7522
Tamaño Train: 40000, accuracy: 0.7508
Tamaño Train: 60000, accuracy: 0.7614
Tamaño Train: 80000, accuracy: 0.7616

[Estrategia 2 - Parte A]: Variar tamaño del diccionario con Laplace Smoothing (train fijo)
Frecuencia mínima: 1, accuracy: 0.7597
Frecuencia mínima: 3, accuracy: 0.7514
Frecuencia mínima: 5, accuracy: 0.7462

[Estrategia 3 - Parte A]: Mantener diccionario fijo y variar tamaño de entrenamiento con Laplace Smoothing
Tamaño Train: 20000, accuracy: 0.7406
Tamaño Train: 40000, accuracy: 0.7533
Tamaño Train: 60000, accuracy: 0.7562
Tamaño Train: 80000, accuracy: 0.7597
