# Practice V Text classification
***
Rodriguez Nuñez Diego Eduardo

In [1]:
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score , ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from itertools import combinations
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, f1_score

In [2]:
nlp = spacy.load('es_core_news_sm')

In [3]:
data = pd.read_csv('raw_data_corpus.csv')
data

Unnamed: 0,Source,Title,Content,Section,URL,Date
0,La Jornada: Deportes,Récord de 96 mil asistentes para pelea Dubois ...,"Londres. Daniel Dubois, nueva estrella del box...",Deportes,https://www.jornada.com.mx/2024/09/21/deportes...,21/09/2024
1,La Jornada: Deportes,"América no está para formar jugadores, dice ...",Siendo el América uno de los clubes que menos ...,Deportes,https://www.jornada.com.mx/2024/09/21/deportes...,21/09/2024
2,La Jornada: Deportes,"Fátima Herrera, sin miedo a nada, hizo histor...",El surgimiento de referentes en el deporte de ...,Deportes,https://www.jornada.com.mx/2024/09/21/deportes...,21/09/2024
3,La Jornada: Deportes,Pumas femenil deja escapar los tres puntos en CU,"Pese a generar diversas oportunidades de gol, ...",Deportes,https://www.jornada.com.mx/2024/09/21/deportes...,21/09/2024
4,La Jornada: Deportes,Cae dupla Zverev-Alcaraz,La dupla Carlos Alcaraz-Alexander Zverev cayó ...,Deportes,https://www.jornada.com.mx/2024/09/21/deportes...,21/09/2024
...,...,...,...,...,...,...
393,Expansión - Tecnología,YouTube se corona como el rey del streaming en...,La plataforma se impuso como la más consumida ...,Tecnología,https://expansion.mx/tecnologia/2024/09/30/you...,30/09/2024
394,Expansión - Tecnología,Qué es y cómo funciona la tarifa dinámica de T...,Los hermanos Gallagher anunciaron que para la ...,Tecnología,https://expansion.mx/tecnologia/2024/09/30/que...,30/09/2024
395,Expansión - Tecnología,Trump advierte que irá contra Google por solo ...,El candidato apuntó que en caso de llegar a la...,Tecnología,https://expansion.mx/tecnologia/2024/09/30/tru...,30/09/2024
396,Expansión - Tecnología,"AT&T venderá participación en DirecTV por 7,60...",Esta operación permite a AT&amp;T continuar de...,Tecnología,https://expansion.mx/tecnologia/2024/09/30/att...,30/09/2024


In [4]:
data['Features'] = data['Title'] + ' ' + data['Content']
X = data['Features'].fillna('')
y = data['Section']

In [5]:
y = y.str.replace(r'\s+', ' ', regex=True)

Division del dataset

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [7]:
def NormalizarTexto(texto, aplicar):
    texto = texto.lower() if 'text_cleaning' in aplicar else texto

    doc = nlp(texto)
    tokens = []
    for token in doc:
        if not token.is_punct and not token.is_space:
            if 'stop_words' in aplicar and token.is_stop:
                continue
            tokens.append(token.lemma_ if 'lemmatization' in aplicar else token.text)
    return ' '.join(tokens) if 'tokenization' in aplicar else ' '.join([texto])

In [8]:
procesos = ['tokenization', 'stop_words', 'lemmatization', 'text_cleaning']
combos = [list(c) for i in range(1,len(procesos)+1) for c in combinations(procesos, i)]

resultados = {}

for combo in combos:
    print(f'Procesando combinacion: {combo}')
    X_train_norm = X_train.apply(lambda x: NormalizarTexto(x, aplicar=combo))
    X_test_norm = X_test.apply(lambda x: NormalizarTexto(x, aplicar=combo))
    resultados[str(combo)] = (X_train_norm[:5], X_test_norm[:5])

for combo, (tran_sample, test_sample) in resultados.items():
    print(f'Combinación: {combo}')
    print(f'Train: {tran_sample}')
    print(f'Test: {test_sample}')
    print("*"*50)

Procesando combinacion: ['tokenization']
Procesando combinacion: ['stop_words']
Procesando combinacion: ['lemmatization']
Procesando combinacion: ['text_cleaning']
Procesando combinacion: ['tokenization', 'stop_words']
Procesando combinacion: ['tokenization', 'lemmatization']
Procesando combinacion: ['tokenization', 'text_cleaning']
Procesando combinacion: ['stop_words', 'lemmatization']
Procesando combinacion: ['stop_words', 'text_cleaning']
Procesando combinacion: ['lemmatization', 'text_cleaning']
Procesando combinacion: ['tokenization', 'stop_words', 'lemmatization']
Procesando combinacion: ['tokenization', 'stop_words', 'text_cleaning']
Procesando combinacion: ['tokenization', 'lemmatization', 'text_cleaning']
Procesando combinacion: ['stop_words', 'lemmatization', 'text_cleaning']
Procesando combinacion: ['tokenization', 'stop_words', 'lemmatization', 'text_cleaning']
Combinación: ['tokenization']
Train: 3      Pumas femenil deja escapar los tres puntos en ...
18     Lun represen

In [9]:
representaciones = {}

for combo in combos:
    print(f'\nGenerando representación para combinación: {combo}')

    X_test_norm = X_test.apply(lambda x: NormalizarTexto(x, aplicar=combo))
    X_train_norm = X_train.apply(lambda x: NormalizarTexto(x, aplicar=combo))

    #Binarizada
    vectorizer_binary = CountVectorizer(binary=True)
    X_train_binary = vectorizer_binary.fit_transform(X_train_norm)
    X_test_binary = vectorizer_binary.transform(X_test_norm)
    representaciones[f'{combo} + Binarized'] = (X_train_binary, X_test_binary)
    print(f'Combinacion: {combo} + Binarized\nPreview:',X_train_binary[:5].toarray())

    #Frecuencia
    vectorizer_freq = CountVectorizer()
    X_train_freq = vectorizer_freq.fit_transform(X_train_norm)
    X_test_freq = vectorizer_freq.transform(X_test_norm)
    representaciones[f'{combo} + Frequency'] = (X_train_freq, X_test_freq)
    print(f'Combinacion: {combo} + Frequency\nPreview:',X_train_freq[:5].toarray())

    #TF-IDF
    vectorizer_tfidf = TfidfVectorizer()
    X_train_tfidf = vectorizer_tfidf.fit_transform(X_train_norm)
    X_test_tfidf = vectorizer_tfidf.transform(X_test_norm)
    representaciones[f'{combo} + TF-IDF'] = (X_train_tfidf, X_test_tfidf)
    print(f'Combinacion: {combo} + TF-IDF\nPreview:',X_train_tfidf[:5].toarray())

    #TF-IDF + SVD
    svd = TruncatedSVD(n_components=100, random_state=42)
    X_train_svd = svd.fit_transform(X_train_tfidf)
    X_test_svd = svd.transform(X_test_tfidf)
    representaciones[f'{combo} + TF-IDF + SVD'] = (X_train_svd, X_test_svd)
    print(f'Combinacion: {combo} + TF-IDF + SVD\nPreview:',X_train_svd[:3])


Generando representación para combinación: ['tokenization']
Combinacion: ['tokenization'] + Binarized
Preview: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Combinacion: ['tokenization'] + Frequency
Preview: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
Combinacion: ['tokenization'] + TF-IDF
Preview: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
Combinacion: ['tokenization'] + TF-IDF + SVD
Preview: [[ 0.21326042  0.20259188 -0.21368175  0.07867552  0.02859727 -0.00275321
  -0.1248336   0.05596382  0.04293896  0.13831039  0.12841583  0.12484692
   0.0677793  -0.08529728 -0.03492736 -0.00711285 -0.03153037  0.15474409
  -0.03299685 -0.06695752  0.05775113 -0.06450361 -0.02013199  0.06130594
  -0.04410444 -0.06527795  0.05558361 -0.12583295  0.02887727  0.00171073
  -0.02911358 -0.1331649   0.0520549  -0.018180

Entrenar y evaluar clasificadores

In [10]:
def TrainandEvaluate(classifier,X_train,X_test,y_train,y_test):
    classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    f1 = f1_score(y_test, y_pred, average='macro')
    return f1, y_pred

In [11]:
classifiers ={
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(200,100)),
    'Gradient Boosting': GradientBoostingClassifier()
}

In [12]:
results = {}

for clf_name, clf in classifiers.items():
    results[clf_name] = {}
    for com_rep_name, (X_train_rep, X_test_rep) in representaciones.items():
        if clf_name == "Naive Bayes" and "TF-IDF" in com_rep_name:
            print(f"Skipping {clf_name} with {com_rep_name}")
            continue
        print(f'\nEvaluando {clf_name} con combincion:{com_rep_name}')
        try:
            f1, y_pred = TrainandEvaluate(clf,X_train_rep,X_test_rep,y_train,y_test)
            results[clf_name][com_rep_name] = f1
            print(f'F1 Score para {clf_name} con {com_rep_name}: {f1:.3f}')
            print(f'\nReporte de Clasificación para {clf_name} con {com_rep_name}:')
            print(classification_report(y_test, y_pred, zero_division=0))
        except Exception as e:
            print(f'Error al evaluar {clf_name} con {com_rep_name}: {e}')


Evaluando Logistic Regression con combincion:['tokenization'] + Binarized
F1 Score para Logistic Regression con ['tokenization'] + Binarized: 0.596

Reporte de Clasificación para Logistic Regression con ['tokenization'] + Binarized:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       1.00      0.57      0.73         7
    Deportes       0.90      0.82      0.86        11
    Economía       0.71      0.81      0.75        36
  Tecnología       0.64      0.64      0.64        25

    accuracy                           0.72        80
   macro avg       0.65      0.57      0.60        80
weighted avg       0.73      0.72      0.72        80


Evaluando Logistic Regression con combincion:['tokenization'] + Frequency
F1 Score para Logistic Regression con ['tokenization'] + Frequency: 0.580

Reporte de Clasificación para Logistic Regression con ['tokenization'] + Frequency:
              precision    recall  f1-score



F1 Score para Neural Network con ['tokenization'] + TF-IDF + SVD: 0.630

Reporte de Clasificación para Neural Network con ['tokenization'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.71      0.71      0.71         7
    Deportes       0.83      0.91      0.87        11
    Economía       0.81      0.81      0.81        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.79        80
   macro avg       0.62      0.64      0.63        80
weighted avg       0.78      0.79      0.78        80


Evaluando Neural Network con combincion:['stop_words'] + Binarized
F1 Score para Neural Network con ['stop_words'] + Binarized: 0.533

Reporte de Clasificación para Neural Network con ['stop_words'] + Binarized:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       1.00      0.29 



F1 Score para Neural Network con ['stop_words'] + TF-IDF + SVD: 0.636

Reporte de Clasificación para Neural Network con ['stop_words'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.67      0.86      0.75         7
    Deportes       0.83      0.91      0.87        11
    Economía       0.82      0.78      0.80        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.79        80
   macro avg       0.62      0.66      0.64        80
weighted avg       0.78      0.79      0.78        80


Evaluando Neural Network con combincion:['lemmatization'] + Binarized
F1 Score para Neural Network con ['lemmatization'] + Binarized: 0.564

Reporte de Clasificación para Neural Network con ['lemmatization'] + Binarized:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       1.00      



F1 Score para Neural Network con ['tokenization', 'stop_words'] + TF-IDF + SVD: 0.596

Reporte de Clasificación para Neural Network con ['tokenization', 'stop_words'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       1.00      0.43      0.60         7
    Deportes       0.71      0.91      0.80        11
    Economía       0.78      0.81      0.79        36
  Tecnología       0.77      0.80      0.78        25

    accuracy                           0.78        80
   macro avg       0.65      0.59      0.60        80
weighted avg       0.78      0.78      0.77        80


Evaluando Neural Network con combincion:['tokenization', 'lemmatization'] + Binarized
F1 Score para Neural Network con ['tokenization', 'lemmatization'] + Binarized: 0.572

Reporte de Clasificación para Neural Network con ['tokenization', 'lemmatization'] + Binarized:
              precision    recall  f1-score   support

   



F1 Score para Neural Network con ['tokenization', 'text_cleaning'] + TF-IDF + SVD: 0.623

Reporte de Clasificación para Neural Network con ['tokenization', 'text_cleaning'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.71      0.71      0.71         7
    Deportes       0.83      0.91      0.87        11
    Economía       0.78      0.81      0.79        36
  Tecnología       0.75      0.72      0.73        25

    accuracy                           0.78        80
   macro avg       0.62      0.63      0.62        80
weighted avg       0.76      0.78      0.77        80


Evaluando Neural Network con combincion:['stop_words', 'lemmatization'] + Binarized
F1 Score para Neural Network con ['stop_words', 'lemmatization'] + Binarized: 0.520

Reporte de Clasificación para Neural Network con ['stop_words', 'lemmatization'] + Binarized:
              precision    recall  f1-score   support

   



F1 Score para Neural Network con ['stop_words', 'lemmatization'] + TF-IDF + SVD: 0.644

Reporte de Clasificación para Neural Network con ['stop_words', 'lemmatization'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.83      0.71      0.77         7
    Deportes       0.83      0.91      0.87        11
    Economía       0.81      0.83      0.82        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.80        80
   macro avg       0.65      0.64      0.64        80
weighted avg       0.79      0.80      0.79        80


Evaluando Neural Network con combincion:['stop_words', 'text_cleaning'] + Binarized
F1 Score para Neural Network con ['stop_words', 'text_cleaning'] + Binarized: 0.573

Reporte de Clasificación para Neural Network con ['stop_words', 'text_cleaning'] + Binarized:
              precision    recall  f1-score   support

    Cie



F1 Score para Neural Network con ['stop_words', 'text_cleaning'] + TF-IDF + SVD: 0.636

Reporte de Clasificación para Neural Network con ['stop_words', 'text_cleaning'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.67      0.86      0.75         7
    Deportes       0.83      0.91      0.87        11
    Economía       0.82      0.78      0.80        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.79        80
   macro avg       0.62      0.66      0.64        80
weighted avg       0.78      0.79      0.78        80


Evaluando Neural Network con combincion:['lemmatization', 'text_cleaning'] + Binarized
F1 Score para Neural Network con ['lemmatization', 'text_cleaning'] + Binarized: 0.543

Reporte de Clasificación para Neural Network con ['lemmatization', 'text_cleaning'] + Binarized:
              precision    recall  f1-score   support



F1 Score para Neural Network con ['tokenization', 'stop_words', 'lemmatization'] + TF-IDF + SVD: 0.564

Reporte de Clasificación para Neural Network con ['tokenization', 'stop_words', 'lemmatization'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.75      0.43      0.55         7
    Deportes       0.59      0.91      0.71        11
    Economía       0.82      0.78      0.80        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.75        80
   macro avg       0.58      0.58      0.56        80
weighted avg       0.75      0.75      0.74        80


Evaluando Neural Network con combincion:['tokenization', 'stop_words', 'text_cleaning'] + Binarized
F1 Score para Neural Network con ['tokenization', 'stop_words', 'text_cleaning'] + Binarized: 0.548

Reporte de Clasificación para Neural Network con ['tokenization', 'stop_words', 'text_cleani



F1 Score para Neural Network con ['tokenization', 'stop_words', 'text_cleaning'] + TF-IDF + SVD: 0.612

Reporte de Clasificación para Neural Network con ['tokenization', 'stop_words', 'text_cleaning'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       1.00      0.57      0.73         7
    Deportes       0.69      1.00      0.81        11
    Economía       0.79      0.75      0.77        36
  Tecnología       0.73      0.76      0.75        25

    accuracy                           0.76        80
   macro avg       0.64      0.62      0.61        80
weighted avg       0.77      0.76      0.76        80


Evaluando Neural Network con combincion:['tokenization', 'lemmatization', 'text_cleaning'] + Binarized
F1 Score para Neural Network con ['tokenization', 'lemmatization', 'text_cleaning'] + Binarized: 0.539

Reporte de Clasificación para Neural Network con ['tokenization', 'lemmatization', 'te



F1 Score para Neural Network con ['stop_words', 'lemmatization', 'text_cleaning'] + TF-IDF + SVD: 0.619

Reporte de Clasificación para Neural Network con ['stop_words', 'lemmatization', 'text_cleaning'] + TF-IDF + SVD:
              precision    recall  f1-score   support

    Ciencias       0.00      0.00      0.00         1
     Cultura       0.71      0.71      0.71         7
    Deportes       0.77      0.91      0.83        11
    Economía       0.80      0.78      0.79        36
  Tecnología       0.76      0.76      0.76        25

    accuracy                           0.78        80
   macro avg       0.61      0.63      0.62        80
weighted avg       0.77      0.78      0.77        80


Evaluando Neural Network con combincion:['tokenization', 'stop_words', 'lemmatization', 'text_cleaning'] + Binarized
F1 Score para Neural Network con ['tokenization', 'stop_words', 'lemmatization', 'text_cleaning'] + Binarized: 0.625

Reporte de Clasificación para Neural Network con ['token

Los 3 mejores resultados por clasificador

In [13]:
for clf_name, clf_scores in results.items():
    print(f'\nClasificador: {clf_name}')
    top_combos = sorted(clf_scores.items(), key=lambda x: x[1], reverse=True)[:3]

    for rank, (combo_name, f1) in enumerate(top_combos, start=1):
        print(f'{rank+1}. Combinación: {combo_name}, F1 Score: {f1:.3f}')

        X_train_rep, X_test_rep = representaciones[combo_name]
        clf = classifiers[clf_name]
        clf.fit(X_train_rep, y_train)
        y_pred = clf.predict(X_test_rep)


Clasificador: Logistic Regression
2. Combinación: ['tokenization', 'lemmatization', 'text_cleaning'] + Frequency, F1 Score: 0.628
3. Combinación: ['tokenization', 'lemmatization', 'text_cleaning'] + Binarized, F1 Score: 0.611
4. Combinación: ['tokenization', 'lemmatization'] + Binarized, F1 Score: 0.610

Clasificador: Naive Bayes
2. Combinación: ['tokenization', 'stop_words', 'lemmatization', 'text_cleaning'] + Frequency, F1 Score: 0.621
3. Combinación: ['tokenization', 'stop_words', 'lemmatization'] + Frequency, F1 Score: 0.614
4. Combinación: ['tokenization', 'stop_words'] + Binarized, F1 Score: 0.605

Clasificador: Decision Tree
2. Combinación: ['tokenization', 'stop_words', 'lemmatization'] + TF-IDF, F1 Score: 0.503
3. Combinación: ['stop_words', 'lemmatization', 'text_cleaning'] + TF-IDF + SVD, F1 Score: 0.501
4. Combinación: ['tokenization', 'lemmatization'] + TF-IDF, F1 Score: 0.490

Clasificador: SVM
2. Combinación: ['tokenization', 'stop_words'] + TF-IDF + SVD, F1 Score: 0.53



4. Combinación: ['text_cleaning'] + TF-IDF + SVD, F1 Score: 0.636





Clasificador: Gradient Boosting
2. Combinación: ['tokenization', 'stop_words', 'lemmatization', 'text_cleaning'] + TF-IDF + SVD, F1 Score: 0.563
3. Combinación: ['tokenization', 'text_cleaning'] + Frequency, F1 Score: 0.555
4. Combinación: ['lemmatization', 'text_cleaning'] + Frequency, F1 Score: 0.555
