# 1.3 CountVectorizer

Inicialmente se va a realizar un preprocesado de los datos, eliminando las palabras sin significado útil (*stop words*), las URL y los signos de puntuación.

In [1]:
# Core libraries for tabular data handling and numerical work.
import pandas as pd
import numpy as np

# Load the training set into a DataFrame; later cells read its 'text' and
# 'label' columns.
df = pd.read_csv('./data/train.csv')

In [2]:
import spacy
# Load the `en_core_web_lg` spaCy pipeline; `nlp` is used below to tokenize
# the texts.
# NOTE(review): the visible cleaning function only reads token flags
# (is_stop, is_punct, like_url) — check whether every pipeline component is
# needed, and confirm nothing else relies on them before disabling any.
nlp = spacy.load('en_core_web_lg')

In [4]:
# Stop-word collection taken from the loaded pipeline's language defaults.
# NOTE(review): not referenced by the visible cells — remove_stop_words relies
# on token.is_stop instead; confirm it is unused before deleting.
en_stopwords = nlp.Defaults.stop_words

def remove_stop_words(text):
    """Return ``text`` without stop words, punctuation and URL-like tokens.

    The text is tokenized with the module-level spaCy pipeline ``nlp``; the
    tokens that survive the filter are joined with single spaces.

    Args:
        text: Raw text to clean. Missing values (``None``/``NaN``) are
            accepted.

    Returns:
        str: The kept tokens separated by one space, with no leading or
        trailing space; ``''`` when the input is missing or nothing is kept.
    """
    # Missing entries (e.g. NaN from empty CSV fields) cannot be tokenized.
    if pd.isna(text):
        return ''

    kept_tokens = [
        token.text
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.like_url)
    ]
    # Joining once avoids the leading space and the repeated string copies
    # that incremental concatenation produced.
    return ' '.join(kept_tokens)

# Clean every text and store the result in a new column; the original 'text'
# column is left untouched.
df['text_cleaned'] = df['text'].apply(remove_stop_words)

In [5]:
# Preview the cleaned texts (pandas truncates the display of a long Series).
df['text_cleaned']

0        Hi Roy hope ok Trans people gay thing s ramme...
1                                     fuckin hell biology
2                                  nice looking clergyman
3                           AIDS WAY SIN CONSEQUENCES BAD
4                                                   learn
                              ...                        
8143     Yeah alive time election happen fairly soon U...
8144                                  fundamentally wrong
8145     confused homosexuality big deal proud normal ...
8146                                           disgusting
8147     Peter Sørensen note Peter poor maths 13 27 eq...
Name: text_cleaned, Length: 8148, dtype: object

Tras el preprocesado de datos se va a usar CountVectorizer para transformar los tokens y Random Forest para realizar las predicciones.

In [6]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts; the target is the 'label' column.
X, y = df['text_cleaned'], df['label']

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

Se usa Grid Search para buscar los parámetros óptimos en la vectorización y en el modelo.

In [7]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Pipeline: bag-of-words counts feeding a random forest classifier.
# The forest is seeded (same seed as the train/test split) so that the
# cross-validation scores and the selected parameters are reproducible.
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('clf', RandomForestClassifier(random_state=42))
])

# Search grid; keys follow the `<step name>__<parameter>` convention.
parameters = {
    'count_vectorizer__ngram_range': [(1, 1), (1, 2)],
    'count_vectorizer__max_features': [1000, 5000, None],
    'clf__n_estimators': [50, 100, 200],
    'clf__max_depth': [None, 10, 20],
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, using all available CPU cores.
grid_search = GridSearchCV(pipeline, parameters, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_


In [8]:
# GridSearchCV (refit=True by default) has already refitted the pipeline with
# the best parameters on the whole training split, so reuse that estimator
# instead of building and training an equivalent pipeline a second time.
best_pipeline = grid_search.best_estimator_

# Predict the held-out split for the evaluation cells below.
predictions = best_pipeline.predict(X_test)

In [9]:
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix on the held-out split: rows are the true labels and
# columns the predicted labels.
print(confusion_matrix(y_test, predictions))

[[1445  255]
 [ 461  528]]


In [10]:
# Per-class precision, recall and F1-score on the held-out split.
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.76      0.85      0.80      1700
           1       0.67      0.53      0.60       989

    accuracy                           0.73      2689
   macro avg       0.72      0.69      0.70      2689
weighted avg       0.73      0.73      0.73      2689



In [11]:
from sklearn import metrics
# Overall accuracy of the predictions on the held-out split.
metrics.accuracy_score(y_test, predictions)

0.7337300111565638

Finalmente, se consigue una exactitud (*accuracy*) muy baja, de aproximadamente 0,73, muy parecida a la del sistema que utiliza TF-IDF y LinearSVC.