# 1. Procesado de datos

In [None]:
import pandas as pd
import numpy as np

# Load the training data; later cells read its 'text' and 'label' columns.
df = pd.read_csv('./data/train.csv')

In [None]:
import spacy

# Load the 'en_core_web_lg' spaCy pipeline; it is used below to tokenize the
# text and to flag stop words, punctuation and URL-like tokens.
nlp = spacy.load('en_core_web_lg')

In [None]:
# Stop-word set of the loaded pipeline. remove_stop_words does not read it (it
# relies on token.is_stop); kept because other cells may reference it.
en_stopwords = nlp.Defaults.stop_words


def remove_stop_words(text):
    """Return ``text`` without stop words, punctuation and URL-like tokens.

    Args:
        text: Raw document text. Non-string values (for example the NaN that
            pandas uses for a missing cell) are treated as empty text.

    Returns:
        The kept tokens, each preceded by a single space (so a non-empty
        result starts with a space), or ``''`` when no token is kept.
    """
    if not isinstance(text, str):
        return ''

    # is_stop, is_punct and like_url are lexical attributes, so tokenizing
    # with make_doc is sufficient; calling nlp(text) would also run every
    # statistical pipeline component on each row for no benefit here.
    doc = nlp.make_doc(text)
    return ''.join(
        ' ' + token.text
        for token in doc
        if not (token.is_stop or token.is_punct or token.like_url)
    )


df['text_cleaned'] = df['text'].apply(remove_stop_words)

In [None]:
# Display the cleaned text column to inspect the result of the preprocessing.
df['text_cleaned']

Separamos el conjunto en entrenamiento y test.

In [None]:
# Split the cleaned texts and their labels into training and test sets.
from sklearn.model_selection import train_test_split

X, y = df['text_cleaned'], df['label']

# Hold out 33% of the rows for testing; the fixed seed makes the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Pipeline: TF-IDF features followed by a linear SVM classifier.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC())
])

# Search space for the TfidfVectorizer step.
parameters_tfidf = {
    'tfidf__max_df': (0.25, 0.5, 0.75),
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'tfidf__stop_words': (None, 'english'),
    'tfidf__token_pattern': (r"\b\w\w+\b", r"\b[a-zA-Z]{3,}\b"),
    'tfidf__norm': ('l1', 'l2', None),
    'tfidf__use_idf': (True, False),
    'tfidf__sublinear_tf': (True, False)
}

# Search space for the LinearSVC step.
#
# Settings shared by every sub-grid. 'verbose' is not searched because it only
# controls logging; 'random_state' is pinned to one value so the search is
# reproducible and the seed still shows up in best_params_.
_svc_shared = {
    'clf__tol': (1e-4, 1e-3),
    'clf__C': (0.1, 1, 10),
    'clf__fit_intercept': (True, False),
    'clf__intercept_scaling': (1.0, 2.0),
    'clf__class_weight': (None, 'balanced'),
    'clf__random_state': (42,),
    'clf__max_iter': (1000, 2000),
}

# LinearSVC only supports some penalty/loss/dual combinations, so a plain
# cross product would spend most of its fits on configurations that raise.
# Each sub-grid below lists one supported family.
parameters_svc = [
    # L2 penalty with squared hinge loss: primal or dual formulation.
    {
        **_svc_shared,
        'clf__multi_class': ('ovr',),
        'clf__penalty': ('l2',),
        'clf__loss': ('squared_hinge',),
        'clf__dual': (True, False),
    },
    # L2 penalty with hinge loss: dual formulation only.
    {
        **_svc_shared,
        'clf__multi_class': ('ovr',),
        'clf__penalty': ('l2',),
        'clf__loss': ('hinge',),
        'clf__dual': (True,),
    },
    # L1 penalty: squared hinge loss and primal formulation only.
    {
        **_svc_shared,
        'clf__multi_class': ('ovr',),
        'clf__penalty': ('l1',),
        'clf__loss': ('squared_hinge',),
        'clf__dual': (False,),
    },
    # Crammer-Singer ignores penalty, loss and dual, so they are not varied.
    {
        **_svc_shared,
        'clf__multi_class': ('crammer_singer',),
    },
]

# Grid search over the TfidfVectorizer parameters.
grid_search_tfidf = GridSearchCV(text_clf, parameters_tfidf, cv=5, n_jobs=-1)
grid_search_tfidf.fit(X_train, y_train)

# Show the best parameters found for TfidfVectorizer.
print("Mejores parámetros para TfidfVectorizer:")
print(grid_search_tfidf.best_params_)

# Grid search over the LinearSVC parameters. It starts from the pipeline that
# already carries the best TF-IDF settings (GridSearchCV clones it), so the
# classifier is tuned on the features it will actually be combined with
# rather than on default TF-IDF features.
grid_search_svc = GridSearchCV(
    grid_search_tfidf.best_estimator_, parameters_svc, cv=5, n_jobs=-1
)
grid_search_svc.fit(X_train, y_train)

# Show the best parameters found for LinearSVC.
print("Mejores parámetros para LinearSVC:")
print(grid_search_svc.best_params_)



In [None]:
# Best hyper-parameters found by the two grid searches. Their keys carry the
# pipeline step prefix (for example 'tfidf__max_df' or 'clf__C').
best_params_tfidf = grid_search_tfidf.best_params_
best_params_svc = grid_search_svc.best_params_

# Build a fresh pipeline and apply the winning settings through
# Pipeline.set_params, which routes every '<step>__<param>' key to its step.
# Unpacking the prefixed dicts directly into TfidfVectorizer(...) or
# LinearSVC(...) raises TypeError, because neither constructor accepts
# arguments named 'tfidf__...' or 'clf__...'.
best_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
best_pipeline.set_params(**{**best_params_tfidf, **best_params_svc})

# Train on the training split and predict the held-out test split.
best_pipeline.fit(X_train, y_train)

predictions = best_pipeline.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Confusion matrix of the test-set predictions against the true labels.
print(confusion_matrix(y_test, predictions))

In [None]:
# Text report of the classification metrics for the test-set predictions.
print(classification_report(y_test, predictions))

In [None]:
from sklearn import metrics
# Accuracy of the test-set predictions; shown as the cell's output value.
metrics.accuracy_score(y_test, predictions)

In [None]:
# Classify a sample sentence. text_clf itself is never fitted (GridSearchCV
# fits clones of it), so calling predict on it raises NotFittedError; use the
# fitted best_pipeline instead. The sample goes through remove_stop_words so
# it is preprocessed the same way as the text the model was trained on.
best_pipeline.predict([remove_stop_words("Kill them where they lie ...kids as well")])