# <center> Trabajo Práctico 2 </center>
### <center> Grupo 10 </center>
## Integrantes:
#### Alan Richmond
#### Flavian Ferré


# Importaciones

In [3]:
!pip install stop_words



## Librerías

In [5]:
import datetime
import joblib
import numpy as np
import pandas as pd
import spacy
import stop_words

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfVectorizer
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, GridSearchCV, RandomizedSearchCV, train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

## Conjuntos

In [6]:
# Read the three CSV files of the Datasets folder in one pass:
# train.csv, test.csv and sample_solution.csv.
conjunto_train, conjunto_test, sample_solution = (
    pd.read_csv(f'Datasets/{nombre}.csv')
    for nombre in ('train', 'test', 'sample_solution')
)

In [7]:
# Features: the Spanish review text. Target: sentiment encoded as 1 (positivo) / 0 (negativo).
label_to_int = {'positivo': 1, 'negativo': 0}
X = conjunto_train['review_es']
y = conjunto_train['sentimiento'].map(label_to_int)

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Spanish stop-word list passed to every vectorizer below.
stop_words_es = stop_words.get_stop_words('es')

In [9]:
# Persist each split under Datasets/<name>.csv (the index is written as the first column).
splits_to_export = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}
for nombre, particion in splits_to_export.items():
    particion.to_csv(f'Datasets/{nombre}.csv')

In [8]:
# Small Spanish spaCy pipeline used for lemmatization.
nlp = spacy.load('es_core_news_sm')

def preprocess_text(text):
    """Return the lemmas of the alphabetic tokens of `text`, joined by single spaces."""
    lemmas = []
    for token in nlp(text):
        if token.is_alpha:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Lemmatize both splits and cache the results on disk.
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)

X_train_processed.to_csv('Datasets/X_train_processed.csv')
X_test_processed.to_csv('Datasets/X_test_processed.csv')

# 1. Bayes Naïve

## Vectorización de las críticas

Vamos a probar distintas vectorizaciones para elegir la mejor:
- Vect_1 : TFIDF Vectorizer sin lematización y con hiperparámetros afinados
- Vect_2 : TFIDF Vectorizer con lematización y con hiperparámetros afinados
- Vect_3 : Count Vectorizer sin lematización y con hiperparámetros afinados
- Vect_4 : Count Vectorizer con lematización y con hiperparámetros afinados

Quizás utilicemos modelos de NLP (HuggingFace) en las próximas semanas.

In [24]:
# Reload the cached lemmatized reviews written by the preprocessing cell.
# keep_default_na=False keeps an empty lemmatized review as '' instead of NaN
# (and stops texts such as "NA" or "null" from being parsed as missing), so
# the vectorizers always receive strings.
X_train_processed = pd.read_csv('Datasets/X_train_processed.csv', index_col=0, keep_default_na=False)['review_es']
X_test_processed = pd.read_csv('Datasets/X_test_processed.csv', index_col=0, keep_default_na=False)['review_es']

39087    ese ser él que seguí preguntándomar durante el...
30893    no ver a todo el película no poder ver el pelí...
45278    uno historia de amor conmovedora que recordar ...
16398    este último día Fulci Schlocker ser uno mezcla...
13653    en primero lugar creer firmemente que el pelíc...
                               ...                        
11284    Sombra Magic recapturar el alegría y el asombr...
44732    Encontré que este película ser bastante agrada...
38158    Evita uno película terrible entonces y si ser ...
860      este producción ser bastante sorpresa para yo ...
15795    este ser uno película decente aunque poco a ti...
Name: review_es, Length: 40000, dtype: object

In [29]:
# Collects one [score, config, lemmatized?, vectorizer name] entry per experiment.
best_configs = []

# Vectorizer settings to compare: every min_df threshold combined with
# n-gram ranges (1, 2), (1, 3) and (1, 4), ordered by min_df first.
configs = [
    {'min_df': min_df, 'ngram_range': (1, max_n)}
    for min_df in (1, 2, 3, 4, 5, 8, 10, 15)
    for max_n in (2, 3, 4)
]

# Model loaded from an earlier export; it is the estimator passed to
# cross_val_score while comparing vectorizers.
nb_old_model = joblib.load('Modelos/bn_model_2024-05-29.joblib')

def test_config(config, X_train, vectorizer):
    """Score one vectorizer configuration with 10-fold cross-validation.

    Parameters
    ----------
    config : dict
        Keyword arguments forwarded to `vectorizer` (besides the stop words).
    X_train : iterable of str
        Documents to vectorize.
    vectorizer : callable
        Vectorizer class, e.g. TfidfVectorizer or CountVectorizer.

    Returns
    -------
    float
        Mean macro F1 over the folds, rounded to 6 decimals.

    Notes
    -----
    Reads the globals `stop_words_es`, `nb_old_model` and `y_train`. The
    vectorizer is fitted on all of `X_train` before the folds are made.
    """
    fitted_vectorizer = vectorizer(stop_words=stop_words_es, **config)
    features = fitted_vectorizer.fit_transform(X_train)
    fold_scores = cross_val_score(nb_old_model, features, y_train, cv=10, scoring='f1_macro')
    return round(fold_scores.mean(), 6)

### Vect_1

In [30]:
# Vect_1: TF-IDF on the raw (non-lemmatized) reviews.
# The first configuration reaching the highest score is kept.
best_score_1, best_config_1 = 0, None

for candidate in configs:
    candidate_score = test_config(candidate, X_train, TfidfVectorizer)
    if not candidate_score > best_score_1:
        continue
    best_score_1, best_config_1 = candidate_score, candidate

best_configs.append([best_score_1, best_config_1, False, "TFIDF"])

print("Mejor config Vect_1 :", best_config_1)
print("Mejor F1-Score Vect_1 :", best_score_1)

Mejor config Vect_1 : {'min_df': 2, 'ngram_range': (1, 4)}
Mejor F1-Score Vect_1 : 0.886291


### Vect_2

In [31]:
# Vect_2: TF-IDF on the lemmatized reviews.
# The first configuration reaching the highest score is kept.
best_score_2, best_config_2 = 0, None

for candidate in configs:
    candidate_score = test_config(candidate, X_train_processed, TfidfVectorizer)
    if not candidate_score > best_score_2:
        continue
    best_score_2, best_config_2 = candidate_score, candidate

best_configs.append([best_score_2, best_config_2, True, "TFIDF"])

print("Mejor config Vect_2 :", best_config_2)
print("Mejor F1-Score Vect_2 :", best_score_2)

Mejor config Vect_2 : {'min_df': 2, 'ngram_range': (1, 4)}
Mejor F1-Score Vect_2 : 0.882277


### Vect_3

In [32]:
# Vect_3: raw term counts on the non-lemmatized reviews.
# The first configuration reaching the highest score is kept.
best_score_3, best_config_3 = 0, None

for candidate in configs:
    candidate_score = test_config(candidate, X_train, CountVectorizer)
    if not candidate_score > best_score_3:
        continue
    best_score_3, best_config_3 = candidate_score, candidate

best_configs.append([best_score_3, best_config_3, False, "Count"])

print("Mejor config Vect_3 :", best_config_3)
print("Mejor F1-Score Vect_3 :", best_score_3)

Mejor config Vect_3 : {'min_df': 2, 'ngram_range': (1, 4)}
Mejor F1-Score Vect_3 : 0.878847


### Vect_4

In [33]:
# Vect_4: raw term counts on the lemmatized reviews.
# The first configuration reaching the highest score is kept.
best_score_4, best_config_4 = 0, None

for candidate in configs:
    candidate_score = test_config(candidate, X_train_processed, CountVectorizer)
    if not candidate_score > best_score_4:
        continue
    best_score_4, best_config_4 = candidate_score, candidate

best_configs.append([best_score_4, best_config_4, True, "Count"])

print("Mejor config Vect_4 :", best_config_4)
print("Mejor F1-Score Vect_4 :", best_score_4)

Mejor config Vect_4 : {'min_df': 2, 'ngram_range': (1, 4)}
Mejor F1-Score Vect_4 : 0.875668


### Mejor Vect

In [43]:
# One row per vectorization experiment, in the order they were appended.
column_names = ["Score", "Config", "Lemmatization", "Type of vectorization"]
best_configs_df = pd.DataFrame(best_configs, columns=column_names)
best_configs_df

Unnamed: 0,Score,Config,Lemmatization,Type of vectorization
0,0.886291,"{'min_df': 2, 'ngram_range': (1, 4)}",False,TFIDF
1,0.882277,"{'min_df': 2, 'ngram_range': (1, 4)}",True,TFIDF
2,0.878847,"{'min_df': 2, 'ngram_range': (1, 4)}",False,Count
3,0.875668,"{'min_df': 2, 'ngram_range': (1, 4)}",True,Count


In [48]:
# Row label of the experiment with the highest score.
best_index = best_configs_df['Score'].idxmax()

# Pull each field of the winning row as a scalar.
best_score = best_configs_df.at[best_index, 'Score']
best_config = best_configs_df.at[best_index, 'Config']
bool_lemma = best_configs_df.at[best_index, 'Lemmatization']
type_vect = best_configs_df.at[best_index, 'Type of vectorization']

print(best_score, best_config, bool_lemma, type_vect)

0.886291 {'min_df': 2, 'ngram_range': (1, 4)} False TFIDF


In [49]:
# Use the lemmatized splits when the winning experiment was run on lemmas.
# NOTE(review): the competition test set vectorized later with `vect` is not
# lemmatized; it would need the same preprocessing if bool_lemma is True.
if bool_lemma :
    X_train = X_train_processed
    X_test = X_test_processed

# Build the vectorizer of the winning experiment.
if type_vect == "TFIDF":
    vect = TfidfVectorizer(stop_words=stop_words_es, **best_config)
elif type_vect == "Count":
    vect = CountVectorizer(stop_words=stop_words_es, **best_config)
elif type_vect == "Hashing":
    # HashingVectorizer has no min_df parameter, so that key is dropped.
    # alternate_sign=False keeps the features non-negative, as MultinomialNB requires.
    hashing_config = {key: value for key, value in best_config.items() if key != 'min_df'}
    vect = HashingVectorizer(stop_words=stop_words_es, alternate_sign=False, **hashing_config)
else:
    # Fail loudly instead of leaving `vect` undefined or stale from an earlier run.
    raise ValueError(f"Unknown vectorization type: {type_vect!r}")

X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)

## Búsqueda de los hiperparámetros

In [50]:
# Candidate smoothing values for MultinomialNB.
param_distributions = {
    'alpha': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.5, 2.0, 3.0, 5.0]
}

nb_model = MultinomialNB()
f1_scorer = make_scorer(f1_score)

# The search space is a finite list, so n_iter is capped at its size: asking
# for more candidates than exist only triggers a scikit-learn warning.
# random_state makes the sampling order reproducible.
random_search = RandomizedSearchCV(
    estimator=nb_model,
    param_distributions=param_distributions,
    cv=5,
    n_iter=len(param_distributions['alpha']),
    scoring=f1_scorer,
    random_state=42,
)

random_search.fit(X_train_vect, y_train)

best_params = random_search.best_params_
best_score = random_search.best_score_

print(f'Mejores parametros: {best_params}')
print(f'Mejor F1 Score: {best_score}')

# Refine around the best alpha with a +/- 0.05 neighbourhood.
param_grid = {
    'alpha': [best_params['alpha'] - 0.05, best_params['alpha'], best_params['alpha'] + 0.05]
}

grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, cv=5, scoring=f1_scorer)
grid_search.fit(X_train_vect, y_train)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f'Mejores parametros (afinados): {best_params}')
print(f'Mejor F1 Score (afinados): {best_score}')



Mejores parametros: {'alpha': 0.2}
Mejor F1 Score: 0.8872150655084512
Mejores parametros (afinados): {'alpha': 0.2}
Mejor F1 Score (afinados): 0.8872150655084512


In [51]:
# Take the estimator selected by the refined grid search, fit it on the
# training matrix and measure F1 on the held-out split.
best_nb_model = grid_search.best_estimator_.fit(X_train_vect, y_train)

y_pred = best_nb_model.predict(X_test_vect)
f1 = f1_score(y_test, y_pred)

print(f'F1 Score: {f1}')

F1 Score: 0.8897783852386997


## Conjunto test

In [52]:
# Index the competition test set by ID and vectorize its reviews with the
# vectorizer fitted for Naive Bayes.
conjunto_test = conjunto_test.set_index(conjunto_test['ID'])
X_conjunto_test = vect.transform(conjunto_test['review_es'])

# Predict 0/1 classes, then translate them back to the text labels.
pred_test = best_nb_model.predict(X_conjunto_test)
pred_test_labels = []
for pred in pred_test:
    pred_test_labels.append('positivo' if pred == 1 else 'negativo')

final_pred_df = pd.DataFrame(
    {'ID': conjunto_test.index, 'sentimiento': pred_test_labels}
)
final_pred_df

Unnamed: 0,ID,sentimiento
0,60000,negativo
1,60001,negativo
2,60002,negativo
3,60003,positivo
4,60004,negativo
...,...,...
8594,68594,positivo
8595,68595,negativo
8596,68596,negativo
8597,68597,negativo


## Exportaciones

In [53]:
# Date stamp (YYYY-MM-DD) used in the exported file names.
current_date = datetime.datetime.now().strftime('%Y-%m-%d')

# An underscore separates the model name from the date, as in the other
# prediction exports of this notebook (RandomForest_<date>, XGBoost_<date>).
final_pred_df.to_csv(f"Predicciones/Bayes_Naïve_{current_date}.csv", index=False)

In [54]:
# Serialize the tuned Naive Bayes model under a date-stamped name.
nb_model_path = f'Modelos/bn_model_{current_date}.joblib'
joblib.dump(best_nb_model, nb_model_path)

['Modelos/bn_model_2024-05-31.joblib']

# 2. Random Forest

## Vectorización de las críticas

In [12]:
# Rebuild the 80/20 split from the raw reviews (seed 42), with the sentiment
# encoded as 1 (positivo) / 0 (negativo).
label_to_int = {'positivo': 1, 'negativo': 0}
X = conjunto_train['review_es']
y = conjunto_train['sentimiento'].map(label_to_int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF with default settings and Spanish stop words; the vocabulary is
# learned on the training split only.
stop_words_es = stop_words.get_stop_words('es')
vectorizer = TfidfVectorizer(stop_words=stop_words_es)

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

## Búsqueda de los hiperparámetros

In [13]:
# Fixed forest settings: 100 trees, depth capped at 10, seeded.
rf_params = {'n_estimators': 100, 'max_depth': 10, 'random_state': 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train_vect, y_train)

# F1 on the held-out split.
y_pred = rf_model.predict(X_test_vect)
f1 = f1_score(y_test, y_pred)

print(f'F1 Score: {f1}')

F1 Score: 0.8135886825137137


## Conjunto test

In [14]:
# Index the competition test set by ID and vectorize its reviews with the
# vectorizer fitted for the Random Forest.
conjunto_test = conjunto_test.set_index(conjunto_test['ID'])
X_conjunto_test = vectorizer.transform(conjunto_test['review_es'])

# Predict 0/1 classes, then translate them back to the text labels.
pred_test = rf_model.predict(X_conjunto_test)
pred_test_labels = []
for pred in pred_test:
    pred_test_labels.append('positivo' if pred == 1 else 'negativo')

final_pred_df = pd.DataFrame(
    {'ID': conjunto_test.index, 'sentimiento': pred_test_labels}
)
final_pred_df

Unnamed: 0,ID,sentimiento
0,60000,positivo
1,60001,negativo
2,60002,positivo
3,60003,negativo
4,60004,positivo
...,...,...
8594,68594,positivo
8595,68595,positivo
8596,68596,positivo
8597,68597,negativo


## Exportaciones

In [15]:
# Date stamp (YYYY-MM-DD) used in the exported file names.
ahora = datetime.datetime.now()
current_date = ahora.strftime('%Y-%m-%d')

# Write the Random Forest predictions without the DataFrame index.
final_pred_df.to_csv(
    f"Predicciones/RandomForest_{current_date}.csv",
    index=False,
)

In [17]:
# Serialize the fitted Random Forest under a date-stamped name.
rf_model_path = f'Modelos/rf_model_{current_date}.joblib'
joblib.dump(rf_model, rf_model_path)

['Modelos/rf_model_2024-05-28.joblib']

# 3. XGBoost

## Vectorización de las críticas : sin lematización

In [18]:
# Rebuild the 80/20 split from the raw reviews (seed 42), with the sentiment
# encoded as 1 (positivo) / 0 (negativo).
label_to_int = {'positivo': 1, 'negativo': 0}
X = conjunto_train['review_es']
y = conjunto_train['sentimiento'].map(label_to_int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF with default settings and Spanish stop words; the vocabulary is
# learned on the training split only.
stop_words_es = stop_words.get_stop_words('es')
vectorizer = TfidfVectorizer(stop_words=stop_words_es)

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

## Búsqueda de los hiperparámetros

In [19]:
# Search space for the XGBoost random search.
# learning_rate starts at 0.01: a rate of 0 gives every boosted tree zero
# weight, so that candidate could never learn and only wasted search iterations.
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.15, 0.2],
    'gamma': [0, 0.25, 0.5, 1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb = XGBClassifier()
f1_scorer = make_scorer(f1_score)

# The search runs on a 10% subsample of the training rows to keep it
# affordable; the seeded generator makes the subsample reproducible.
sample_size = int(0.1 * X_train_vect.shape[0])
sample_indices = np.random.default_rng(42).choice(X_train_vect.shape[0], size=sample_size, replace=False)
X_train_sampled = X_train_vect[sample_indices]
y_train_sampled = y_train.iloc[sample_indices]

In [None]:
# 30 random candidates, 5-fold CV, scored by F1 on the subsample.
# random_state fixes which candidates are drawn so the search can be reproduced.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    cv=5,
    n_iter=30,
    scoring=f1_scorer,
    random_state=42,
)

random_search.fit(X_train_sampled, y_train_sampled)

In [None]:
# Report the winning hyperparameter combination and its cross-validated F1.
best_params, best_score = random_search.best_params_, random_search.best_score_

print(f'Mejores parametros: {best_params}')
print(f'Mejor F1 Score: {best_score}')

In [None]:
# Refit the best candidate on the full training matrix with early stopping.
# early_stopping_rounds is set on the estimator: passing it to fit() is
# deprecated and rejected by recent xgboost releases.
best_xgb_model = random_search.best_estimator_
best_xgb_model.set_params(early_stopping_rounds=10)
best_xgb_model.fit(X_train_vect, y_train, eval_set=[(X_test_vect, y_test)])
y_pred = best_xgb_model.predict(X_test_vect)

# NOTE(review): the held-out split also drives early stopping, so this F1 is
# an optimistic estimate; a separate validation split would remove the bias.
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1}')

In [None]:
# Date stamp (YYYY-MM-DD) used in the exported file names.
ahora = datetime.datetime.now()
current_date = ahora.strftime('%Y-%m-%d')

# Keep both the whole search object and the refitted best model.
search_path = f'Modelos/xgb_model_random_search_{current_date}.joblib'
model_path = f'Modelos/xgb_model_{current_date}.joblib'
joblib.dump(random_search, search_path)
joblib.dump(best_xgb_model, model_path)

## Conjunto test

In [None]:
# Index the competition test set by ID and vectorize its reviews with the
# vectorizer fitted for XGBoost.
conjunto_test = conjunto_test.set_index(conjunto_test['ID'])
X_conjunto_test = vectorizer.transform(conjunto_test['review_es'])

# Predict 0/1 classes, then translate them back to the text labels.
pred_test = best_xgb_model.predict(X_conjunto_test)
pred_test_labels = []
for pred in pred_test:
    pred_test_labels.append('positivo' if pred == 1 else 'negativo')

final_pred_df = pd.DataFrame(
    {'ID': conjunto_test.index, 'sentimiento': pred_test_labels}
)
final_pred_df

## Exportaciones

In [None]:
# Date stamp (YYYY-MM-DD) used in the exported file names.
ahora = datetime.datetime.now()
current_date = ahora.strftime('%Y-%m-%d')

# Write the XGBoost predictions without the DataFrame index.
final_pred_df.to_csv(
    f"Predicciones/XGBoost_{current_date}.csv",
    index=False,
)

## Vectorización de las críticas : con lematización

In [None]:
# XGBoost on lemmatized reviews: lemmatize -> TF-IDF -> random search on a
# subsample -> refit with early stopping -> evaluate -> export.
nlp = spacy.load('es_core_news_sm')

def preprocess_text(text):
    """Return the lemmas of the alphabetic tokens of `text`, joined by single spaces."""
    doc = nlp(text)
    return ' '.join([token.lemma_ for token in doc if token.is_alpha])

# Every text that goes through vectorizer_lemma must be lemmatized the same
# way, otherwise its tokens do not match the vocabulary learned on the
# lemmatized training data.
X_train_processed = X_train.apply(preprocess_text)
X_test_processed = X_test.apply(preprocess_text)
print(X_train_processed)

vectorizer_lemma = TfidfVectorizer(stop_words=stop_words_es)

X_train_vect_lemma = vectorizer_lemma.fit_transform(X_train_processed)
X_test_vect_lemma = vectorizer_lemma.transform(X_test_processed)

# Search space. early_stopping_rounds is not searched: the CV fits inside
# RandomizedSearchCV receive no eval_set, so enabling early stopping there
# makes every fit fail. learning_rate starts at 0.01 because a rate of 0
# can never learn.
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 3, 4, 5, 6],
    'learning_rate': [0.01, 0.02, 0.04, 0.06, 0.08, 0.1, 0.15, 0.2],
    'gamma': [0, 0.25, 0.5, 1, 2, 3],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

xgb = XGBClassifier()
f1_scorer = make_scorer(f1_score)

# Seeded 10% subsample of the training rows for an affordable, reproducible search.
sample_size_lemma = int(0.1 * X_train_vect_lemma.shape[0])
sample_indices_lemma = np.random.default_rng(42).choice(X_train_vect_lemma.shape[0], size=sample_size_lemma, replace=False)
X_train_sampled_lemma = X_train_vect_lemma[sample_indices_lemma]
y_train_sampled_lemma = y_train.iloc[sample_indices_lemma]

random_search_lemma = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    cv=5,
    n_iter=30,
    scoring=f1_scorer,
    random_state=42,
)

random_search_lemma.fit(X_train_sampled_lemma, y_train_sampled_lemma)

best_params = random_search_lemma.best_params_
best_score = random_search_lemma.best_score_

print(f'Mejores parametros: {best_params}')
print(f'Mejor F1 Score: {best_score}')

# Refit the best candidate on the full lemmatized training matrix.
# early_stopping_rounds is set on the estimator because passing it to fit()
# is deprecated and rejected by recent xgboost releases.
best_xgb_model_lemma = random_search_lemma.best_estimator_
best_xgb_model_lemma.set_params(early_stopping_rounds=10)
best_xgb_model_lemma.fit(X_train_vect_lemma, y_train, eval_set=[(X_test_vect_lemma, y_test)])

# Evaluate the lemmatized model itself (not the non-lemmatized best_xgb_model,
# whose vectorizer has a different vocabulary).
# NOTE(review): the held-out split also drives early stopping, so this F1 is
# an optimistic estimate.
y_pred_lemma = best_xgb_model_lemma.predict(X_test_vect_lemma)

f1_lemma = f1_score(y_test, y_pred_lemma)
print(f'F1 Score: {f1_lemma}')

current_date = datetime.datetime.now().strftime('%Y-%m-%d')

joblib.dump(random_search_lemma, f'Modelos/xgb_model_lemmatizacion_random_search_{current_date}.joblib')
joblib.dump(best_xgb_model_lemma, f'Modelos/xgb_model_lemmatizacion_{current_date}.joblib')

# The competition test reviews get the same lemmatization before vectorizing.
conjunto_test = conjunto_test.set_index(conjunto_test['ID'])
conjunto_test_processed = conjunto_test['review_es'].apply(preprocess_text)
X_conjunto_test_lemma = vectorizer_lemma.transform(conjunto_test_processed)

pred_test_lemma = best_xgb_model_lemma.predict(X_conjunto_test_lemma)
pred_test_labels_lemma = ['positivo' if pred == 1 else 'negativo' for pred in pred_test_lemma]

final_pred_df_lemma = pd.DataFrame({
    'ID': conjunto_test.index,
    'sentimiento': pred_test_labels_lemma
})

final_pred_df_lemma.to_csv(f"Predicciones/XGBoost_lemmatizacion_{current_date}.csv", index=False)

# 4. Red Neuronal aplicando Keras y TensorFlow

# 5. Ensamble de 3 modelos (o más)