# <center> Trabajo Práctico 2 </center>
### <center> Grupo 10 </center>
## Integrantes:
#### Alan Richmond
#### Flavian Ferré
#### Alan Mejia

# 0. Importaciones

## Librerías

In [5]:
import pandas as pd
import datetime
import joblib
import stop_words

from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.ensemble import RandomForestClassifier

## Conjuntos

In [6]:
# Read the three CSV files from the local Datasets/ folder, in this order:
# training data, test data and the sample submission.
conjunto_train, conjunto_test, sample_solution = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Datasets/train.csv',
        'Datasets/test.csv',
        'Datasets/sample_solution.csv',
    )
)

## Vectorización de las críticas

In [7]:
# Features are the review texts; the target is the sentiment label encoded as
# 1 ('positivo') / 0 ('negativo').
X = conjunto_train['review_es']
y = conjunto_train['sentimiento'].map({'positivo': 1, 'negativo': 0})

# Series.map turns any label that is missing from the dict into NaN. Fail here with a
# clear message instead of letting NaN targets reach the classifiers.
if y.isna().any():
    unknown_labels = conjunto_train.loc[y.isna(), 'sentimiento'].unique().tolist()
    raise ValueError(f"Unexpected values in 'sentimiento': {unknown_labels}")

# Hold out 20% of the labelled rows for validation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features with Spanish stop words removed. The vectorizer is fitted on the
# training split only and merely applied to the validation split.
stop_words_es = stop_words.get_stop_words('es')
vectorizer = TfidfVectorizer(stop_words=stop_words_es)

X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# 1. Bayes Naïve

# 2. Random Forest

## Búsqueda de los hiperparámetros

In [69]:
# Random forest with fixed hyperparameters, fitted on the TF-IDF training matrix.
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
).fit(X_train_vect, y_train)

# Evaluate on the held-out validation split.
y_pred = rf_model.predict(X_test_vect)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.8146013448607108


## Conjunto test

In [70]:
# Index the test frame by 'ID' (the column itself is kept) and vectorize its reviews
# with the TF-IDF vectorizer fitted on the training split.
conjunto_test = conjunto_test.set_index('ID', drop=False)
X_conjunto_test = vectorizer.transform(conjunto_test['review_es'])

# Predict with the random forest and translate the 0/1 classes back to string labels.
pred_test = rf_model.predict(X_conjunto_test)
pred_test_labels = [
    'positivo' if is_positive else 'negativo'
    for is_positive in (pred_test == 1)
]

# Two-column frame (ID, sentimiento) that is written to CSV in the export step.
submission_columns = {'ID': conjunto_test.index, 'sentimiento': pred_test_labels}
final_pred_df = pd.DataFrame(submission_columns)
final_pred_df

Unnamed: 0,ID,sentimiento
0,60000,negativo
1,60001,negativo
2,60002,negativo
3,60003,negativo
4,60004,positivo
...,...,...
8594,68594,positivo
8595,68595,negativo
8596,68596,positivo
8597,68597,negativo


## Exportaciones

In [71]:
# Today's local date as YYYY-MM-DD; it tags the exported file names.
current_date = datetime.date.today().isoformat()

# Write the random forest predictions without the DataFrame's row index.
rf_predictions_path = f"Predicciones/RandomForest_{current_date}.csv"
final_pred_df.to_csv(rf_predictions_path, index=False)

In [72]:
# Persist the fitted random forest. joblib.dump returns the list of written file names,
# which is what the cell displays.
rf_model_path = f'Modelos/rf_model_{current_date}.joblib'
joblib.dump(rf_model, rf_model_path)

['Modelos/rf_model_2024-05-23.joblib']

# 3. XGBoost

## Búsqueda de los hiperparámetros

In [8]:
# Candidate values sampled by the randomized hyperparameter search.
param_distributions = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [2, 3, 4, 5, 6],
    # A learning rate of 0 scales every tree's contribution to zero, so such candidates
    # can never learn and only waste search iterations; start at a small positive value.
    'learning_rate': [0.01, 0.02, 0.04, 0.06, 0.08, 0.1],
    'gamma': [0, 0.5, 1, 2, 3]
}

# Same seed as the rest of the notebook.
xgb = XGBClassifier(random_state=42)
f1_scorer = make_scorer(f1_score)

# Tune on the first 5% of the training split to keep the 5 x 20 = 100 fits affordable.
# train_test_split shuffles by default, so a head slice of its output is a random sample.
sample_size = int(0.05 * X_train_vect.shape[0])
X_train_sampled = X_train_vect[:sample_size]
# .iloc makes the slice explicitly positional (y_train keeps the shuffled original index),
# so the labels stay aligned with the matrix rows.
y_train_sampled = y_train.iloc[:sample_size]

# random_state pins which 20 candidates are drawn, making the search reproducible.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_distributions,
    n_iter=20,
    cv=5,
    scoring=f1_scorer,
    random_state=42,
    n_jobs=-1,
    verbose=3,
)

random_search.fit(X_train_sampled, y_train_sampled)

Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [9]:
# Winning hyperparameter combination and its mean cross-validated F1 from the search.
best_params, best_score = random_search.best_params_, random_search.best_score_

print(
    f'Mejores parametros: {best_params}',
    f'Mejor F1 Score: {best_score}',
    sep='\n',
)

Mejores parametros: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'gamma': 0}
Mejor F1 Score: 0.8046269701789626


In [10]:
# Refit the winning configuration on the full training split (the search only saw a 5%
# sample). fit() returns the estimator itself, so best_xgb_model is the very same object
# as random_search.best_estimator_, which is therefore refitted in place.
best_xgb_model = random_search.best_estimator_.fit(X_train_vect, y_train)

# Evaluate on the held-out validation split.
y_pred = best_xgb_model.predict(X_test_vect)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f'F1 Score: {f1}')

F1 Score: 0.8357129259470778


## Conjunto test

In [12]:
# Index the test frame by 'ID' (the column itself is kept) and vectorize its reviews
# with the TF-IDF vectorizer fitted on the training split.
conjunto_test = conjunto_test.set_index('ID', drop=False)
X_conjunto_test = vectorizer.transform(conjunto_test['review_es'])

# Predict with the tuned XGBoost model and translate the 0/1 classes back to string labels.
pred_test = best_xgb_model.predict(X_conjunto_test)
pred_test_labels = [
    'positivo' if is_positive else 'negativo'
    for is_positive in (pred_test == 1)
]

# Two-column frame (ID, sentimiento) that is written to CSV in the export step.
submission_columns = {'ID': conjunto_test.index, 'sentimiento': pred_test_labels}
final_pred_df = pd.DataFrame(submission_columns)
final_pred_df

Unnamed: 0,ID,sentimiento
0,60000,negativo
1,60001,positivo
2,60002,negativo
3,60003,negativo
4,60004,negativo
...,...,...
8594,68594,positivo
8595,68595,positivo
8596,68596,positivo
8597,68597,negativo


## Exportaciones

In [13]:
# Today's local date as YYYY-MM-DD; it tags the exported file names.
current_date = datetime.date.today().isoformat()

# Write the XGBoost predictions without the DataFrame's row index.
xgb_predictions_path = f"Predicciones/XGBoost_{current_date}.csv"
final_pred_df.to_csv(xgb_predictions_path, index=False)

In [14]:
# Persist the refitted XGBoost model. joblib.dump returns the list of written file names,
# which is what the cell displays.
xgb_model_path = f'Modelos/xgb_model_{current_date}.joblib'
joblib.dump(best_xgb_model, xgb_model_path)

['Modelos/xgb_model_2024-05-24.joblib']

# 4. Red Neuronal aplicando Keras y TensorFlow

# 5. Ensamble de 3 modelos (o más)