In [1]:
import os
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [2]:
# 2. Data understanding
# Folder holding Train.csv / Test.csv / Valid.csv. Set the REVIEWS_DATA_DIR
# environment variable to point somewhere else; the fallback is the original
# author's local folder, so existing setups behave exactly as before.
DATA_DIR = Path(
    os.environ.get(
        "REVIEWS_DATA_DIR",
        "C:/Users/Dragu/PycharmProjects/proyectoIA/src/archive",
    )
)

# Load the three CSV files from DATA_DIR
df_train = pd.read_csv(DATA_DIR / "Train.csv")
df_test = pd.read_csv(DATA_DIR / "Test.csv")
df_valid = pd.read_csv(DATA_DIR / "Valid.csv")


In [3]:

# Stack the three loaded frames into one, renumbering the rows from 0
source_frames = [df_train, df_test, df_valid]
df_review = pd.concat(source_frames, ignore_index=True)

# DataFrame.info() writes its report to stdout itself and returns None, so
# printing the return value also emits a trailing "None" line.
info_result = df_review.info()
print(info_result)

# First rows of the combined frame
preview_rows = df_review.head()
print(preview_rows)

# Column names, shown before any column is accessed by name
column_names = df_review.columns
print(column_names)

# Number of rows per value of the 'label' column
label_counts = df_review['label'].value_counts()
print(label_counts)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    50000 non-null  object
 1   label   50000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 781.4+ KB
None
                                                text  label
0  I grew up (b. 1965) watching and loving the Th...      0
1  When I put this movie in my DVD player, and sa...      0
2  Why do people who do not know what a particula...      0
3  Even though I have great interest in Biblical ...      0
4  Im a die hard Dads Army fan and nothing will e...      1
Index(['text', 'label'], dtype='object')
label
0    25000
1    25000
Name: count, dtype: int64


In [4]:

# 3. Data preparation
# Balance the label classes with RandomUnderSampler (fixed seed)
rus = RandomUnderSampler(random_state=0)
text_frame = df_review[['text']]      # kept 2-D (single-column DataFrame) for fit_resample
label_series = df_review['label']
X_bal, y_bal = rus.fit_resample(text_frame, label_series)

# Rebuild a DataFrame using the column names the rest of the notebook reads
df_review_bal = pd.DataFrame({'review': X_bal['text'], 'sentiment': y_bal})

sentiment_counts = df_review_bal['sentiment'].value_counts()
print(sentiment_counts)

# Train/test split: 33% of the rows are held out for testing (fixed seed)
train, test = train_test_split(df_review_bal, test_size=0.33, random_state=42)
train_x = train['review']
train_y = train['sentiment']
test_x = test['review']
test_y = test['sentiment']

# TF-IDF vectorization: the vectorizer is fitted on the training reviews only
# and then applied, unchanged, to the test reviews
tfidf = TfidfVectorizer(stop_words='english')
train_x_vector = tfidf.fit_transform(train_x)
test_x_vector = tfidf.transform(test_x)

sentiment
0    25000
1    25000
Name: count, dtype: int64


In [None]:

# 4. Modeling
# Candidate classifiers, keyed by the display name used in the accuracy
# printout below. Later cells look up the fitted SVM via models["SVM"].
models = {
    "SVM": SVC(kernel='linear'),
    # Seeded: scikit-learn permutes features at every split, so an unseeded
    # tree can produce a different model (and accuracy) on each run.
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Naive Bayes": MultinomialNB(),
    # Raised iteration cap (default is 100) so the solver is not stopped
    # before convergence on the sparse, high-dimensional TF-IDF matrix.
    "Logistic Regression": LogisticRegression(max_iter=1000),
}

for name, model in models.items():
    # Fit on the TF-IDF training matrix; the fitted estimator stays in `models`
    model.fit(train_x_vector, train_y)
    # Mean accuracy on the held-out test matrix
    accuracy = model.score(test_x_vector, test_y)
    print(f"{name} Accuracy: {accuracy:.4f}")

In [None]:

# 5. Evaluation
# Text classification report for the fitted SVM on the held-out test split
svc = models["SVM"]
predictions = svc.predict(test_x_vector)
report_text = classification_report(test_y, predictions)
print(report_text)


In [None]:
# Confusion matrix with rows/columns ordered as label 1 first, then label 0.
# NOTE(review): the tick names assume 1 = positive and 0 = negative — confirm
# against the dataset's label encoding.
class_order = [1, 0]
class_names = ['positive', 'negative']
conf_mat = confusion_matrix(test_y, predictions, labels=class_order)

sns.heatmap(
    conf_mat,
    annot=True,   # write the count inside each cell
    fmt="d",      # counts are integers
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicción")
plt.ylabel("Real")
plt.title("Matriz de Confusión")
plt.show()


In [None]:
# Persist the fitted artifacts (joblib is imported in the notebook's top import cell).

# Save the trained SVM (the plain model fitted above, without GridSearch)
joblib.dump(models["SVM"], "modelo_svm.pkl")

# Save the fitted TF-IDF vectorizer
joblib.dump(tfidf, "vectorizador_tfidf.pkl")
