In [None]:
# Importamos las librerías necesarias
import pandas as pd
import numpy as np
import string
import re
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM so files stored there can be read
# through the local filesystem.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

# Location of the IMDB reviews CSV inside the mounted Drive folder.
DATASET_CSV = '/content/drive/MyDrive/Diplomado IA/IMDB Dataset.csv'

# Load the whole dataset into a DataFrame.
df = pd.read_csv(DATASET_CSV)

# Plain-text preview of the first rows.
print(df.head(5))

In [None]:
# Data cleaning
def clean_text(text):
    """Normalize a raw review: strip HTML markup, drop punctuation, lowercase.

    Args:
        text: Raw review string, possibly containing HTML tags such as
            ``<br />``.

    Returns:
        The cleaned, lower-cased text. Runs of whitespace are left as-is.
    """
    # separator=" " keeps the words on either side of a tag apart. Without it,
    # "great.<br /><br />The" collapses to "great.The", which turns into the
    # bogus single token "greatthe" once punctuation is stripped below.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")
    # Drop everything that is not a word character or whitespace
    # (note: \w keeps letters, digits and "_").
    text = re.sub(r'[^\w\s]', '', text)
    text = text.lower()
    return text

# Clean every review element-wise and overwrite the column with the result.
df['review'] = df['review'].map(clean_text)

# Show a plain-text preview of the cleaned data under its heading.
print("Datos después de la limpieza de texto:", df.head(), sep="\n")

In [None]:
# MODEL
# Encode the sentiment labels as integers. LabelEncoder assigns codes in
# sorted class order; the Streamlit cell further down treats code 0 as the
# negative class.
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

X = df['review']
y = df['sentiment']

# Hold out 30% of the reviews for the final evaluation (fixed seed so the
# split is reproducible).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit TF-IDF on the training split only, then reuse the fitted vocabulary and
# IDF weights to transform the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

modelo = LogisticRegression()
modelo.fit(X_train_tfidf, y_train)

# 5-fold cross-validation on the training split (cross_val_score fits clones,
# so `modelo` itself is left untouched).
# NOTE(review): the vectorizer was fitted on the whole training split, so each
# validation fold has already contributed to the vocabulary/IDF statistics;
# these scores may be slightly optimistic.
puntajes_validacion_cruzada = cross_val_score(modelo, X_train_tfidf, y_train, cv=5)
print("Puntajes de Validación Cruzada:", puntajes_validacion_cruzada)
print("Puntaje Promedio de Validación Cruzada:", np.mean(puntajes_validacion_cruzada))

y_prediccion = modelo.predict(X_test_tfidf)

# Integer codes and display names of every class known to the encoder.
# Passing the codes explicitly keeps the report and the confusion matrix
# aligned with the names used below.
codigos_clases = np.arange(len(le.classes_))
nombres_clases = [str(clase) for clase in le.classes_]

exactitud = accuracy_score(y_test, y_prediccion)
informe_clasificacion = classification_report(
    y_test, y_prediccion, labels=codigos_clases, target_names=nombres_clases
)
matriz_confusion = confusion_matrix(y_test, y_prediccion, labels=codigos_clases)

# Report the held-out metrics; previously they were computed but never shown.
print("Exactitud en el conjunto de prueba:", exactitud)
print("Informe de Clasificación:")
print(informe_clasificacion)

plt.figure(figsize=(8, 6))
sns.heatmap(
    matriz_confusion,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    # Show class names instead of bare 0/1 on both axes.
    xticklabels=nombres_clases,
    yticklabels=nombres_clases,
)
plt.xlabel("Predicho")
plt.ylabel("Real")
plt.title("Matriz de Confusión")
plt.show()



In [None]:
# TEST
# Accuracy of the model's predictions on the held-out test split.
precision_modelo = accuracy_score(y_test, y_prediccion)

# Report whether the accuracy clears the required minimum of 0.82.
print(
    "El modelo cumple con el requisito de precisión (> 0.82)."
    if precision_modelo > 0.82
    else "El modelo no cumple con el requisito de precisión (> 0.82)."
)

El modelo cumple con el requisito de precisión (> 0.82).


## PICKLE

In [None]:
import pickle

In [None]:

# Serialize the trained classifier and the fitted TF-IDF vectorizer to the
# working directory, model first, so they can be loaded again later.
for artifact, pickle_path in (
    (modelo, 'logistic_model.pkl'),
    (tfidf_vectorizer, 'tfidf_vectorizer.pkl'),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(artifact, out_file)

In [None]:
#STREAMLIT
! pip install streamlit

In [None]:
pip install streamlit -q

In [None]:
pip install streamlit-lottie

In [None]:

import streamlit as st
import numpy as np
import pickle
from bs4 import BeautifulSoup
import re


def _clean_review(raw_review):
    """Strip HTML, drop punctuation and lowercase a user-supplied review.

    Args:
        raw_review: Text typed by the user.

    Returns:
        The normalized text that is handed to the TF-IDF vectorizer.
    """
    # separator=" " keeps the words on either side of a tag from being glued
    # together (e.g. "good<br />film" -> "good film", not "goodfilm").
    cleaned = BeautifulSoup(raw_review, "html.parser").get_text(separator=" ")
    cleaned = re.sub(r'[^\w\s]', '', cleaned)
    return cleaned.lower()


# Load the trained model and the fitted TF-IDF vectorizer.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that were produced by this notebook or another trusted source.
with open('logistic_model.pkl', 'rb') as model_file:
    modelo = pickle.load(model_file)

with open('tfidf_vectorizer.pkl', 'rb') as vectorizer_file:
    tfidf_vectorizer = pickle.load(vectorizer_file)

st.title("Predict Reviews' Sentiment")

# Review text entered by the user
review = st.text_input("Ingrese la reseña:")

if st.button("Predicción"):
    # Preprocess the review before vectorizing it
    review_cleaned = _clean_review(review)

    if not review_cleaned.strip():
        # Nothing left to classify (empty or punctuation-only input): ask for
        # text instead of reporting a verdict for an empty document.
        st.warning("Ingrese una reseña antes de predecir.")
    else:
        review_vectorized = tfidf_vectorizer.transform([review_cleaned])

        # Prediction: class 0 is reported as negative, anything else as positive
        pred = modelo.predict(review_vectorized)[0]

        if pred == 0:
            st.write("La reseña es Negativa")
        else:
            st.write("La reseña es Positiva")

In [None]:
# Fetch ipv4.icanhazip.com and print the response to stdout (-q silences
# wget's progress output, "-O -" writes the body to stdout). The service is
# expected to answer with this machine's public IPv4 address.
# NOTE(review): presumably needed to expose the Streamlit app through a
# tunnel — confirm, since no tunnel command is visible in this notebook.
! wget -q -O - ipv4.icanhazip.com