### Tabla de contenidos

1. [**Importación de librerías**](#1.-Importación-de-librerías)   
2. [**Cargo los datos**](#2.-Cargo-los-datos) 
3. [**Limpieza de los datos**](#3.-Limpieza-de-los-datos)  
    3.1 [**Checkeo los duplicados**](#3.1-Checkeo-los-duplicados)  
    3.2 [**Compruebo si existen reseñas vacías**](#3.2-Compruebo-si-existen-reseñas-vacías)  
    3.3 [**Distribución de los datos**](#3.3-Distribución-de-los-datos)
4. [**Preprocesamiento del texto**](#4.-Preprocesamiento-del-texto)  
5. [**Normalización de los datos**](#5.-Normalización-de-los-datos)    
    5.1 [**Normalizo el puntaje de las reseñas para que varíe entre 0 y 4**](#5.1-Normalizo-el-puntaje-de-las-reseñas-para-que-varíe-entre-0-y-4)  
    5.2 [**Elimino columnas que no van a ser utilizadas**](#5.2-Elimino-columnas-que-no-van-a-ser-utilizadas)  
    5.3 [**Elimino textos que pueden haber quedado vacíos luego de preprocesar el texto**](#5.3-Elimino-textos-que-pueden-haber-quedado-vacíos-luego-de-preprocesar-el-texto)   
6. [**Extracción de características**](#6.-Extracción-de-características)   
    6.1 [**Generación de vectores**](#6.1-Generación-de-vectores)  
    6.2 [**Separo los datos en dos conjuntos: train y test (80/20)**](#6.2-Separo-los-datos-en-dos-conjuntos:-train-y-test-(80/20))
7. [**Creación del modelo**](#7.-Creación-del-modelo)   
    7.1 [**Regresión logística**](#7.1-Regresión-logística)  
    7.2 [**SVM (Support Vector Machine)**](#7.2-SVM-(Support-Vector-Machine))  
    7.2.1 [**SVM One-to-One**](#7.2.1-SVM-One-to-One)  
    7.2.2 [**SVM One-to-Rest**](#7.2.2-SVM-One-to-Rest)  
    7.3 [**Naive Bayes**](#7.3-Naive-Bayes)  
    7.3.1 [**GaussianNB**](#7.3.1-GaussianNB)  
    7.3.2 [**BernoulliNB**](#7.3.2-BernoulliNB)  
    7.4 [**Random Forest**](#7.4-Random-Forest)  

### 1. Importación de librerías

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup

### 2. Cargo los datos

In [None]:
# Load the reviews CSV from the Kaggle input directory into a DataFrame
# and print its (rows, columns) shape.
df = pd.read_csv("/kaggle/input/amazon-product-reviews/Reviews.csv")
print("Tamaño de los datos: ", df.shape)

In [None]:
# Preview the first rows of the raw data.
df.head()

### 3. Limpieza de los datos

#### 3.1 Checkeo los duplicados

In [None]:
# Order the rows by ProductId (missing ids last) before duplicates are dropped.
# NOTE(review): quicksort is not a stable sort, so rows sharing a ProductId may not
# keep their original relative order; confirm that is acceptable for a later
# keep='first' de-duplication.
df=df.sort_values('ProductId', kind='quicksort', na_position='last')

In [None]:
# Keep only the first occurrence of every review text.
# `subset` is passed as a list: a set is not an ordered sequence of column labels
# and is rejected by pandas versions that index into `subset`.
df = df.drop_duplicates(subset=["Text"], keep="first")
df.shape

#### 3.2 Compruebo si existen reseñas vacías

In [None]:
# Count missing values: the review-text count is printed, the score count is
# left as the cell's last expression.
print(df['Text'].isnull().sum())
df['Score'].isnull().sum()

#### 3.3 Distribución de los datos

In [None]:
# Number of reviews for each score value.
df['Score'].value_counts()

In [None]:
# Bar chart with the number of reviews per score.
plt.figure(figsize=(10, 7))
# Pass the column by keyword: recent seaborn releases only accept `data`
# positionally, so a positional Series is no longer read as the x variable.
sns.countplot(x=df['Score'])
plt.title("Distribución de la puntuación")

### 4. Preprocesamiento del texto

In [None]:
# Ordered (pattern, replacement) rules. The irregular forms ("won't", "can't")
# must run before the generic "n't" rule, and "n't" before the bare "'t" rule.
# Patterns are case-insensitive and accept both the ASCII (') and the
# typographic (’) apostrophe.
_CONTRACTION_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), replacement)
    for pattern, replacement in (
        (r"won['’]t", "will not"),
        (r"can['’]t", "can not"),
        (r"n['’]t", " not"),
        (r"['’]re", " are"),
        (r"['’]s", " is"),
        (r"['’]d", " would"),
        (r"['’]ll", " will"),
        (r"['’]t", " not"),
        (r"['’]ve", " have"),
        (r"['’]m", " am"),
    )
)


def decontract(text):
    """Expand common English contractions in *text*.

    The rules are applied in a fixed order. "'s" is always expanded to " is"
    and "'d" to " would", so possessives and "had" forms are expanded the
    same way.

    Args:
        text: The string to process.

    Returns:
        The string with every matched contraction replaced by its expansion.
        Matching ignores case, so "Won't" becomes "will not" rather than
        "Wo not"; the replacement text itself is always lower case.
    """
    for pattern, replacement in _CONTRACTION_RULES:
        text = pattern.sub(replacement, text)
    return text

In [None]:
# Start from NLTK's English stop-word list.
stop_words = set(stopwords.words('english'))
# Entries treated as negations: anything containing the substring 'no' or "n't".
# NOTE(review): the substring test also matches non-negation entries that merely
# contain "no" (e.g. "now", if it is in the list); confirm this is intended.
negative_stop_words = {word for word in stop_words if 'no' in word or "n't" in word}
# Take the negations out of the stop-word set so they are not filtered from the text.
stop_words -= negative_stop_words

In [None]:
lemmatizer = WordNetLemmatizer()


def preprocess_text(review):
    """Clean one review and return it as a space-separated string of lemmas.

    Steps, in order: drop URLs, strip HTML markup, expand contractions, drop
    every token containing a digit, replace runs of non-letters with a single
    space, lower-case, remove stop words and lemmatize each token as a verb.

    Args:
        review: Raw review text.

    Returns:
        The cleaned text with no leading, trailing or repeated spaces; an
        empty string when nothing is left after cleaning.
    """
    review = re.sub(r"http\S+", "", review)             # removing website links
    review = BeautifulSoup(review, 'lxml').get_text()   # removing html tags
    review = decontract(review)                         # decontracting
    review = re.sub(r"\S*\d\S*", "", review)            # removing the words with numeric digits
    review = re.sub(r"[^A-Za-z]+", " ", review)         # removing non-word characters
    review = review.lower()                             # converting to lower case
    # split() without arguments discards the empty tokens that edge spaces would
    # otherwise produce, so the joined result never carries stray whitespace
    # (the previous trailing `review.strip()` discarded its result).
    tokens = [word for word in review.split() if word not in stop_words]  # removing stop words
    return " ".join(lemmatizer.lemmatize(token, "v") for token in tokens)  # lemmatization


df['Text'] = df['Text'].apply(preprocess_text)

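En caso de que se quiera utilizar stemming en vez de lematización, se puede ejecutar la siguiente celda en lugar de la anterior (si se ejecutan ambas, el stemming se aplica sobre el texto ya lematizado).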

In [None]:
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer(language='english')


def preprocess_text(review):
    """Clean one review and return it as a space-separated string of stems.

    Steps, in order: drop URLs, strip HTML markup, expand contractions, drop
    every token containing a digit, replace runs of non-letters with a single
    space, lower-case, remove stop words and stem each token.

    Args:
        review: Raw review text.

    Returns:
        The cleaned text with no leading, trailing or repeated spaces; an
        empty string when nothing is left after cleaning.
    """
    review = re.sub(r"http\S+", "", review)             # removing website links
    review = BeautifulSoup(review, 'lxml').get_text()   # removing html tags
    review = decontract(review)                         # decontracting
    review = re.sub(r"\S*\d\S*", "", review)            # removing the words with numeric digits
    review = re.sub(r"[^A-Za-z]+", " ", review)         # removing non-word characters
    review = review.lower()                             # converting to lower case
    # split() without arguments discards the empty tokens that edge spaces would
    # otherwise produce, so the joined result never carries stray whitespace
    # (the previous trailing `review.strip()` discarded its result).
    tokens = [word for word in review.split() if word not in stop_words]  # removing stop words
    return " ".join(stemmer.stem(token) for token in tokens)  # stemming


df['Text'] = df['Text'].apply(preprocess_text)

In [None]:
# Inspect the first preprocessed review texts.
df['Text'].head()

### 5. Normalización de los datos

#### 5.1 Normalizo el puntaje de las reseñas para que varíe entre 0 y 4

In [None]:
def normalize(score):
    """Shift *score* down by one, so a 1-based rating becomes 0-based."""
    offset = 1
    return score - offset

In [None]:
# Apply normalize() to every score so the labels start at 0.
df["Score"] = df["Score"].apply(normalize)

#### 5.2 Elimino columnas que no van a ser utilizadas

In [None]:
# Columns that the later cells never read (they only use 'Text' and 'Score').
unused_columns = [
    'Id',
    'ProductId',
    'UserId',
    'ProfileName',
    'HelpfulnessNumerator',
    'HelpfulnessDenominator',
    'Time',
    'Summary',
]
df = df.drop(columns=unused_columns)

#### 5.3 Elimino textos que pueden haber quedado vacíos luego de preprocesar el texto

In [None]:
# Mark texts that are empty or whitespace-only after preprocessing as missing,
# then drop those rows.
# The result is assigned back instead of calling replace(..., inplace=True) on
# df['Text']: under pandas copy-on-write the in-place call acts on a temporary
# object and leaves `df` unchanged.
df['Text'] = df['Text'].replace(r'^\s*$', np.nan, regex=True)
df = df.dropna(subset=['Text'])

In [None]:
# Shuffle the rows and rebuild a 0..n-1 index.
# The shuffle is seeded so the row order is reproducible between runs; vectors or
# CSV files saved from one run only line up with `df.Score` in another run if the
# order is the same.
df = df.sample(frac=1, random_state=1).reset_index(drop=True)
df.head()

### 6. Extracción de características

#### 6.1 Generación de vectores

Para generar los vectores utilizaremos spaCy, que permite obtener de manera sencilla un vector por reseña a partir de word embeddings preentrenados (del estilo de word2vec o GloVe).

In [None]:
# Load the spaCy pipeline named 'en_core_web_lg' (it must already be installed).
nlp = spacy.load('en_core_web_lg')

In [None]:
# Build one document vector per review.
# Doc.vector only needs the tokenizer and the pretrained word vectors, so every
# pipeline component is switched off while vectorizing. disable_pipes() called
# without names disables nothing, which sent each review through the full pipeline.
# nlp.pipe() streams the texts in batches instead of one nlp() call per row.
with nlp.disable_pipes(*nlp.pipe_names):
    vectors = np.array([doc.vector for doc in nlp.pipe(df['Text'])])

vectors.shape

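Con las siguientes celdas se pueden guardar los vectores y el dataset procesado, o cargar vectores guardados previamente (en ese caso deben corresponder al mismo orden de filas que `df`).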

In [None]:
# Persist the computed vectors to 'vectors.npy' in the working directory.
from numpy import save
save('vectors.npy', vectors)

In [None]:
# Load vectors saved earlier, replacing the `vectors` array computed above.
# NOTE(review): the rows of this array must be in the same order as `df` for the
# labels used later to match; confirm the file was produced from the same row order.
from numpy import load
vectors = load('/kaggle/input/lemmatized/vectors.npy')

In [None]:
# Write the processed DataFrame to CSV without the index column.
df.to_csv('ReviewsLemmatized.csv',index=False)

#### 6.2 Separo los datos en dos conjuntos: train y test (80/20)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    vectors,
    df["Score"],
    test_size=0.2,
    random_state=1,
)

### 7. Creación del modelo

#### 7.1 Regresión logística

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with a very high iteration cap and report its
# accuracy on the held-out set.
lr = LogisticRegression(max_iter=100000).fit(X_train, y_train)

test_accuracy = lr.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')

#### 7.2 SVM (Support Vector Machine)

##### 7.2.1 SVM One-to-One

In [None]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier

# Linear SVM wrapped in a one-vs-one multiclass scheme.
ovo_base = LinearSVC(dual=False, random_state=1)
model2 = OneVsOneClassifier(ovo_base).fit(X_train, y_train)

test_accuracy = model2.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')

##### 7.2.2 SVM One-to-Rest

In [None]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

# Linear SVM wrapped in a one-vs-rest multiclass scheme.
ovr_base = LinearSVC(dual=False, random_state=1)
model3 = OneVsRestClassifier(ovr_base).fit(X_train, y_train)

test_accuracy = model3.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')

#### 7.3 Naive Bayes

##### 7.3.1 GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings.
gnb = GaussianNB().fit(X_train, y_train)

test_accuracy = gnb.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')

##### 7.3.2 BernoulliNB

In [None]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings.
bnb = BernoulliNB().fit(X_train, y_train)

test_accuracy = bnb.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')

#### 7.4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest limited to depth-2 trees, seeded for repeatability.
clf = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)

test_accuracy = clf.score(X_test, y_test)
print(f'Model test accuracy: {test_accuracy*100:.3f}%')