## Preprocesado

In [54]:
import numpy as np
import pandas as pd

In [55]:
df = pd.read_csv('data/reviews_final.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33754 entries, 0 to 33753
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Id                        33754 non-null  object 
 1   review/score              33754 non-null  float64
 2   review/text               33754 non-null  int64  
 3   Title                     33754 non-null  object 
 4   authors                   33754 non-null  object 
 5   publisher                 33754 non-null  object 
 6   publishedDate             33754 non-null  int64  
 7   categories                33754 non-null  object 
 8   Price                     33754 non-null  float64
 9   conteo                    33754 non-null  int64  
 10  review_helpfulness_up     33754 non-null  int64  
 11  review_helpfulness_total  33754 non-null  int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 3.1+ MB


In [56]:
df.head(3)

Unnamed: 0,Id,review/score,review/text,Title,authors,publisher,publishedDate,categories,Price,conteo,review_helpfulness_up,review_helpfulness_total
0,764229893,5.0,520,Gods and Kings (Chronicles of the Kings #1),Lynn Austin,Bethany House Publishers,2005,Fiction,10.19,315,0,0
1,764229893,5.0,408,Gods and Kings (Chronicles of the Kings #1),Lynn Austin,Bethany House Publishers,2005,Fiction,10.19,315,0,0
2,764229893,5.0,250,Gods and Kings (Chronicles of the Kings #1),Lynn Austin,Bethany House Publishers,2005,Fiction,10.19,315,0,0


### Preprocesado para Precio

In [57]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler

# preprocesado para Precio
pipeline_price = Pipeline([
    (
        'onehot-log',
        ColumnTransformer(
            transformers=[
                (
                    'onehot',
                    OneHotEncoder(
                        sparse_output=False, 
                        dtype=np.uint8, 
                        handle_unknown='ignore'
                        ),
                        ['authors', 'publisher', 'categories']
                ),
                (
                    'log',
                    FunctionTransformer(np.log1p),
                    ['review/text','review_helpfulness_up', 'review_helpfulness_total']
                ),
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    ),
    (
        'z-score',
        ColumnTransformer(
            transformers=[
                (
                    'z-score',
                    StandardScaler(),
                    ['review/text','review/score', 'conteo', 'review_helpfulness_up', 'review_helpfulness_total', 'publishedDate']
                ),
                (
                    'drop_id_title',
                    'drop',
                    ['Id', 'Title']
                )
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    )
])

df_cod_price = pipeline_price.fit_transform(df)



In [58]:
df_cod_price.head(5)

Unnamed: 0,review/text,review/score,conteo,review_helpfulness_up,review_helpfulness_total,publishedDate,authors_Arlie Russell Hochschild,authors_Arthur Pike & David Pike,authors_Ashley Audrain,authors_Ben Mezrich,...,categories_Juvenile Fiction,categories_Juvenile Nonfiction,categories_Performing Arts,categories_Political Science,categories_Psychology,categories_Religion,categories_Social Science,categories_True Crime,categories_Young Adult Fiction,Price
0,0.258933,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.19
1,0.006045,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.19
2,-0.504113,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.19
3,-1.673016,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.19
4,-0.171757,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,10.19


### Preprocesado para Autores

In [66]:
from sklearn.preprocessing import OrdinalEncoder

autores = df['authors'].unique()
pipeline_authors = Pipeline([
    (
        'one-hot-transformer',
        ColumnTransformer(
            transformers=[
                (
                    'onehot',
                    OneHotEncoder(
                        sparse_output=False, 
                        dtype=np.uint8, 
                        handle_unknown='ignore'
                        ),
                        ['publisher', 'categories']
                ),
                (
                    'log',
                    FunctionTransformer(np.log1p),
                    ['review/text','review_helpfulness_up', 'review_helpfulness_total']
                ),
                (
                    'ordinal',
                    OrdinalEncoder(
                        dtype=np.int64,
                        categories=[autores]
                    ),
                    ['authors']
                ),
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    ),
    (
        'z-score-transformer',
        ColumnTransformer(
            transformers=[
                (
                    'z-score',
                    StandardScaler(),
                    ['review/text', 'Price','review/score', 'conteo', 'review_helpfulness_up', 'review_helpfulness_total', 'publishedDate']
                ),
                (
                    'drop_id_title',
                    'drop',
                    ['Id', 'Title']
                )
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    )
])

df_cod_authors = pipeline_authors.fit_transform(df)



In [67]:
df_cod_authors.head(5)

Unnamed: 0,review/text,Price,review/score,conteo,review_helpfulness_up,review_helpfulness_total,publishedDate,publisher_Ags Pub,publisher_Amsterdam University Press,publisher_Anchor Canada,...,categories_Juvenile Fiction,categories_Juvenile Nonfiction,categories_Performing Arts,categories_Political Science,categories_Psychology,categories_Religion,categories_Social Science,categories_True Crime,categories_Young Adult Fiction,authors
0,0.258933,-0.523355,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.006045,-0.523355,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.504113,-0.523355,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,-1.673016,-0.523355,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.171757,-0.523355,0.694733,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Preprocesado para Rating

In [68]:
pipeline_rating = Pipeline([
    (
        'one-hot-transformer',
        ColumnTransformer(
            transformers=[
                (
                    'onehot',
                    OneHotEncoder(
                        sparse_output=False, 
                        dtype=np.uint8, 
                        handle_unknown='ignore'
                        ),
                        ['authors', 'publisher', 'categories']
                ),
                (
                    'log',
                    FunctionTransformer(np.log1p),
                    ['review/text','review_helpfulness_up', 'review_helpfulness_total']
                )
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    ),
    (
        'z-score-transformer',
        ColumnTransformer(
            transformers=[
                (
                    'z-score',
                    StandardScaler(),
                    ['review/text', 'Price', 'conteo', 'review_helpfulness_up', 'review_helpfulness_total', 'publishedDate']
                ),
                (
                    'drop_id_title',
                    'drop',
                    ['Id', 'Title']
                )
            ],
            remainder='passthrough',
            verbose_feature_names_out=False
        ).set_output(transform='pandas')
    )
])

df_cod_rating = pipeline_rating.fit_transform(df)



In [69]:
df_cod_rating.head(5)

Unnamed: 0,review/text,Price,conteo,review_helpfulness_up,review_helpfulness_total,publishedDate,authors_Arlie Russell Hochschild,authors_Arthur Pike & David Pike,authors_Ashley Audrain,authors_Ben Mezrich,...,categories_Juvenile Fiction,categories_Juvenile Nonfiction,categories_Performing Arts,categories_Political Science,categories_Psychology,categories_Religion,categories_Social Science,categories_True Crime,categories_Young Adult Fiction,review/score
0,0.258933,-0.523355,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
1,0.006045,-0.523355,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
2,-0.504113,-0.523355,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
3,-1.673016,-0.523355,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0
4,-0.171757,-0.523355,-0.832488,-0.794639,-0.918971,0.043671,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5.0


## Separacion de los datos de entrenamiento

In [70]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
import time

In [71]:
df_price_train, df_price_val = train_test_split(df_cod_price, test_size=0.25, random_state=42, stratify=df_cod_price['Price'])

In [72]:
X_price_train = df_price_train.drop(columns='Price')
Y_price_train = df_price_train['Price']

X_price_val = df_price_val.drop(columns='Price')
Y_price_val = df_price_val['Price']

### Separacion para Hipotesis Autores

In [73]:
df_authors_train, df_authors_val = train_test_split(df_cod_authors, test_size=0.25, random_state=42, stratify=df_cod_authors['authors'])

In [74]:
X_authors_train = df_authors_train.drop(columns='authors')
Y_authors_train = df_authors_train['authors']

X_authors_val = df_authors_val.drop(columns='authors')
Y_authors_val = df_authors_val['authors']

### Separacion para Hipotesis Rating

In [75]:
df_rating_train, df_rating_val = train_test_split(df_cod_rating, test_size=0.25, random_state=42, stratify=df_cod_rating['review/score'])

In [76]:
X_rating_train = df_rating_train.drop(columns='review/score')
Y_rating_train = df_rating_train['review/score']

X_rating_val = df_rating_val.drop(columns='review/score')
Y_rating_val = df_rating_val['review/score']

### Entrenamiento para Hipotesis Rating

In [77]:
rf_rating = RandomForestClassifier(n_estimators = 120, max_depth = 15, class_weight = 'balanced')
start = time.time()
rf_rating.fit(X_rating_train, Y_rating_train)
end = time.time()
print(f"Tiempo de entrenamiento: {end-start} segundos")

Tiempo de entrenamiento: 5.119294166564941 segundos


In [78]:
predicciones_train_rating = rf_rating.predict(X_rating_train)

In [79]:
accuracy_train_rating = accuracy_score(Y_rating_train, predicciones_train_rating)
print(f'Exactitud en los datos de entrenamiento: {accuracy_train_rating:.2f}')

Exactitud en los datos de entrenamiento: 0.74


In [80]:
predicciones_val_rating = rf_rating.predict(X_rating_val)

In [81]:
accuracy_val_rating = accuracy_score(Y_rating_val, predicciones_val_rating)
print(f'Exactitud en los datos de validacion: {accuracy_val_rating:.2f}')

Exactitud en los datos de validacion: 0.52


### Entrenamiento clasificador para Hipotesis Autores

In [82]:
rf_autores = RandomForestClassifier(n_estimators = 150, max_depth = 12, class_weight = 'balanced')
start = time.time()
rf_autores.fit(X_authors_train, Y_authors_train)
end = time.time()
print(f"Tiempo de entrenamiento: {end-start} segundos")

Tiempo de entrenamiento: 2.418285608291626 segundos


In [83]:
predicciones_train_autores = rf_autores.predict(X_authors_train)

In [84]:
accuracy_train_autores = accuracy_score(Y_authors_train, predicciones_train_autores)
print(f'Exactitud en los datos de entrenamiento: {accuracy_train_autores:.2f}')

Exactitud en los datos de entrenamiento: 1.00


In [85]:
predicciones_val_autores = rf_autores.predict(X_authors_val)
accuracy_val_autores = accuracy_score(Y_authors_val, predicciones_val_autores)
print(f'Exactitud en los datos de validación: {accuracy_val_autores:.2f}')

Exactitud en los datos de validación: 1.00


In [86]:
rf_autores.feature_importances_

array([0.00110548, 0.04669177, 0.00172228, 0.05601922, 0.00236577,
       0.00185244, 0.04978145, 0.00648637, 0.00898969, 0.01266837,
       0.01669761, 0.0132499 , 0.0142815 , 0.0194795 , 0.0126358 ,
       0.01141303, 0.01278674, 0.00965413, 0.02185024, 0.01538853,
       0.0148576 , 0.00639272, 0.00649472, 0.00842776, 0.01055093,
       0.01386195, 0.00994845, 0.00994116, 0.0107663 , 0.01153327,
       0.01806721, 0.0114529 , 0.01313765, 0.01436406, 0.01540305,
       0.01429711, 0.0180577 , 0.00840255, 0.01734568, 0.01347877,
       0.0121847 , 0.01322592, 0.00896447, 0.01376156, 0.01208495,
       0.01539179, 0.0076895 , 0.01150387, 0.01747278, 0.01740476,
       0.01342139, 0.01038055, 0.01749786, 0.01454422, 0.00388403,
       0.01531882, 0.0078195 , 0.00836498, 0.01225163, 0.01177109,
       0.00713742, 0.00908766, 0.01362918, 0.01417152, 0.01167645,
       0.01190123, 0.00631064, 0.01228282, 0.00485491, 0.01229819,
       0.01286216, 0.00965074, 0.00959786, 0.00948085, 0.01021

### Clasificador MLP para Autores

In [87]:
from sklearn.neural_network import MLPClassifier, MLPRegressor

In [88]:
start = time.time()
mlp_autores = MLPClassifier(hidden_layer_sizes=(20,20,20), max_iter=400, activation='logistic', alpha=0.001,  random_state=17)
mlp_autores.fit(X_authors_train, Y_authors_train)
end = time.time()

print(f"Tiempo de entrenamiento: {end-start}")

Tiempo de entrenamiento: 39.23975491523743


In [89]:
predicciones_train_autores_mlp = mlp_autores.predict(X_authors_train)
accuracy_train_autores_mlp = accuracy_score(Y_authors_train, predicciones_train_autores_mlp)
print(f'Exactitud en los datos de entrenamiento: {accuracy_train_autores_mlp:.2f}')

Exactitud en los datos de entrenamiento: 1.00


In [90]:
from sklearn.preprocessing import LabelEncoder
df_cod_ord = df.copy()

# Lista de columnas a codificar
columnas_label = ['Title', 'authors', 'publisher', 'categories']

# Crear una instancia del codificador
label_encoder = LabelEncoder()

# Aplicar Label Encoding a cada columna
for columna in columnas_label:
    df_cod_ord[columna] = label_encoder.fit_transform(df_cod_ord[columna])

# Ver el DataFrame resultante
df_cod_ord.head()

df_cod_ord = df_cod_ord.drop(columns='Id')
correlacion_authors_ord =df_cod_ord.corrwith(df_cod_ord['authors'])
correlacion_authors_ord_df = pd.DataFrame(correlacion_authors_ord, columns=['Correlation']).sort_values('Correlation').dropna()
# Es necesario hacer la copia porque sino salta una excepción al momento de graficar
correlacion_authors_ord_df_abs = correlacion_authors_ord_df.copy()
correlacion_authors_ord_df_abs['Abs_Correlation'] =correlacion_authors_ord_df_abs['Correlation'].abs()
correlacion_authors_ord_df_abs = correlacion_authors_ord_df_abs.sort_values(by='Abs_Correlation', ascending=False)
correlacion_authors_ord_df_abs

Unnamed: 0,Correlation,Abs_Correlation
authors,1.0,1.0
Price,-0.280731,0.280731
conteo,0.252632,0.252632
Title,0.208383,0.208383
publishedDate,-0.16183,0.16183
publisher,-0.116467,0.116467
review/score,-0.108625,0.108625
review/text,-0.039979,0.039979
review_helpfulness_total,-0.037844,0.037844
categories,-0.035867,0.035867


### Entrenamiento para Hipotesis Precio

In [91]:
start = time.time()
mlp_precio = MLPRegressor(hidden_layer_sizes=(20,20,20), max_iter=400, activation='logistic', alpha=0.001,  random_state=17)
mlp_precio.fit(X_price_train, Y_price_train)
end = time.time()

print(f"Tiempo de entrenamiento: {end-start}")

Tiempo de entrenamiento: 27.457914352416992


In [92]:
from sklearn.metrics import mean_squared_error

In [93]:
predicciones_train_price = mlp_precio.predict(X_price_train)
mse_price = mean_squared_error(Y_price_train, predicciones_train_price)
print(f'Error cuadrático medio: {mse_price:.2f}')

Error cuadrático medio: 0.00
