In [152]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
from scipy.stats import norm, skew
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px

# machine learning
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

## scikit modeling libraries
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             GradientBoostingClassifier, ExtraTreesClassifier,
                             VotingClassifier)

from sklearn.model_selection import (GridSearchCV, cross_val_score, cross_val_predict,
                                     StratifiedKFold, learning_curve)

## Load metrics for predictive modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc

# Preprocessing
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import LabelEncoder

## Warnings and other tools
import itertools
import warnings
warnings.filterwarnings("ignore")

# Funciones
# from utils.funciones import rmsle_cv

In [153]:
#Cargar el archivo csv
path = 'C:/THE BRIDGE_GIT/Machine_Learning/anime_score_detection_model_regresion/src/'
train = pd.read_csv(path + "data/processed/anime.csv")

In [154]:
train.rename(columns={'Unnamed: 0':'anime_id'}, inplace=True)

In [155]:
train.head(2)

Unnamed: 0,anime_id,MAL_ID,Genres,Type,Episodes,Studios,Rating,Start_season,Score,range_episodes
0,0,1,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Sunrise,R - 17+ (violence & profanity),Spring,8.78,[25 - 37]
1,1,5,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,Bones,R - 17+ (violence & profanity),Summer,8.39,[1 - 13]


In [156]:
train['Start_season'].unique()

array(['Spring', 'Summer', 'Fall', 'Winter'], dtype=object)

### Genres

In [157]:
train['anime_ids'] = train.index

In [158]:
cleaning = train["Genres"].str.split(", ",12, expand=True)
cleaning.columns = ['Genre_0', 'Genre_1','Genre_2', 'Genre_3','Genre_4', 'Genre_5','Genre_6', 'Genre_7','Genre_8', 'Genre_9', 'Genre_10', 'Genre_11', 'Genre_12']
train_1 = pd.concat([train['anime_id'], cleaning], axis=1)

In [159]:
train_unpivoted = train_1.melt(id_vars=['anime_id'], var_name='Type_Genre', value_name='Genre')
train_unpivoted=train_unpivoted.assign(Value=1)
#Eliminar columnas que no se usarán para el análisis
train_unpivoted.drop(['Type_Genre'] , axis = 1 , inplace = True)
train_unpivoted.head(3)

Unnamed: 0,anime_id,Genre,Value
0,0,Action,1
1,1,Action,1
2,2,Action,1


In [160]:
train_pivoted = train_unpivoted.pivot_table(index=['anime_id'],columns=['Genre'],aggfunc='count',fill_value=0)
train_pivoted.columns = train_pivoted.columns.droplevel(0) #remove amount
train_pivoted = train_pivoted.reset_index().rename_axis(None, axis=1)
train_pivoted.head(3)

Unnamed: 0,anime_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,1,1,0,1,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
2,2,1,1,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [161]:
train_2 = pd.merge(train, train_pivoted, on='anime_id', how='outer')
#Eliminar columnas que no se usarán para el análisis
train_2.drop(['Genres'] , axis = 1 , inplace = True)
train_2.head(2)

Unnamed: 0,anime_id,MAL_ID,Type,Episodes,Studios,Rating,Start_season,Score,range_episodes,anime_ids,...,Shounen Ai,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0,1,TV,26,Sunrise,R - 17+ (violence & profanity),Spring,8.78,[25 - 37],0,...,0,0,1,0,0,0,0,0,0,0
1,1,5,Movie,1,Bones,R - 17+ (violence & profanity),Summer,8.39,[1 - 13],1,...,0,0,1,0,0,0,0,0,0,0


### Studios

In [162]:
cleaning = train["Studios"].str.split(", ",6, expand=True)
cleaning.columns = ['Studio_0', 'Studio_1','Studio_2', 'Studio_3','Studio_4', 'Studio_5','Studio_6']
train_1 = pd.concat([train['anime_id'], cleaning], axis=1)

In [163]:
train_unpivoted = train_1.melt(id_vars=['anime_id'], var_name='Type_Studios', value_name='Studio')
train_unpivoted=train_unpivoted.assign(Value=1)
#Eliminar columnas que no se usarán para el análisis
train_unpivoted.drop(['Type_Studios'] , axis = 1 , inplace = True)
train_unpivoted.head(3)

Unnamed: 0,anime_id,Studio,Value
0,0,Sunrise,1
1,1,Bones,1
2,2,Madhouse,1


In [164]:
train_pivoted = train_unpivoted.pivot_table(index=['anime_id'],columns=['Studio'],aggfunc='count',fill_value=0)
train_pivoted.columns = train_pivoted.columns.droplevel(0) #remove amount
train_pivoted = train_pivoted.reset_index().rename_axis(None, axis=1)
train_pivoted.head(3)

Unnamed: 0,anime_id,10Gauge,2:10 AM Animation,3xCube,81 Produce,8bit,A-1 Pictures,A-Real,A.C.G.T.,ACC Production,...,feel.,helo.inc,iDRAGONS Creative Studio,ixtl,l-a-unch・BOX,monofilmo,pH Studio,production doA,teamKG,ufotable
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [165]:
train_3 = pd.merge(train_2, train_pivoted, on='anime_id', how='outer')
#Eliminar columnas que no se usarán para el análisis
train_3.drop(['Studios'] , axis = 1 , inplace = True)
train_3.head(2)

Unnamed: 0,anime_id,MAL_ID,Type,Episodes,Rating,Start_season,Score,range_episodes,anime_ids,Action,...,feel.,helo.inc,iDRAGONS Creative Studio,ixtl,l-a-unch・BOX,monofilmo,pH Studio,production doA,teamKG,ufotable
0,0,1,TV,26,R - 17+ (violence & profanity),Spring,8.78,[25 - 37],0,1,...,0,0,0,0,0,0,0,0,0,0
1,1,5,Movie,1,R - 17+ (violence & profanity),Summer,8.39,[1 - 13],1,1,...,0,0,0,0,0,0,0,0,0,0


### Otras variables categóricas

In [166]:
train_3 = train_3[['Start_season','Type','Episodes','Rating','Score']]
train_3.head(2)

Unnamed: 0,Start_season,Type,Episodes,Rating,Score
0,Spring,TV,26,R - 17+ (violence & profanity),8.78
1,Summer,Movie,1,R - 17+ (violence & profanity),8.39


In [167]:
train_3.dtypes

Start_season     object
Type             object
Episodes          int64
Rating           object
Score           float64
dtype: object

In [168]:
# Para convertir las categóricas en binarias debemos pasarla primero a booleanas
# evaluando si son del tipo object o category
categorical_feature = (train_3.dtypes == "category") | (train_3.dtypes == object)
categorical_feature

Start_season     True
Type             True
Episodes        False
Rating           True
Score           False
dtype: bool

In [169]:
categorical_cols = train_3.columns[categorical_feature].tolist()
categorical_cols

['Start_season', 'Type', 'Rating']

In [170]:
for c in categorical_cols:
    lbl = LabelEncoder() 
    lbl.fit(list(train_3[c].values)) 
    train_3[c] = lbl.transform(list(train_3[c].values))


In [171]:
# # Ver un regisro
# is_male = train_3.loc[:, 'anime_id'] == 1
# df_male = train_3.loc[is_male]
# df_male.head()

In [172]:
train_3.dtypes

Start_season      int32
Type              int32
Episodes          int64
Rating            int32
Score           float64
dtype: object

In [173]:
train_3['Rating'].unique()

array([3, 2, 1, 4, 0, 5])

In [174]:
train_3['Episodes'].max()

373

## Seleccionar Variables

In [175]:
# X = train_4.drop(['Score','anime_id'], axis=1)
X = train_3.drop(['Score'], axis=1)
y = train_3['Score']

## Entrenar el modelo

In [176]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [178]:
## print informacion
print("X_train shape:", X_train.shape, "| X_test shape:", X_test.shape)
print("y_train mean:", round(np.mean(y_train),2), "| y_test mean:", round(np.mean(y_test),2))

X_train shape: (7260, 4) | X_test shape: (1816, 4)
y_train mean: 6.74 | y_test mean: 6.73


In [179]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

**Linear Reggression**

In [180]:
#Linear Regression
lm = LinearRegression()

**Lasso Regression**

In [181]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

**Elastic Net Regression**

In [182]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

**Kernel Ridge Regression**

In [183]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

**Gradient Boosting Regression**

In [184]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

**DecisionTreeRegressor**

In [185]:
Tree = DecisionTreeRegressor()

**RandomForestRegressor**

In [186]:
rf_model = RandomForestRegressor()

**SVR**

In [187]:
SVR_model = SVR()

# Ajuste del mejor modelo y obtención de la matriz de predicciones

In [188]:
lm.fit(X_train.values,y_train.values)
linear_pred = lm.predict(X_test.values)

score = rmsle_cv(lm)
print("\nLinear score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Linear score: 0.8067 (0.0133)



In [189]:
lasso.fit(X_train.values,y_train.values) #
lasso_pred = lasso.predict(X_test.values)

score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.8067 (0.0133)



In [190]:
ENet.fit(X_train.values,y_train.values) #
ENet_pred = ENet.predict(X_test.values)

score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.8067 (0.0133)



In [191]:
KRR.fit(X_train.values,y_train.values) #
KRR_pred = KRR.predict(X_test.values)

score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.7653 (0.0121)



In [192]:
GBoost.fit(X_train.values,y_train.values) #
GBoost_pred = GBoost.predict(X_test.values)

score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.7584 (0.0118)



In [193]:
Tree.fit(X_train.values,y_train.values)
tree_pred = Tree.predict(X_test.values)

score = rmsle_cv(Tree)
print("\nDecisionTree score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


DecisionTree score: 0.8083 (0.0090)



In [194]:
rf_model.fit(X_train.values,y_train.values)
rf_pred = rf_model.predict(X_test.values)

score = rmsle_cv(rf_model)
print("\nRandomForest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


RandomForest score: 0.7785 (0.0106)



In [195]:
SVR_model.fit(X_train.values,y_train.values)
SVR_pred = SVR_model.predict(X_test.values)

score = rmsle_cv(SVR_model)
print("\nSVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


SVR score: 0.7809 (0.0099)



### Métricas del modelo

In [196]:
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape
import math 

In [197]:
mae_lr = mae(y_test, linear_pred)
mae_lasso = mae(y_test, lasso_pred)
mae_ENet = mae(y_test, ENet_pred)
mae_KRR = mae(y_test, KRR_pred)
mae_GBoost = mae(y_test, GBoost_pred)
mae_tree = mae(y_test, tree_pred)
mae_rf = mae(y_test, rf_pred)
mae_SRV = mae(y_test, SVR_pred)

In [198]:
mape_lr = mape(y_test, linear_pred) * 100
mape_lasso = mape(y_test, lasso_pred) * 100
mape_ENet = mape(y_test, ENet_pred) * 100
mape_KRR = mape(y_test, KRR_pred) * 100
mape_GBoost = mape(y_test, GBoost_pred) * 100
mape_tree = mape(y_test, tree_pred) * 100
mape_rf = mape(y_test, rf_pred) * 100
mape_SRV = mape(y_test, SVR_pred) * 100

In [199]:
modelCol = ['Linear', 'Lasso', 'ENet', 
            'KRR','Gboost','DecisionTree','RandomForest','SRV']
maeCol = [mae_lr, mae_lasso, mae_ENet, mae_KRR, mae_GBoost, mae_tree, mae_rf, mae_SRV]
mapeCol = [mape_lr, mape_lasso, mape_ENet, mape_KRR, mape_GBoost, mape_tree, mape_rf, mape_SRV]
models = pd.DataFrame({
    'Model': modelCol,
    'MAE': maeCol,
    'MAPE(%)':mapeCol
    })
models

Unnamed: 0,Model,MAE,MAPE(%)
0,Linear,0.63302,9.701335
1,Lasso,0.633038,9.70162
2,ENet,0.633036,9.701592
3,KRR,0.585321,8.965476
4,Gboost,0.575527,8.826529
5,DecisionTree,0.618173,9.434708
6,RandomForest,0.597399,9.134648
7,SRV,0.607269,9.33277


In [200]:
MSE = np.square(np.subtract(y_test,rf_pred)).mean() 
 
RMSE = math.sqrt(MSE)
print("Root Mean Square Error:\n")
print(RMSE)

Root Mean Square Error:

0.7471357451480853
