In [54]:
# data analysis and wrangling
import pandas as pd
import numpy as np
import random as rnd

# visualization
import seaborn as sns
from scipy.stats import norm, skew
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px

# machine learning
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

## scikit modeling libraries
from sklearn.ensemble import (RandomForestClassifier, AdaBoostClassifier,
                             GradientBoostingClassifier, ExtraTreesClassifier,
                             VotingClassifier)

from sklearn.model_selection import (GridSearchCV, cross_val_score, cross_val_predict,
                                     StratifiedKFold, learning_curve)

## Load metrics for predictive modeling
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, r2_score
from sklearn.metrics import roc_curve, auc

# Preprocessing
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.preprocessing import LabelEncoder

## Warnings and other tools
import itertools
import warnings
warnings.filterwarnings("ignore")

# Funciones
# from utils.funciones import rmsle_cv

In [9]:
#Cargar el archivo csv
path = 'C:/THE BRIDGE_GIT/Machine_Learning/anime_score_detection_model_regresion/src/'
train = pd.read_csv(path + "data/processed/anime.csv")

In [10]:
train.head(2)

Unnamed: 0.1,Unnamed: 0,Type,Episodes,Rating,Start_season,Score
0,0,TV,26,R - 17+ (violence & profanity),Spring,8.78
1,1,Movie,1,R - 17+ (violence & profanity),Summer,8.39


In [11]:
train.drop(['Unnamed: 0'] , axis = 1 , inplace = True)

### Otras variables categóricas

In [12]:
categorical_feature = (train.dtypes == "category") | (train.dtypes == object)
categorical_cols = train.columns[categorical_feature].tolist()

In [13]:
categorical_cols

['Type', 'Rating', 'Start_season']

In [14]:
train.dtypes

Type             object
Episodes          int64
Rating           object
Start_season     object
Score           float64
dtype: object

In [15]:
# Para convertir las categóricas en binarias debemos pasarla primero a booleanas
# evaluando si son del tipo object o category
categorical_feature = (train.dtypes == "category") | (train.dtypes == object)
categorical_feature

Type             True
Episodes        False
Rating           True
Start_season     True
Score           False
dtype: bool

In [16]:
categorical_cols = train.columns[categorical_feature].tolist()
categorical_cols

['Type', 'Rating', 'Start_season']

In [21]:
for c in categorical_cols:
    lbl = LabelEncoder() 
    lbl.fit(list(train[c].values)) 
    train[c] = lbl.transform(list(train[c].values))


In [22]:
train

Unnamed: 0,Type,Episodes,Rating,Start_season,Score
0,5,26,3,1,8.780000
1,0,1,3,2,8.390000
2,5,26,2,1,8.240000
3,5,26,2,2,7.270000
4,5,52,1,0,6.980000
...,...,...,...,...,...
9013,2,40,2,3,6.515152
9014,1,1,1,3,6.410000
9015,1,1,0,3,7.520000
9016,4,1,3,3,4.810000


In [24]:
train.dtypes

Type              int64
Episodes          int64
Rating            int64
Start_season      int64
Score           float64
dtype: object

In [25]:
train['Rating'].unique()

array([3, 2, 1, 4, 0, 5], dtype=int64)

In [26]:
train['Episodes'].max()

140

## Seleccionar Variables

In [27]:
# X = train_4.drop(['Score','anime_id'], axis=1)
X = train.drop(['Score'], axis=1)
y = train['Score']

## Entrenar el modelo

In [28]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
import xgboost as xgb
import lightgbm as lgb

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 42)

In [30]:
## print informacion
print("X_train shape:", X_train.shape, "| X_test shape:", X_test.shape)
print("y_train mean:", round(np.mean(y_train),2), "| y_test mean:", round(np.mean(y_test),2))

X_train shape: (7214, 4) | X_test shape: (1804, 4)
y_train mean: 6.74 | y_test mean: 6.74


In [31]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

**Linear Reggression**

In [32]:
#Linear Regression
lm = LinearRegression()

**Lasso Regression**

In [33]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

**Elastic Net Regression**

In [34]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

**Kernel Ridge Regression**

In [35]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

**Gradient Boosting Regression**

In [36]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

**DecisionTreeRegressor**

In [37]:
Tree = DecisionTreeRegressor()

**RandomForestRegressor**

In [38]:
rf_model = RandomForestRegressor()

**SVR**

In [39]:
SVR_model = SVR()

# Ajuste del mejor modelo y obtención de la matriz de predicciones

In [40]:
lm.fit(X_train.values,y_train.values)
linear_pred = lm.predict(X_test.values)

score = rmsle_cv(lm)
print("\nLinear score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Linear score: 0.7950 (0.0250)



In [41]:
lasso.fit(X_train.values,y_train.values) #
lasso_pred = lasso.predict(X_test.values)

score = rmsle_cv(lasso)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


Lasso score: 0.7950 (0.0250)



In [42]:
ENet.fit(X_train.values,y_train.values) #
ENet_pred = ENet.predict(X_test.values)

score = rmsle_cv(ENet)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.7950 (0.0250)



In [43]:
KRR.fit(X_train.values,y_train.values) #
KRR_pred = KRR.predict(X_test.values)

score = rmsle_cv(KRR)
print("Kernel Ridge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Kernel Ridge score: 0.7543 (0.0222)



In [44]:
GBoost.fit(X_train.values,y_train.values) #
GBoost_pred = GBoost.predict(X_test.values)

score = rmsle_cv(GBoost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting score: 0.7423 (0.0200)



In [45]:
Tree.fit(X_train.values,y_train.values)
tree_pred = Tree.predict(X_test.values)

score = rmsle_cv(Tree)
print("\nDecisionTree score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


DecisionTree score: 0.7925 (0.0165)



In [46]:
rf_model.fit(X_train.values,y_train.values)
rf_pred = rf_model.predict(X_test.values)

score = rmsle_cv(rf_model)
print("\nRandomForest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


RandomForest score: 0.7623 (0.0205)



In [47]:
SVR_model.fit(X_train.values,y_train.values)
SVR_pred = SVR_model.predict(X_test.values)

score = rmsle_cv(SVR_model)
print("\nSVR score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


SVR score: 0.7629 (0.0238)



### Métricas del modelo

In [48]:
from sklearn.metrics import mean_absolute_error as mae, mean_absolute_percentage_error as mape
import math 

In [49]:
mae_lr = mae(y_test, linear_pred)
mae_lasso = mae(y_test, lasso_pred)
mae_ENet = mae(y_test, ENet_pred)
mae_KRR = mae(y_test, KRR_pred)
mae_GBoost = mae(y_test, GBoost_pred)
mae_tree = mae(y_test, tree_pred)
mae_rf = mae(y_test, rf_pred)
mae_SRV = mae(y_test, SVR_pred)

In [50]:
mape_lr = mape(y_test, linear_pred) * 100
mape_lasso = mape(y_test, lasso_pred) * 100
mape_ENet = mape(y_test, ENet_pred) * 100
mape_KRR = mape(y_test, KRR_pred) * 100
mape_GBoost = mape(y_test, GBoost_pred) * 100
mape_tree = mape(y_test, tree_pred) * 100
mape_rf = mape(y_test, rf_pred) * 100
mape_SRV = mape(y_test, SVR_pred) * 100

In [51]:
modelCol = ['Linear', 'Lasso', 'ENet', 
            'KRR','Gboost','DecisionTree','RandomForest','SRV']
maeCol = [mae_lr, mae_lasso, mae_ENet, mae_KRR, mae_GBoost, mae_tree, mae_rf, mae_SRV]
mapeCol = [mape_lr, mape_lasso, mape_ENet, mape_KRR, mape_GBoost, mape_tree, mape_rf, mape_SRV]
models = pd.DataFrame({
    'Model': modelCol,
    'MAE': maeCol,
    'MAPE(%)':mapeCol
    })
models

Unnamed: 0,Model,MAE,MAPE(%)
0,Linear,0.645885,10.004634
1,Lasso,0.64586,10.00431
2,ENet,0.645862,10.004329
3,KRR,0.60871,9.459357
4,Gboost,0.612712,9.535679
5,DecisionTree,0.65302,10.132954
6,RandomForest,0.633622,9.837793
7,SRV,0.619097,9.629009


In [53]:
# MSE = np.square(np.subtract(y_test,rf_pred)).mean() 
mse = mean_squared_error(y_test, rf_pred)
 
RMSE = math.sqrt(mse)
print("Mean Square Error:\n")
print(mse)
print("Root Mean Square Error:\n")
print(RMSE)

Mean Square Error:

0.6436944376082759
Root Mean Square Error:

0.802305700844931


In [60]:
# Medición de ajuste en datos de entrenamiento
r2_score(y_train.values,rf_pred)

AttributeError: 'numpy.ndarray' object has no attribute 'values'