In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("data/processed_anime.csv")
df

Unnamed: 0,ID,Score,Episodes,Duration_Minutes,Popularity,Rating_G - All Ages,Rating_None,Rating_PG - Children,Rating_PG-13 - Teens 13 or older,Rating_R - 17+ (violence & profanity),Rating_R+ - Mild Nudity,Rating_Rx - Hentai,Producers_closeness_centrality,Producers_betweenness_centrality,Producers_degree_centrality,Producers_clustering_coefficient,Themes_closeness_centrality,Themes_betweenness_centrality,Themes_degree_centrality,Themes_clustering_coefficient
0,1,8.751,26.0,24.0,43.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.396790,0.022269,0.142857,0.164474,0.689394,0.014594,0.540816,0.626018
1,5,8.381,1.0,115.0,572.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.352391,0.011360,0.077580,0.275047,0.689394,0.014594,0.540816,0.626018
2,6,8.221,26.0,24.0,243.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.349975,0.002874,0.038961,0.361529,0.742424,0.023783,0.653061,0.528226
3,7,7.261,26.0,25.0,1709.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.393878,0.022121,0.136535,0.199694,0.680556,0.008731,0.530612,0.689231
4,15,7.921,145.0,23.0,1181.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.384544,0.015058,0.111415,0.228647,0.612500,0.005015,0.367347,0.620915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5450,52335,6.071,1.0,3.0,17503.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000684,0.000000,0.000684,0.000000,0.703490,0.019996,0.530612,0.598516
5451,52573,6.711,1.0,47.0,11263.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.362430,0.004703,0.065619,0.332018,0.661203,0.014622,0.442177,0.563013
5452,52575,7.761,1.0,4.0,11543.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.830508,0.037795,0.795918,0.504723
5453,52638,5.081,1.0,3.0,17677.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.316282,0.000100,0.009569,0.703297,0.685282,0.017386,0.520408,0.646168


# Score Prediction

In [3]:
df.columns

Index(['ID', 'Score', 'Episodes', 'Duration_Minutes', 'Popularity',
       'Rating_G - All Ages', 'Rating_None', 'Rating_PG - Children',
       'Rating_PG-13 - Teens 13 or older',
       'Rating_R - 17+ (violence & profanity)', 'Rating_R+ - Mild Nudity',
       'Rating_Rx - Hentai', 'Producers_closeness_centrality',
       'Producers_betweenness_centrality', 'Producers_degree_centrality',
       'Producers_clustering_coefficient', 'Themes_closeness_centrality',
       'Themes_betweenness_centrality', 'Themes_degree_centrality',
       'Themes_clustering_coefficient'],
      dtype='object')

In [4]:
X = df.drop(columns=['ID', 'Score', 'Popularity', 
                     'Themes_closeness_centrality', 'Themes_betweenness_centrality', 'Themes_degree_centrality', 
                     'Themes_clustering_coefficient'])
y = df['Score']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
result = pd.DataFrame(columns=["model", "MAE"])

# linear
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
result.loc[len(result)] = ["Linear", mean_absolute_error(y_test, y_pred)]

# # random forest
# rf_reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# y_pred = rf_reg.predict(X_test)
# result.loc[len(result)] = ["Random Forest", mean_absolute_error(y_test, y_pred)]

# # gradient boost
# gb_reg = GradientBoostingRegressor().fit(X_train, y_train)
# y_pred = gb_reg.predict(X_test)
# result.loc[len(result)] = ["Gradient Boosting", mean_absolute_error(y_test, y_pred)]

# # xgboost
# xgb_reg = xgb.XGBRegressor().fit(X_train, y_train)
# y_pred = xgb_reg.predict(X_test)
# result.loc[len(result)] = ["XGBoost", mean_absolute_error(y_test, y_pred)]

# fine tuning
parameters = {'n_estimators':[1, 5, 10, 100], 'max_depth':[3, 5, None], "random_state": [0], "criterion": ["absolute_error"]}
rf_reg_tuned = GridSearchCV(RandomForestRegressor(), parameters).fit(X_train, y_train)
y_pred = rf_reg_tuned.predict(X_test)
result.loc[len(result)] = ["Random Forest (tuned)", mean_absolute_error(y_test, y_pred)]

parameters = {'learning_rate':[0.1, 1, 10], 'n_estimators':[1, 5, 10, 100], 'max_depth':[3, 5]}
gb_reg_tuned = GridSearchCV(GradientBoostingRegressor(), parameters).fit(X_train, y_train)
y_pred = gb_reg_tuned.predict(X_test)
result.loc[len(result)] = ["Gradient Boosting (tuned)", mean_absolute_error(y_test, y_pred)]

parameters = {"n_estimators": [50, 100, 150], "random_state": [0]}
xgb_reg_tuned = GridSearchCV(xgb.XGBRegressor(), parameters).fit(X_train,y_train)
y_pred = xgb_reg_tuned.predict(X_test)
result.loc[len(result)] = ["XGBoost (tuned)", mean_absolute_error(y_test, y_pred)]

result.sort_values(by=["MAE"])

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Unnamed: 0,model,MAE
1,Random Forest (tuned),0.516168
2,Gradient Boosting (tuned),0.524374
3,XGBoost (tuned),0.525812
0,Linear,0.605234


In [6]:
print(rf_reg_tuned.best_params_)
print(gb_reg_tuned.best_params_)
print(xgb_reg_tuned.best_params_)

{'criterion': 'absolute_error', 'max_depth': None, 'n_estimators': 100, 'random_state': 0}
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
{'n_estimators': 50, 'random_state': 0}


# Popularity Prediction

In [7]:
X = df.drop(columns=['ID', 'Score', 'Popularity', 
                     'Themes_closeness_centrality', 'Themes_betweenness_centrality', 'Themes_degree_centrality', 
                     'Themes_clustering_coefficient'])
y = df['Popularity']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
result = pd.DataFrame(columns=["model", "r2"])

# linear
lin_reg = LinearRegression().fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)
result.loc[len(result)] = ["Linear", r2_score(y_test, y_pred)]

# # random forest
# rf_reg = RandomForestRegressor(random_state=0).fit(X_train, y_train)
# y_pred = rf_reg.predict(X_test)
# result.loc[len(result)] = ["Random Forest", r2_score(y_test, y_pred)]

# # gradient boost
# gb_reg = GradientBoostingRegressor().fit(X_train, y_train)
# y_pred = gb_reg.predict(X_test)
# result.loc[len(result)] = ["Gradient Boosting", r2_score(y_test, y_pred)]

# # xgboost
# xgb_reg = xgb.XGBRegressor().fit(X_train, y_train)
# y_pred = xgb_reg.predict(X_test)
# result.loc[len(result)] = ["XGBoost", r2_score(y_test, y_pred)]

# fine tuning
parameters = {'n_estimators':[1, 5, 10, 100], 'max_depth':[3, 5, None], "random_state": [0], "criterion": ["absolute_error"]}
rf_reg_tuned = GridSearchCV(RandomForestRegressor(), parameters, scoring='r2').fit(X_train, y_train)
y_pred = rf_reg_tuned.predict(X_test)
result.loc[len(result)] = ["Random Forest (tuned)", r2_score(y_test, y_pred)]

parameters = {'learning_rate':[0.1, 1, 10], 'n_estimators':[1, 5, 10, 100], 'max_depth':[3, 5]}
gb_reg_tuned = GridSearchCV(GradientBoostingRegressor(), parameters, scoring='r2').fit(X_train, y_train)
y_pred = gb_reg_tuned.predict(X_test)
result.loc[len(result)] = ["Gradient Boosting (tuned)", r2_score(y_test, y_pred)]

parameters = {"n_estimators": [50, 100, 150], "random_state": [0]}
xgb_reg_tuned = GridSearchCV(xgb.XGBRegressor(), parameters, scoring='r2').fit(X_train,y_train)
y_pred = xgb_reg_tuned.predict(X_test)
result.loc[len(result)] = ["XGBoost (tuned)", r2_score(y_test, y_pred)]

result.sort_values(by=["r2"], ascending=False)

  (array - array_means[:, np.newaxis]) ** 2, axis=1, weights=weights


Unnamed: 0,model,r2
2,Gradient Boosting (tuned),0.653727
3,XGBoost (tuned),0.651252
1,Random Forest (tuned),0.645999
0,Linear,0.477341


In [9]:
print(rf_reg_tuned.best_params_)
print(gb_reg_tuned.best_params_)
print(xgb_reg_tuned.best_params_)

{'criterion': 'absolute_error', 'max_depth': None, 'n_estimators': 100, 'random_state': 0}
{'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100}
{'n_estimators': 50, 'random_state': 0}
