In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
import plotly as ply
import sklearn as skl
import datetime as dt 
import math

from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor 
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPClassifier

In [2]:
# load data 
df = pd.read_csv("books.csv")

# data processing

In [3]:
# num_pages space canceled
df = df.rename(columns= {"  num_pages": "num_pages"})

In [8]:
# English dummies
df['english'] = np.where(df['language_code'].str[0:2] == 'en', 1, 0)

In [9]:
cols_adjust = df.columns.tolist()
cols_adjust

['bookID',
 'title',
 'authors',
 'average_rating',
 'isbn',
 'isbn13',
 'language_code',
 'num_pages',
 'ratings_count',
 'text_reviews_count',
 'publication_date',
 'publisher',
 'english']

In [10]:
df.head(2)

Unnamed: 0,bookID,title,authors,average_rating,isbn,isbn13,language_code,num_pages,ratings_count,text_reviews_count,publication_date,publisher,english
0,1,Harry Potter and the Half-Blood Prince (Harry ...,J.K. Rowling/Mary GrandPré,4.57,439785960,9780439785969,eng,652,2095690,27591,9/16/2006,Scholastic Inc.,1
1,2,Harry Potter and the Order of the Phoenix (Har...,J.K. Rowling/Mary GrandPré,4.49,439358078,9780439358071,eng,870,2153167,29221,9/1/2004,Scholastic Inc.,1


# setting up R-squared and RMSE indices

In [22]:
def model_cross_val(models={}, X_test=None, y_test=None, cv=None, scoring=()):
    scoring_method = ('r2', 'neg_root_mean_squared_error')

    table = {"r2": [], "neg_root_mean_squared_error": []}

    for mod in models:
        for sco in scoring_method:
            score = cross_val_score(models[mod], X_test, y_test, cv=cv, scoring=sco).mean()
            if sco == 'r2':
                table[sco].append(score)
            else:
                table[sco].append(abs(score))

    print(
        pd.DataFrame(data = table, index = models, columns=table.keys())
        .sort_values(by=list(table.keys())[0], ascending=False)
    )

# setting up model selecting generator function

In [29]:
def model_selecting_generator(df=None):
    train0, test = train_test_split(df, test_size=20, random_state=42)

    # split train0 in train and valid
    train, valid = train_test_split(train0, test_size=40, random_state=42)

    # subsetting
    # train0
    X_train0 = train0.drop("average_rating", axis = 1)
    y_train0 = train0.average_rating

    # train
    X_train = train.drop("average_rating", axis = 1)
    y_train = train.average_rating

    # valid
    X_valid = valid.drop("average_rating", axis = 1)
    y_valid = valid.average_rating

    # test
    X_test = test.drop("average_rating", axis = 1)
    y_test = test.average_rating


    ## model selection process
    # Linear Regression
    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)

    # ridge model
    reg_ridge = linear_model.Ridge(alpha=0.01)
    reg_ridge.fit(X_train,y_train)

    # ridge CV
    reg_ridge_CV = linear_model.RidgeCV(alphas=[0.001,0.01,0.1,1,10],cv=5)
    reg_ridge_CV.fit(X_train0,y_train0)

    # lasso model
    reg_lasso = linear_model.Lasso(alpha=0.5)
    reg_lasso.fit(X_train,y_train)

    # random forest
    rf = RandomForestRegressor(n_estimators = 100, random_state = 42)
    rf.fit(X_train, y_train)

    ## R2 + RMSE table
    models = {
        'linear reg': reg, 
        'ridge model': reg_ridge,
        'lasso model': reg_lasso,
        'ridge CV': reg_ridge_CV,
        'random forest': rf
    }

    return model_cross_val(models, X_test, y_test, cv=10)

# index of data frames - part 2

In [40]:
df2_0 = df[['average_rating', 'num_pages', 'text_reviews_count', 'ratings_count']]
df2_1 = df[['average_rating', 'num_pages', 'text_reviews_count']]
df2_2 = df[['average_rating', 'num_pages', 'ratings_count']]
df2_3 = df[['average_rating', 'ratings_count', 'text_reviews_count']]
df2_4 = df[['average_rating', 'num_pages']]
df2_5 = df[['average_rating', 'text_reviews_count']]
df2_6 = df[['average_rating', 'ratings_count']]

df2_10 = df[['average_rating', 'num_pages', 'text_reviews_count', 'ratings_count', 'english']]
df2_11 = df[['average_rating', 'num_pages', 'text_reviews_count', 'english']]
df2_12 = df[['average_rating', 'ratings_count', 'num_pages', 'english']]
df2_13 = df[['average_rating', 'ratings_count', 'text_reviews_count', 'english']]
df2_14 = df[['average_rating', 'num_pages', 'english']]
df2_15 = df[['average_rating', 'text_reviews_count', 'english']]
df2_16 = df[['average_rating', 'ratings_count', 'english']]

In [49]:
model_selecting_generator(df2_0) # 'num_pages', 'text_reviews_count', 'ratings_count'

                     r2  neg_root_mean_squared_error
lasso model   -1.368071                     0.169600
ridge CV      -2.063746                     0.181474
ridge model   -2.066523                     0.181517
linear reg    -2.066526                     0.181517
random forest -4.580882                     0.184104


In [39]:
model_selecting_generator(df2_1) # 'num_pages', 'text_reviews_count'

                     r2  neg_root_mean_squared_error
random forest -1.610271                     0.166806
lasso model   -2.502406                     0.180121
ridge CV      -2.591104                     0.181346
ridge model   -2.591114                     0.181346
linear reg    -2.591114                     0.181346


In [31]:
model_selecting_generator(df2_2) # 'num_pages', 'ratings_count'

                     r2  neg_root_mean_squared_error
linear reg    -2.613689                     0.180322
ridge model   -2.613689                     0.180322
ridge CV      -2.613691                     0.180322
lasso model   -2.622429                     0.180983
random forest -6.087708                     0.192125


In [32]:
model_selecting_generator(df2_3) # 'ratings_count', 'text_reviews_count'

                     r2  neg_root_mean_squared_error
lasso model   -1.327372                     0.172451
ridge CV      -2.269475                     0.185311
ridge model   -2.273257                     0.185362
linear reg    -2.273261                     0.185363
random forest -8.370762                     0.216634


In [33]:
model_selecting_generator(df2_4) # 'num_pages'

                     r2  neg_root_mean_squared_error
lasso model   -1.902108                     0.168400
ridge CV      -1.924998                     0.168308
ridge model   -1.925007                     0.168308
linear reg    -1.925007                     0.168308
random forest -5.498188                     0.210366


In [34]:
model_selecting_generator(df2_5) # 'text_reviews_count'

                     r2  neg_root_mean_squared_error
lasso model   -3.106730                     0.199547
ridge CV      -3.231326                     0.201672
ridge model   -3.231354                     0.201672
linear reg    -3.231354                     0.201672
random forest -7.967233                     0.211748


In [35]:
model_selecting_generator(df2_6) # 'ratings_count'

                     r2  neg_root_mean_squared_error
lasso model   -3.055845                     0.196249
ridge CV      -3.059901                     0.196359
ridge model   -3.059901                     0.196359
linear reg    -3.059901                     0.196359
random forest -5.305446                     0.211069


In [41]:
model_selecting_generator(df2_10) # 'num_pages', 'text_reviews_count', 'ratings_count', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -1.368071                     0.169600
ridge model   -3.422697                     0.178487
linear reg    -3.434876                     0.178518
ridge CV      -3.482528                     0.179340
random forest -4.609585                     0.180090


In [42]:
model_selecting_generator(df2_11) # 'num_pages', 'text_reviews_count', 'english'

                     r2  neg_root_mean_squared_error
random forest -2.259819                     0.172226
lasso model   -2.502406                     0.180121
ridge CV      -2.888840                     0.175594
ridge model   -4.392116                     0.178037
linear reg    -4.405216                     0.178066


In [43]:
model_selecting_generator(df2_12) # 'ratings_count', 'num_pages', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -2.622480                     0.180983
ridge CV      -2.966749                     0.179707
ridge model   -4.242889                     0.176619
linear reg    -4.255240                     0.176644
random forest -5.930726                     0.184772


In [44]:
model_selecting_generator(df2_13) # 'ratings_count', 'text_reviews_count', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -1.327372                     0.172451
ridge model   -3.005581                     0.177017
linear reg    -3.013765                     0.177008
ridge CV      -3.366750                     0.183778
random forest -8.076784                     0.214455


In [45]:
model_selecting_generator(df2_14) # 'num_pages', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -1.902108                     0.168400
ridge CV      -2.056476                     0.165841
ridge model   -3.337234                     0.162727
linear reg    -3.348148                     0.162740
random forest -6.381610                     0.194501


In [46]:
model_selecting_generator(df2_15) # 'text_reviews_count', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -3.106730                     0.199547
ridge CV      -3.362473                     0.194362
ridge model   -4.674702                     0.193726
linear reg    -4.685481                     0.193729
random forest -8.243794                     0.215569


In [48]:
model_selecting_generator(df2_16) # 'ratings_count', 'english'

                     r2  neg_root_mean_squared_error
lasso model   -3.055845                     0.196249
ridge CV      -3.212549                     0.192624
ridge model   -4.285479                     0.186940
linear reg    -4.295301                     0.186936
random forest -5.300748                     0.206228
