In [4]:
# Load the Kedro IPython extension, then run its %reload_kedro magic.
# NOTE(review): presumably this is what injects the `catalog` variable used below — confirm.
%load_ext kedro.ipython
%reload_kedro
# List the entry names registered in the catalog (datasets and `params:` entries).
catalog.list()


[1m[[0m
    [32m'books_raw'[0m,
    [32m'books_loaded'[0m,
    [32m'publisher_consolidation'[0m,
    [32m'price_by_isbn_input'[0m,
    [32m'price_by_isbn'[0m,
    [32m'book_genres'[0m,
    [32m'title_embeddings_original'[0m,
    [32m'authors_embeddings'[0m,
    [32m'description_embeddings'[0m,
    [32m'open_library_book_api_info_input'[0m,
    [32m'open_library_book_api_info'[0m,
    [32m'filtered_books'[0m,
    [32m'exclusions_summary'[0m,
    [32m'scope_waterfall_plot'[0m,
    [32m'books_features'[0m,
    [32m'feature_cutoffs'[0m,
    [32m'factor_lumper'[0m,
    [32m'model_train'[0m,
    [32m'model_valid'[0m,
    [32m'model_test'[0m,
    [32m'flaml_model_trained'[0m,
    [32m'flaml_model_test_results'[0m,
    [32m'flaml_regressor_scatterplot'[0m,
    [32m'all_model_results'[0m,
    [32m'parameters'[0m,
    [32m'params:title_column'[0m,
    [32m'params:author_column'[0m,
    [32m'params:description_column'[0m,
    [32m'params

In [5]:
# Import libraries
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import BayesianRidge, Ridge,ElasticNet
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, RepeatedKFold
import scipy 
from scipy.stats import uniform, loguniform



# Purpose of the notebook

## Testing models and tuning hyperparameters

![image.png](attachment:image.png)

# DATA MANIPULATION

In [6]:
# Pull the three modelling splits from the catalog (loaded in the order train, test, valid)
train_data, test_data, valid_data = [
    catalog.load(dataset_name)
    for dataset_name in ("model_train", "model_test", "model_valid")
]

In [7]:
# Display the training frame as the cell output to inspect its columns and rows.
train_data

Unnamed: 0,title,isbn,most_common_author,publisher,language_code,Description,genre,AuthorFameLevel,book_count_category,publisher_book_count_category,...,ratings_count,text_reviews_count,Price,DescriptionISBN,isbn13_y,cluster,UMAP1,UMAP2,text_reviews_percentage,stratify_col
0,other,other,other,other,eng,other,non-fiction,4 - popular,3 - many books written,3 - many books published,...,2717,232,20.500000,9780674639270,9780674639270,3,10.773099,7.143039,0.085388,True
1,other,other,other,other,eng,other,"fantasy, paranormal",5 - semi-famous,3 - many books written,3 - many books published,...,26237,240,9.990000,9780142501528,9780142501528,6,11.991917,-0.779016,0.009147,True
2,other,other,other,pocket books,eng,other,"mystery, thriller, crime",5 - semi-famous,1 - few books written,3 - many books published,...,1353,53,7.990000,9780671793562,9780671793562,7,8.704370,8.494351,0.039172,True
3,other,other,other,other,eng,other,"history, historical fiction, biography",1 - very low exposure,1 - few books written,3 - many books published,...,26,1,16.950000,9780486285610,9780486285610,2,11.446075,5.444993,0.038462,False
4,other,other,other,other,eng,other,non-fiction,1 - very low exposure,1 - few books written,1 - few books published,...,82,13,15.950000,9780735200302,9780735200302,1,9.463834,8.225148,0.158537,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5863,other,other,other,other,eng,other,fiction,3 - well-known,1 - few books written,3 - many books published,...,744,9,3.980000,9780441581054,9780441581054,7,7.793371,10.453768,0.012097,True
5864,other,other,other,other,eng,other,"history, historical fiction, biography",4 - popular,2 - some books written,1 - few books published,...,4799,88,6.213333,9780064409902,9780064409902,0,14.945441,-0.366986,0.018337,True
5865,other,other,laurell k. hamilton,berkley,eng,other,"fantasy, paranormal",6 - famous,3 - many books written,3 - many books published,...,67686,1409,22.000000,9780425204665,9780425204665,0,12.722993,-1.087720,0.020817,True
5866,other,other,other,other,eng,other,non-fiction,5 - semi-famous,2 - some books written,3 - many books published,...,106493,6561,18.000000,9780812971064,9780812971064,5,14.428804,1.181401,0.061610,False


In [8]:
# Drop the 'title' column from train_data.
# errors='ignore' keeps this cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
train_data = train_data.drop(columns='title', errors='ignore')

# Columns used as model inputs
features = ['most_common_author', 'publisher', 'AuthorFameLevel', 'book_length_category', 'genre', 'Price', 'cluster', 'engagement_level']

# Target and feature selection
y_train = train_data['average_rating']
X_train = train_data[features]

# One-hot encode every selected column. With handle_unknown='ignore', categories not
# seen during fit are ignored at transform time instead of raising.
# NOTE(review): 'Price' and 'cluster' look numeric in the displayed frame, yet they are
# one-hot encoded here together with the categorical columns — confirm this is intended.
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train)

# Convert the encoded matrix into a dense DataFrame with readable column names.
# The original index is kept so the rows stay label-aligned with y_train.
X_train_encoded = pd.DataFrame(
    X_train_encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=X_train.index,
)


# MODEL 1: BayesianRidge

In [10]:


# BayesianRidge estimator to tune
model = BayesianRidge()

# Search space for the randomized search: every hyperparameter is drawn from a
# log-uniform distribution (each key gets its own distribution object).
param_distributions = {
    name: loguniform(1e-5, 1e5)
    for name in ('alpha_1', 'alpha_2', 'lambda_1', 'lambda_2', 'alpha_init', 'lambda_init')
}
# NOTE(review): an upper bound of 1e3 looks very loose for a convergence tolerance —
# confirm this range is intended.
param_distributions['tol'] = loguniform(1e-10, 1e3)

# Cross-validation splitter: 10 folds repeated 3 times, with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Randomized search over the BayesianRidge hyperparameters, scored by R^2
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_distributions,
    scoring='r2',
    cv=cv,
    n_jobs=2,
    random_state=42,
)

# Run the search on the encoded training data
result = random_search.fit(X_train_encoded, y_train)

# Report the best hyperparameters and the best score found
print("Best parameters found by the random search: ", result.best_params_)
print("Best score found by the random search: ", result.best_score_)


Best parameters found by the random search:  {'alpha_1': 7.672290184186796e-05, 'alpha_2': 0.0009116510241437176, 'alpha_init': 2.8331716611212512e-05, 'lambda_1': 0.017918568349691386, 'lambda_2': 0.0770500450348967, 'lambda_init': 0.0051699973172927354, 'tol': 5.937281208438848}
Best score found by the random search:  0.19762387003687557


# MODEL 2: Ridge

In [11]:
# Ridge estimator to tune
model_ridge = Ridge()

# Candidate values for each hyperparameter
param_grid = dict(
    alpha=[0.01, 0.1, 1, 10, 100],
    fit_intercept=[True, False],
)

# Randomized search over the candidates, scored by R^2 with the `cv` splitter
# defined earlier in the notebook
search_ridge = RandomizedSearchCV(
    estimator=model_ridge,
    param_distributions=param_grid,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
    random_state=42,
)

# Run the search for the Ridge model on the encoded training data
result_ridge = search_ridge.fit(X_train_encoded, y_train)

# Report the best hyperparameters and the best score found
print("Best parameters found by the random search: ", result_ridge.best_params_)
print("Best score found by the random search: ", result_ridge.best_score_)

Best parameters found by the random search:  {'fit_intercept': True, 'alpha': 10}
Best score found by the random search:  0.1976842703608107


# MODEL 3: ElasticNet

In [12]:
# Setup an ElasticNet model from sklearn
model_elasticnet = ElasticNet()

# Candidate values for each hyperparameter
param_grid = {
    'alpha': [0.01, 0.1, 1, 10],
    'l1_ratio': [0.1, 0.5, 0.9]
}

# The grid holds 12 combinations, but the randomized search only samples n_iter
# candidates (10 by default), so without a seed a different subset is evaluated on
# every run. Fix the seed, as the other searches in this notebook do, to make the
# reported result reproducible; n_jobs=-1 runs the cross-validation fits in parallel.
search_elasticnet = RandomizedSearchCV(
    model_elasticnet,
    param_grid,
    cv=cv,
    scoring='r2',
    n_jobs=-1,
    random_state=42,
)

# Run the search on the encoded training data
result_elastic = search_elasticnet.fit(X_train_encoded, y_train)

# Print the best hyperparameters and the corresponding R2 score
print("Best Hyperparameters:", result_elastic.best_params_)
print("Best R2 Score:", result_elastic.best_score_)

Best Hyperparameters: {'l1_ratio': 0.1, 'alpha': 0.01}
Best R2 Score: 0.16937052386968976
