# **Made by**: Saveliy Ugrinchuk

In [211]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

## **Loading dataframes**

In [212]:
# Load the competition data from the Kaggle input directory.
train_dataframe = pd.read_csv('../input/hse-aml-2022/books_train.csv')
test_dataframe = pd.read_csv('../input/hse-aml-2022/books_test.csv')
# Keep an untouched snapshot of the raw test set. Copying the frame that was
# just loaded gives the same content without parsing the CSV file a second time.
example = test_dataframe.copy()
len(example)

## **Exploring and clearing dataset**

In [213]:
# Show the training dataframe as the cell output for a first look at its contents.
train_dataframe

In [214]:
# Print the DataFrame.info() summary of the training data.
train_dataframe.info()

Renaming column **'num_pages'** since it has unnecessary spaces in the name

In [215]:
# The page-count column is stored with leading spaces in its name; map it to
# the clean name `num_pages` in both frames.
page_column_fix = {'  num_pages': 'num_pages'}
train_dataframe = train_dataframe.rename(columns=page_column_fix)
test_dataframe = test_dataframe.rename(columns=page_column_fix)

In [216]:
# Summary statistics of the training data, used to spot implausible values.
train_dataframe.describe()

Books with 0 or 4736 pages seem unrealistic, so we will eliminate training books with fewer than 10 pages or more than 2000 pages. The test dataframe is not filtered.

**isbn** and **isbn13** don't look like useful features: they are just book identifiers, and we already have **bookID**, so they can be removed from the dataframe.

In [217]:
# train dataframe
# Keep only books with a plausible page count. `between` is inclusive on both
# ends, so books with exactly 10 or exactly 2000 pages are kept; only books
# with fewer than 10 or more than 2000 pages are removed.
MIN_PAGES, MAX_PAGES = 10, 2000
has_plausible_length = train_dataframe.num_pages.between(MIN_PAGES, MAX_PAGES)
train_dataframe = train_dataframe[has_plausible_length].drop(columns=['isbn', 'isbn13'])
# test dataframe
# No rows are filtered here; only the ISBN identifier columns are removed.
test_dataframe = test_dataframe.drop(columns=['isbn', 'isbn13'])

train_dataframe.describe()

Converting **language_code** from string values to integer category codes (the English variants `en-US`, `en-CA` and `en-GB` are first merged into `eng`). The same encoding is applied to the **publisher** column.

In [218]:
def _encode_with_shared_categories(train_values, test_values):
    """Encode labels as integer codes using one vocabulary for both frames.

    The category list is built from the union of the training and test values,
    so a given label receives the same integer code in both results. Encoding
    each frame on its own would assign unrelated codes to the same label.
    Missing values are encoded as -1.

    Returns a ``(train_codes, test_codes)`` tuple of integer arrays.
    """
    vocabulary = pd.concat([train_values, test_values]).astype('category').cat.categories
    train_codes = pd.Categorical(train_values, categories=vocabulary).codes
    test_codes = pd.Categorical(test_values, categories=vocabulary).codes
    return train_codes, test_codes


def _publication_year(dates):
    """Parse 'month/day/year' strings and return the year (NaN when unparseable)."""
    return pd.to_datetime(dates, errors='coerce', format='%m/%d/%Y').dt.year


# Treat the regional English codes as plain English in both frames.
language_codes = {'language_code': {'en-US': 'eng','en-CA': 'eng','en-GB': 'eng'}}
train_dataframe = train_dataframe.replace(language_codes)
test_dataframe = test_dataframe.replace(language_codes)

# Turn the string columns into integer category codes shared by train and test.
for column in ['language_code', 'publisher']:
    train_codes, test_codes = _encode_with_shared_categories(
        train_dataframe[column], test_dataframe[column]
    )
    train_dataframe[column] = train_codes
    test_dataframe[column] = test_codes

# Converting publication_date to year
# train dataframe
train_dataframe['publication_date'] = _publication_year(train_dataframe['publication_date'])
train_dataframe = train_dataframe.dropna()   # drop training rows with empty values
train_dataframe = train_dataframe.assign(year=train_dataframe['publication_date'].astype(int))
# test dataframe
# Test rows are never dropped: every test book needs a prediction, so an
# unparseable date falls back to the median training year instead.
test_years = _publication_year(test_dataframe['publication_date'])
fallback_year = train_dataframe['year'].median()
test_dataframe = test_dataframe.assign(year=test_years.fillna(fallback_year).astype(int))

# Dropping title and authors since they are not used as model features.
# Also drop 'publication_date' because it is no longer needed.
unused_columns = ['title', 'authors', 'publication_date']
# train dataframe
train_dataframe = train_dataframe.drop(columns=unused_columns)
# test dataframe
test_dataframe = test_dataframe.drop(columns=unused_columns)
# Any remaining numeric gaps in the test features are filled with the training
# medians so that the row count of the test set is preserved.
test_dataframe = test_dataframe.fillna(train_dataframe.median(numeric_only=True))

In [219]:
# Preview the first rows of the training dataframe after preprocessing.
train_dataframe.head()

## **Training and testing of the model**

Separating the target, **average_rating**, from the rest of the dataframe so that the data can then be split into training and validation samples

In [220]:
# Split off the prediction target: `average_ratings` holds the label column,
# while `train_dataframe` keeps only the remaining feature columns.
target_name = 'average_rating'
average_ratings = train_dataframe[target_name]
train_dataframe = train_dataframe.drop(columns=[target_name])
average_ratings

The held-out validation split will contain 30% of the training data

In [221]:
# Hold out 30% of the rows for validation. Fixing `random_state` makes the
# shuffle, and therefore the resulting split, identical on every run.
X_Train, X_Test, Y_Train, Y_Test = train_test_split(
    train_dataframe, average_ratings, test_size=0.3, random_state=42
)

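Trying several regressors to find the best model (each printed score is the model's R² on the held-out split, multiplied by 100)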

In [222]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

# Candidate regressors. The ensemble models use randomness internally, so they
# are seeded to make the reported scores repeatable from run to run.
model_linear = LinearRegression()
model_ridge = Ridge()
model_GBR = GradientBoostingRegressor(random_state=42)
regr = AdaBoostRegressor(random_state=42)

candidate_models = {
    'LinearRegression': model_linear,
    'Ridge': model_ridge,
    'GradientBoostingRegressor': model_GBR,
    'AdaBoostRegressor': regr,
}

# Fit every candidate on the training split and report its score on the
# held-out split, scaled by 100.
for label, candidate in candidate_models.items():
    candidate.fit(X_Train, Y_Train)
    print(label, candidate.score(X_Test, Y_Test) * 100)

GradientBoostingRegressor has the best score, so I will use it to predict the ratings of the books

In [223]:
# Preview the first rows of the preprocessed test dataframe before predicting.
test_dataframe.head()

In [224]:
# Predict a rating for every row of the test dataframe with the fitted
# gradient boosting model, then show the predictions as the cell output.
predicted_data = model_GBR.predict(test_dataframe)
predicted_data

In [225]:
books_sample_submission = pd.read_csv('../input/hse-aml-2022/books_sample_submission.csv')

# Attach each prediction to the row label of the test row it was computed from,
# then align on the submission's row labels. When no test rows were removed
# during preprocessing this is the same as a plain positional assignment; if
# some were removed, the submission still gets one value per row instead of
# failing on a length mismatch. Rows without a prediction receive the mean
# predicted rating.
# NOTE(review): like the positional assignment it replaces, this assumes the
# sample submission lists the books in the same order as books_test.csv.
predicted_ratings = pd.Series(predicted_data, index=test_dataframe.index)
predicted_ratings = predicted_ratings.reindex(books_sample_submission.index)
books_sample_submission['average_rating'] = predicted_ratings.fillna(predicted_ratings.mean())

books_sample_submission.to_csv('predicted_ratings_GBR_submission.csv', index=False)
# Rich display of the first rows instead of a plain-text print of the frame.
books_sample_submission.head()