In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

In [19]:
#Load CSV data into Pandas DataFrame
# Location of the cleaned books dataset, relative to this notebook.
csv_file_path = '../Resources/panda_df/books_cleaned.csv'

# Parse the CSV at that location into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [20]:
#Data Preparation
# Columns used as model inputs. This set matches the columns of the `new_data`
# frame that is passed to the same fitted imputer and model further down, so
# the training-time and prediction-time inputs line up.
# Alternative feature set that was tried:
#   ['to_read_count', 'series_binary', 'author_average_rating', 'ratings_count', 'text_reviews_count']
feature_columns = ['num_pages', 'publication_year', 'to_read_count', 'series_binary', 'author_average_rating']

# X must be assigned here: with both candidate definitions commented out, the
# train/test split below failed with NameError on a fresh kernel.
X = df[feature_columns]

# Regression target.
y = df['average_rating']

In [5]:
#Data Splitting
# Partition features and target into train/test pieces using
# train_test_split's default proportions; the fixed random_state makes the
# shuffle repeatable between runs.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42)

In [6]:
#Fill in missing values using SimpleImputer
# Learn the per-column means from the training features only, so that no
# statistics from the test split are used when filling gaps.
imputer = SimpleImputer(strategy='mean').fit(X_train)

# Replace missing training values with the learned means.
X_train_imputed = imputer.transform(X_train)


In [7]:
#Model Training
# Fit an ordinary least-squares regressor on the imputed training features.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(X_train_imputed, y_train)

# Leave the fitted estimator as the cell's final expression so it is displayed.
model

In [8]:
#Apply the imputer to the testing set
# Reuse the already-fitted imputer (transform only, no refit) on the test split.
X_test_imputed = imputer.transform(X_test)

#Model Evaluation
# Predict on the held-out features, then score against the true targets.
y_pred = model.predict(X_test_imputed)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics, one per line.
print(f'Mean Squared Error: {mse}', f'R-squared: {r2}', sep='\n')

Mean Squared Error: 0.11775396598881109
R-squared: 0.5152547969739771


In [9]:
#Make predictions on new data
# A single hypothetical record, one value per input column.
new_data = pd.DataFrame(data={
    'num_pages': [300],
    'publication_year': [2015],
    'to_read_count': [100],
    'series_binary': [0],
    'author_average_rating': [4.5],
})

# Run the record through the fitted imputer first, then the trained model.
new_data_imputed = imputer.transform(new_data)
predictions = model.predict(new_data_imputed)
print(predictions)

#Create a DataFrame with the predicted ratings
# Label the raw prediction array and promote it to a one-column frame.
comparison_df = pd.Series(predictions, name='average_rating').to_frame()

#Display the comparison DataFrame
print(comparison_df)

[4.4510765]
   average_rating
0        4.451077


In [10]:
#Mean Squared Error: Average squared difference between the predicted and actual values
# Recompute from the held-out targets and the model's test-set predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')

Mean Squared Error: 0.11775396598881109


In [11]:
#R-squared (Coefficient of Determination): Proportion of variance in the dependent variable that is predictable from the independent variables
# Recompute from the held-out targets and the model's test-set predictions.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f'R-squared: {r2}')

R-squared: 0.5152547969739771
