## Imporing libraries

In [96]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

## Importing The dataset.

In [97]:
books_df = pd.read_csv("books_data.csv")
books_df.head(-1)

Unnamed: 0,ISBN,Title,Authors,Publisher,Published Date,Rating,Review Count
0,0195153448,Classical Mythology,"['Mark P. O. Morford', 'Robert J. Lenardon']","Oxford University Press, USA",2003,,
1,0060973129,Decision in Normandy,"[""Carlo D'Este""]",Harper Perennial,1991,4.0,2.0
2,0374157065,Flu,['Gina Bari Kolata'],Macmillan,1999,3.5,18.0
3,0393045218,The Mummies of Ürümchi,['E. J. W. Barber'],W. W. Norton,1999,4.0,3.0
4,0399135782,The Kitchen God's Wife,['Amy Tan'],Putnam Publishing Group,1991,4.0,34.0
...,...,...,...,...,...,...,...
887,0380752115,Whisper to Me of Love,['Shirlee Busbee'],,1991,2.0,1.0
888,0553212583,Wuthering Heights,['Emily Brontë'],Bantam Classics,1974,3.5,55.0
889,0446364762,The Stars Shine Down,['Sidney Sheldon'],Grand Central Publishing,1993-09-01,4.0,14.0
890,0425166619,Toxin,['Robin Cook'],Berkley Publishing Group,1999,3.5,14.0


In [85]:
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 893 entries, 0 to 892
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   ISBN            893 non-null    object 
 1   Title           893 non-null    object 
 2   Authors         891 non-null    object 
 3   Publisher       747 non-null    object 
 4   Published Date  893 non-null    object 
 5   Rating          647 non-null    float64
 6   Review Count    647 non-null    float64
dtypes: float64(2), object(5)
memory usage: 49.0+ KB


## Data preprocessing

In [86]:
books_df['year'] = pd.DatetimeIndex(books_df['Published Date']).year
books_df.head()

Unnamed: 0,ISBN,Title,Authors,Publisher,Published Date,Rating,Review Count,year
0,195153448,Classical Mythology,"['Mark P. O. Morford', 'Robert J. Lenardon']","Oxford University Press, USA",2003,,,2003
1,60973129,Decision in Normandy,"[""Carlo D'Este""]",Harper Perennial,1991,4.0,2.0,1991
2,374157065,Flu,['Gina Bari Kolata'],Macmillan,1999,3.5,18.0,1999
3,393045218,The Mummies of Ürümchi,['E. J. W. Barber'],W. W. Norton,1999,4.0,3.0,1999
4,399135782,The Kitchen God's Wife,['Amy Tan'],Putnam Publishing Group,1991,4.0,34.0,1991


In [87]:
books_df.isnull().sum()

ISBN                0
Title               0
Authors             2
Publisher         146
Published Date      0
Rating            246
Review Count      246
year                0
dtype: int64

In [88]:
books_df = books_df.dropna()
books_df.isnull().sum()

ISBN              0
Title             0
Authors           0
Publisher         0
Published Date    0
Rating            0
Review Count      0
year              0
dtype: int64

## Feature Engineering

In [89]:
le = LabelEncoder()
books_df['Publisher'] = le.fit_transform(books_df['Publisher'])
books_df['Authors'] = le.fit_transform(books_df['Authors'])

# Split the data into training and test sets
X = books_df[['Rating', 'Publisher', 'Authors', 'year']]
y = books_df['Review Count']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Linear Regression

In [90]:
# Create and fit the linear regression model
reg = LinearRegression().fit(X_train, y_train)

# Make predictions on the test set
y_pred = reg.predict(X_test)

# Evaluate the model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R-Squared: ", r2_score(y_test, y_pred))

Mean Absolute Error:  267.78144293982245
Mean Squared Error:  305065.7410785371
R-Squared:  -0.015146731230721855


Prediction


In [91]:
y_pred

array([188.87427808, 102.0608493 , 221.02504512, 211.30579069,
       264.26036247, 251.13892481, 273.41286746, 208.46190136,
       152.36452616, 202.95649313, 196.21974933, 216.160899  ,
       241.15409998, 260.0685406 ,  89.42483838, 161.5320166 ,
       133.06591983, 196.31169085, 183.21174583, 154.2866211 ,
        49.33390459, 210.62217503, 218.2439302 , 148.34093391,
       170.69998402, 262.08652133, 114.40581197, 253.68650893,
       162.02068441, 211.40477572, 181.06691227, 269.19440393,
       287.97889247, 307.96903963, 348.3087168 , 116.66402083,
       182.57416007, 148.53492308, 319.53837243, 246.1252647 ,
       233.37693774,  60.54567162, 230.41691459, 195.05163728,
       212.61052936,  21.25712209, 179.59693388, 361.93985647,
       176.89954507, 180.16774163, 182.08420632,  71.34913447,
       249.76651348, 146.12766424, 164.10637866, 168.04890321,
       194.18326117, 298.29617342, 305.35548175, 127.7392831 ,
       233.66375055, 260.45462444, 187.18724549, 185.50

## Random Forest Regressor

In [92]:
# One-hot encode categorical features (publisher and author)
X = pd.get_dummies(X, columns=['Publisher', 'Authors'], drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
rf = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf.predict(X_test)

# Evaluate the model
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R-Squared: ", r2_score(y_test, y_pred))


Mean Absolute Error:  167.78179420289857
Mean Squared Error:  335031.5223296995
R-Squared:  -0.11486184436779823


prediction

In [93]:
y_pred

array([1.12400000e+01, 1.00000000e+00, 7.32000000e+00, 8.59000000e+00,
       9.27000000e+00, 5.40000000e+00, 5.13000000e+00, 5.21000000e+00,
       1.05600000e+01, 6.53000000e+00, 9.07160000e+02, 8.01000000e+00,
       7.57000000e+00, 2.86600000e+00, 7.70300000e+01, 1.08300000e+01,
       3.79000000e+01, 8.47000000e+00, 3.98350000e+00, 5.94000000e+00,
       2.66000000e+00, 1.97307000e+03, 6.46000000e+00, 1.00000000e+00,
       1.28800000e+01, 6.27000000e+00, 4.57000000e+00, 4.31000000e+00,
       7.37000000e+00, 3.38000000e+00, 9.10130000e+02, 8.59000000e+00,
       6.80000000e+00, 4.24000000e+00, 5.18000000e+00, 5.21500000e+01,
       4.25666667e+01, 3.10000000e+00, 3.09520000e+02, 5.12000000e+00,
       1.23224000e+03, 1.00000000e+00, 1.26700000e+01, 4.66000000e+00,
       1.67000000e+00, 6.84800000e+01, 7.35000000e+00, 4.72000000e+00,
       3.59000000e+00, 8.30000000e+00, 1.21800000e+01, 6.54000000e+00,
       2.54000000e+00, 8.41750000e+00, 9.14750000e+00, 6.46750000e+00,
      

## XG Boost

In [94]:
# One-hot encode categorical features (publisher and author)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the model
xgb_model = xgb.XGBRegressor()

# Fit the model to the training data
xgb_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = xgb_model.predict(X_test)

# Evaluate the model
print("Mean Absolute Error: ", mean_absolute_error(y_test, y_pred))
print("Mean Squared Error: ", mean_squared_error(y_test, y_pred))
print("R-Squared: ", r2_score(y_test, y_pred))

  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


Mean Absolute Error:  181.58873048035994
Mean Squared Error:  438049.7784029963
R-Squared:  -0.45766876047763994


prediction

In [95]:
y_pred

array([  29.392454,    4.976196,   38.80099 ,   38.80099 ,   14.679569,
         28.54876 ,   28.54876 ,   32.52045 ,   28.54876 ,   28.54876 ,
       2640.581   ,   25.420765,   28.54876 ,   28.54876 ,   40.69058 ,
         28.54876 ,   28.54876 ,   24.931799,   28.54876 ,    4.976196,
         15.09506 , 2733.1487  ,   28.54876 ,    4.976196,   28.54876 ,
         38.80099 ,   28.54876 ,   14.679569,   28.54876 ,   19.066753,
         90.0029  ,   38.80099 ,   18.65126 ,   14.679569,   14.679569,
         32.52045 ,   28.54876 ,   28.54876 ,  152.60033 ,   14.679569,
       1996.9495  ,    4.976196,   11.551573,   28.54876 ,   15.22843 ,
         27.100327,   32.52045 ,   14.679569,   28.54876 ,   28.54876 ,
         28.54876 ,    4.976196,   24.931799,   28.54876 ,   28.54876 ,
         28.54876 ,   28.54876 ,   14.679569,   14.679569,   14.679569,
         28.54876 ,   28.54876 ,   32.52045 ,   28.54876 ,   15.09506 ,
         28.54876 ,   28.54876 ,   14.679569,   28.54876 ,   28.