# Modeling

In [16]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from scipy import stats

In [17]:
df = pd.read_csv('../data/PostBooksEDA.csv', index_col=0)
df_encoded = pd.read_csv('../data/PostEncodedBooksEDA.csv', index_col=0)

In [18]:
df_encoded = df_encoded.drop(columns=['user_id','isbn'], axis=1)

For the initial models we will be building, User ID and ISBN will not be needed since they are merely identifiers and do not have meaningful relationships with the target variable.

## Stats Logistic model 

Since logistic models predict values between 0 and 1, we will convert the ratings into implicit (0) and explicit (1) categories to predict whether users are likely to leave a review or not.

In [21]:
X = df_encoded.drop('rating', axis=1)
y = df_encoded['rating'].copy()

In [19]:
X.columns

Index(['user_id', 'age', 'isbn', 'year_of_publication', 'Africa', 'Asia',
       'Europe', 'North America', 'Oceania', 'South America',
       'author_frequency', 'author_implicit_encoded',
       'author_explicit_encoded', 'author_avg_all_reviews',
       'publisher_frequency', 'publisher_implicit_encoded',
       'publisher_explicit_encoded', 'publisher_avg_all_reviews', 'de', 'en',
       'es', 'fr', 'it', 'unknown_lg', 'art & culture', 'business & economics',
       'children's books', 'cookbooks', 'fiction', 'health & fitness',
       'non-fiction', 'poetry', 'religion & spirituality',
       'science & technology', 'self-help', 'unknown_category'],
      dtype='object')

In [22]:
y = y.apply(lambda x: 1 if 1 <= x <= 10 else x)

In [23]:
y

0          0
1          1
2          0
4          0
5          1
          ..
1031170    0
1031171    1
1031172    1
1031173    1
1031174    1
Name: rating, Length: 978859, dtype: int64

In [24]:
import statsmodels.api as sm 

# 0. ADD THAT CONSTANT 
X_const = sm.add_constant(X)

# 1. Instatiate the model
bank_logit = sm.Logit(y, X_const)

# 2. Fit the model to the data
bank_logit_fitted = bank_logit.fit()

# 3. Look at results (Summary table)
bank_logit_fitted.summary()

Optimization terminated successfully.
         Current function value: 0.573486
         Iterations 6


0,1,2,3
Dep. Variable:,rating,No. Observations:,978859.0
Model:,Logit,Df Residuals:,978827.0
Method:,MLE,Df Model:,31.0
Date:,"Thu, 21 Nov 2024",Pseudo R-squ.:,0.1265
Time:,23:23:01,Log-Likelihood:,-561360.0
converged:,True,LL-Null:,-642680.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-3.7907,,,,,
age,-0.0011,0.000,-4.825,0.000,-0.002,-0.001
year_of_publication,0.0049,0.000,14.907,0.000,0.004,0.006
Africa,-0.1527,,,,,
Asia,-0.6341,,,,,
Europe,-0.8208,,,,,
North America,-0.8469,,,,,
Oceania,-0.9085,,,,,
South America,-0.4277,,,,,


In [25]:
# Calculate soft predictions
y_proba = bank_logit_fitted.predict(X_const)

# Convert soft predictions to hard predictions 0/1 
y_pred = np.where(y_proba >= 0.5, 1, 0)

# Calculate # correct
num_correct = (y_pred == y).sum()

# Calculate the percentage accuracy
pct_accuracy = num_correct/X.shape[0]

print(f'The baseline model accuracy is {np.round(pct_accuracy*100.0, 3)}%')

The baseline model accuracy is 68.326%
