### Import Libraries

In [78]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

### Load Data

In [79]:
metadata_url = "https://raw.githubusercontent.com/Deepthi-Nasika/Deepthi-Nasika/main/Data/book_popularity/SPGC-metadata-2018-07-18.csv"
kld_url = "https://raw.githubusercontent.com/Deepthi-Nasika/Deepthi-Nasika/main/Data/book_popularity/KLDscores.csv"
extra_controls_url = "https://raw.githubusercontent.com/Deepthi-Nasika/Deepthi-Nasika/main/Data/book_popularity/extra_controls.csv"

metadata = pd.read_csv(metadata_url)
kld_scores = pd.read_csv(kld_url)
extra_controls = pd.read_csv(extra_controls_url)

In [80]:
metadata.head()

Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type
0,PG0,,,,,,,set(),Text
1,PG1,The Declaration of Independence of the United ...,"Jefferson, Thomas",1743.0,1826.0,['en'],604.0,"{'United States -- History -- Revolution, 1775...",Text
2,PG2,The United States Bill of Rights: The Ten Orig...,United States,,,['en'],158.0,"{'Civil rights -- United States -- Sources', '...",Text
3,PG3,John F. Kennedy's Inaugural Address,"Kennedy, John F. (John Fitzgerald)",1917.0,1963.0,['en'],28.0,{'Presidents -- United States -- Inaugural add...,Text
4,PG4,Lincoln's Gettysburg Address: Given November 1...,"Lincoln, Abraham",1809.0,1865.0,['en'],55.0,{'Consecration of cemeteries -- Pennsylvania -...,Text


In [81]:
kld_scores.head()

Unnamed: 0,filename,kld_values
0,PG10002,"[0.22391005737243896, 0.24226261808703536, 0.2..."
1,PG10005,"[0.24107767463211327, 0.24747085497572513, 0.2..."
2,PG10003,"[0.2502283960399736, 0.2304129699198611, 0.238..."
3,PG10008,"[0.2576982842724978, 0.2424932127358288, 0.220..."
4,PG10012,"[0.25125974534678364, 0.23622148585532693, 0.2..."


In [82]:
extra_controls.head()

Unnamed: 0,id,subj2_war,subj2_adventure,subj2_comedy,subj2_biography,subj2_romance,subj2_drama,subj2_fantasy,subj2_family,subj2_sciencefiction,...,subj2_horror,subj2_mystery,subj2_crime,subj2_history,subj2_periodicals,subj2_others,speed,sentiment_avg,sentiment_vol,wordcount
0,PG10002,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.224141,0.08737,0.002701,50831.0
1,PG10005,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.221966,0.13182,0.003394,70142.0
2,PG10006,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.388199,0.18372,0.001003,13193.0
3,PG10007,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0.301656,0.15722,0.002895,27980.0
4,PG10008,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0.204838,0.10778,0.003655,65421.0


### Missing Values

In [83]:
metadata.isnull().sum()

id                       0
title                   71
author                2262
authoryearofbirth    14767
authoryearofdeath    15863
language                 2
downloads                2
subjects                 0
type                     0
dtype: int64

In [84]:
kld_scores.isnull().sum()

filename      0
kld_values    0
dtype: int64

In [85]:
extra_controls.isnull().sum()

id                         0
subj2_war                  0
subj2_adventure            0
subj2_comedy               0
subj2_biography            0
subj2_romance              0
subj2_drama                0
subj2_fantasy              0
subj2_family               0
subj2_sciencefiction       0
subj2_action               0
subj2_thriller             0
subj2_western              0
subj2_horror               0
subj2_mystery              0
subj2_crime                0
subj2_history              0
subj2_periodicals          0
subj2_others               0
speed                   2688
sentiment_avg           2688
sentiment_vol           2688
wordcount               2688
dtype: int64

### Handling Missing Values

In [86]:
metadata.fillna({'title': 'Unknown_title',
                 'author': 'Unknown_author',
                 'authoryearofbirth': metadata['authoryearofbirth'].median(),
                 'authoryearofdeath': metadata['authoryearofdeath'].median(),
                 'language': metadata['language'].mode()[0],
                 'downloads': metadata['downloads'].mode()[0]}, inplace = True)

extra_controls.fillna({'speed': extra_controls['speed'].median(),
                       'sentiment_avg': extra_controls['sentiment_avg'].median(),
                       'sentiment_vol': extra_controls['sentiment_vol'].median(),
                       'wordcount': extra_controls['wordcount'].median()}, inplace = True)

In [87]:
metadata.isnull().sum()

id                   0
title                0
author               0
authoryearofbirth    0
authoryearofdeath    0
language             0
downloads            0
subjects             0
type                 0
dtype: int64

In [88]:
extra_controls.isnull().sum()

id                      0
subj2_war               0
subj2_adventure         0
subj2_comedy            0
subj2_biography         0
subj2_romance           0
subj2_drama             0
subj2_fantasy           0
subj2_family            0
subj2_sciencefiction    0
subj2_action            0
subj2_thriller          0
subj2_western           0
subj2_horror            0
subj2_mystery           0
subj2_crime             0
subj2_history           0
subj2_periodicals       0
subj2_others            0
speed                   0
sentiment_avg           0
sentiment_vol           0
wordcount               0
dtype: int64

### Book-Level KLD Measures


In [89]:
book_measures = []

# Function to calculate slope and intercept of linear regression

def find_slope_and_intercept(kld_scores):
  x = np.arange(len(kld_scores)).reshape(-1, 1)
  y = np.array(kld_scores).reshape(-1, 1)
  model = LinearRegression().fit(x, y)
  slope = model.coef_[0][0]
  intercept = model.intercept_[0]
  return slope, intercept

for i,j in kld_scores.iterrows():
  book_id = j['filename']
  kld_scores_list = eval(j['kld_values'])

  min_kld = np.min(kld_scores_list)
  max_kld = np.max(kld_scores_list)
  range_kld = max_kld - min_kld
  mean_kld = np.mean(kld_scores_list)
  var_kld = np.var(kld_scores_list)
  std_kld = np.std(kld_scores_list)
  median_kld = np.median(kld_scores_list)
  slope_kld, intercept_kld = find_slope_and_intercept(kld_scores_list)
  autocorrelation_kld = [pd.Series(kld_scores_list).autocorr(lag) for lag in range(1, min(10, len(kld_scores_list) - 1))]
  entropy_kld = stats.entropy(kld_scores_list)
  moving_avg_kld = pd.Series(kld_scores_list).rolling(window=5).mean().tolist()

  book_measures.append([book_id, min_kld, max_kld, range_kld, mean_kld, var_kld, std_kld, median_kld, slope_kld, intercept_kld, autocorrelation_kld, entropy_kld, moving_avg_kld])


# Create dataframe of book-level measures
cols = ['Book_ID', 'Min_KLD', 'Max_KLD', 'Range_KLD', 'Mean_KLD', 'Var_KLD', 'Std_KLD', 'Median_KLD', 'Slope_KLD', 'Intercept_KLD', 'Autocorrelation_KLD', 'Entropy_KLD', 'Moving_avg_KLD']
book_measures_df = pd.DataFrame(book_measures, columns=cols)

# Merge with metadata
book_level_measures = metadata.merge(book_measures_df, left_on = 'id', right_on='Book_ID').merge(extra_controls, left_on='id', right_on = 'id')

# Save the merged data to a CSV file
book_level_measures.to_csv('https://github.com/Deepthi-Nasika/Deepthi-Nasika/blob/main/Data/book_popularity/book_level_measures.csv', index=False)


print(book_level_measures.head())


      id                             title                 author  \
0   PG79               Terminal Compromise        Schwartau, Winn   
1  PG102  The Tragedy of Pudd'nhead Wilson            Twain, Mark   
2  PG105                        Persuasion           Austen, Jane   
3  PG106            Jungle Tales of Tarzan  Burroughs, Edgar Rice   
4  PG107        Far from the Madding Crowd          Hardy, Thomas   

   authoryearofbirth  authoryearofdeath language  downloads  \
0             1848.0             1914.0   ['en']       21.0   
1             1835.0             1910.0   ['en']      677.0   
2             1775.0             1817.0   ['en']     2778.0   
3             1875.0             1950.0   ['en']      225.0   
4             1840.0             1928.0   ['en']      555.0   

                                            subjects  type Book_ID  ...  \
0  {'Computer security -- Fiction', 'Didactic fic...  Text    PG79  ...   
1  {'Trials (Murder) -- Fiction', 'Impostors and ...  Te

In [90]:
book_level_measures.isnull().sum()

id                      0
title                   0
author                  0
authoryearofbirth       0
authoryearofdeath       0
language                0
downloads               0
subjects                0
type                    0
Book_ID                 0
Min_KLD                 0
Max_KLD                 0
Range_KLD               0
Mean_KLD                0
Var_KLD                 0
Std_KLD                 0
Median_KLD              0
Slope_KLD               0
Intercept_KLD           0
Autocorrelation_KLD     0
Entropy_KLD             0
Moving_avg_KLD          0
subj2_war               0
subj2_adventure         0
subj2_comedy            0
subj2_biography         0
subj2_romance           0
subj2_drama             0
subj2_fantasy           0
subj2_family            0
subj2_sciencefiction    0
subj2_action            0
subj2_thriller          0
subj2_western           0
subj2_horror            0
subj2_mystery           0
subj2_crime             0
subj2_history           0
subj2_period

In [91]:
book_level_measures['log_downloads'] = np.log(book_level_measures['downloads'])
book_level_measures.head()

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0,id,title,author,authoryearofbirth,authoryearofdeath,language,downloads,subjects,type,Book_ID,...,subj2_mystery,subj2_crime,subj2_history,subj2_periodicals,subj2_others,speed,sentiment_avg,sentiment_vol,wordcount,log_downloads
0,PG79,Terminal Compromise,"Schwartau, Winn",1848.0,1914.0,['en'],21.0,"{'Computer security -- Fiction', 'Didactic fic...",Text,PG79,...,0,0,0,0,1,0.171519,0.12578,0.00763,209058.0,3.044522
1,PG102,The Tragedy of Pudd'nhead Wilson,"Twain, Mark",1835.0,1910.0,['en'],677.0,"{'Trials (Murder) -- Fiction', 'Impostors and ...",Text,PG102,...,0,0,0,0,1,0.248672,0.13318,0.00425,52465.0,6.517671
2,PG105,Persuasion,"Austen, Jane",1775.0,1817.0,['en'],2778.0,"{'Regency fiction', 'Ship captains -- Fiction'...",Text,PG105,...,0,0,0,0,0,0.175747,0.16867,0.00528,82944.0,7.929487
3,PG106,Jungle Tales of Tarzan,"Burroughs, Edgar Rice",1875.0,1950.0,['en'],225.0,"{'Tarzan (Fictitious character) -- Fiction', '...",Text,PG106,...,0,0,0,0,0,0.209854,0.1025,0.004305,73714.0,5.4161
4,PG107,Far from the Madding Crowd,"Hardy, Thomas",1840.0,1928.0,['en'],555.0,"{'Love stories', 'Wessex (England) -- Fiction'...",Text,PG107,...,0,0,0,0,0,0.167575,0.12513,0.007081,136528.0,6.318968


In [92]:
numeric_cols = book_level_measures.select_dtypes(include = ['number']).columns.tolist()
categorical_cols = book_level_measures.select_dtypes(include = ['object', 'category']).columns.tolist()
print(f"Numeric columns: {numeric_cols}")
print(f"Categorical columns: {categorical_cols}")

Numeric columns: ['authoryearofbirth', 'authoryearofdeath', 'downloads', 'Min_KLD', 'Max_KLD', 'Range_KLD', 'Mean_KLD', 'Var_KLD', 'Std_KLD', 'Median_KLD', 'Slope_KLD', 'Intercept_KLD', 'Entropy_KLD', 'subj2_war', 'subj2_adventure', 'subj2_comedy', 'subj2_biography', 'subj2_romance', 'subj2_drama', 'subj2_fantasy', 'subj2_family', 'subj2_sciencefiction', 'subj2_action', 'subj2_thriller', 'subj2_western', 'subj2_horror', 'subj2_mystery', 'subj2_crime', 'subj2_history', 'subj2_periodicals', 'subj2_others', 'speed', 'sentiment_avg', 'sentiment_vol', 'wordcount', 'log_downloads']
Categorical columns: ['id', 'title', 'author', 'language', 'subjects', 'type', 'Book_ID', 'Autocorrelation_KLD', 'Moving_avg_KLD']


### Correlation Matrix

In [93]:
# Numeric cols in book-level measures
numeric_book_level_measures = book_level_measures.select_dtypes(include = [np.number])

# Drop genre-specific columns
genre_cols = [col for col in numeric_book_level_measures.columns if col.startswith('subj2_')]
numeric_book_level_measures = numeric_book_level_measures.drop(columns=genre_cols)

# Correlation Matrix
corr_matrix = numeric_book_level_measures.corr()

# Target correlation matrix
target_var = 'log_downloads'
target_corr = corr_matrix[target_var].sort_values(ascending = False)
print(target_corr)

log_downloads        1.000000
downloads            0.388742
wordcount            0.184553
sentiment_vol        0.167854
Entropy_KLD          0.111994
Intercept_KLD        0.010765
Min_KLD             -0.000480
Median_KLD          -0.054707
Mean_KLD            -0.078829
Var_KLD             -0.084523
sentiment_avg       -0.109802
Slope_KLD           -0.112365
authoryearofbirth   -0.125898
Std_KLD             -0.129699
authoryearofdeath   -0.131066
Max_KLD             -0.132710
Range_KLD           -0.133086
speed               -0.154932
Name: log_downloads, dtype: float64


Independent variables which have strong positive or negative correlation values to predict the target variable log_downloads are considered.

So we are dropping the variables with low contribution in prediction like Intercept_KLD, Min_KLD, Median_KLD, Mean_KLD

In [94]:
numeric_book_level_measures = numeric_book_level_measures.drop(columns = ['Intercept_KLD', 'Min_KLD', 'Median_KLD', 'Mean_KLD', 'authoryearofbirth', 'authoryearofdeath'])
numeric_book_level_measures.head()

Unnamed: 0,downloads,Max_KLD,Range_KLD,Var_KLD,Std_KLD,Slope_KLD,Entropy_KLD,speed,sentiment_avg,sentiment_vol,wordcount,log_downloads
0,21.0,0.31679,0.117856,0.000512,0.022619,-0.00039,3.887504,0.171519,0.12578,0.00763,209058.0,3.044522
1,677.0,0.463228,0.257751,0.002585,0.050845,0.000474,3.876257,0.248672,0.13318,0.00425,52465.0,6.517671
2,2778.0,0.318618,0.125765,0.000532,0.023067,4.8e-05,3.886962,0.175747,0.16867,0.00528,82944.0,7.929487
3,225.0,0.323813,0.120303,0.00048,0.021918,-7.9e-05,3.88754,0.209854,0.1025,0.004305,73714.0,5.4161
4,555.0,0.28742,0.094414,0.000455,0.021327,8.7e-05,3.887546,0.167575,0.12513,0.007081,136528.0,6.318968


### Multi-Collinearity - Ridge & Lasso Regression

In [95]:
# Remove infinite values
book_level_measures = book_level_measures.replace([np.inf, -np.inf], np.nan)
# Drop rows with NaN values
book_level_measures = book_level_measures.dropna()

In [96]:
# Remove infinite values
numeric_book_level_measures = numeric_book_level_measures.replace([np.inf, -np.inf], np.nan)
# Drop rows with NaN values
numeric_book_level_measures = numeric_book_level_measures.dropna()

### VIF Values

VIF stands for Variance Inflation Factor.It signifies multicollinearity among set of different independent variable. A value greater than 10 means that the independent variable is highly correlated with some other variable. Will calculate VIF value for all variables to check the dependency and then will drop the variables with high VIF value.

In [97]:
# Compute VIF
vif = pd.DataFrame()
vif["features"] = numeric_book_level_measures.columns
vif["VIF Factor"] = [variance_inflation_factor(numeric_book_level_measures.values, i) for i in range(numeric_book_level_measures.shape[1])]
vif

Unnamed: 0,features,VIF Factor
0,downloads,1.20475
1,Max_KLD,2316.243135
2,Range_KLD,634.029333
3,Var_KLD,22.287814
4,Std_KLD,187.455196
5,Slope_KLD,2.276341
6,Entropy_KLD,637.862977
7,speed,146.555583
8,sentiment_avg,47.490753
9,sentiment_vol,23.561803


### OLS Regression Analysis

In [98]:
X = numeric_book_level_measures.drop(columns=['log_downloads'])
y = numeric_book_level_measures['log_downloads']

# Add constant to a model
X = sm.add_constant(X)

# Fit the OLS regression model
model = sm.OLS(y, X).fit()

# Print the model summary
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:          log_downloads   R-squared:                       0.211
Model:                            OLS   Adj. R-squared:                  0.210
Method:                 Least Squares   F-statistic:                     207.2
Date:                Wed, 03 Jul 2024   Prob (F-statistic):               0.00
Time:                        06:59:50   Log-Likelihood:                -12536.
No. Observations:                8541   AIC:                         2.510e+04
Df Residuals:                    8529   BIC:                         2.518e+04
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const            18.7831     73.500      0.256

### Heterogenity across Genres

In [101]:
# Investigate heterogeneity of effects across genres using LASSO
genre_columns = [col for col in extra_controls.columns if col.startswith('subj2_')]
X_lasso = book_level_measures[['downloads', 'Max_KLD', 'Range_KLD', 'Var_KLD', 'Std_KLD', 'Slope_KLD', 'Entropy_KLD'] + genre_columns + ['speed', 'sentiment_avg', 'sentiment_vol', 'wordcount']]
X_lasso = sm.add_constant(X_lasso)

lasso = LassoCV(cv=5, random_state=0).fit(X_lasso, y)

# Print LASSO coefficients
lasso_coefs = pd.Series(lasso.coef_, index=X_lasso.columns)
print("LASSO coefficients:")
print(lasso_coefs[lasso_coefs != 0])

# # Save the merged data to a CSV file
# merged_data.to_csv('/mnt/data/merged_book_measures.csv', index=False)

# # Display the first few rows of the merged data
# print(book_level_measures.head())

LASSO coefficients:
downloads    0.000693
wordcount    0.000003
dtype: float64
