In [67]:
import pandas as pd 
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from scipy.sparse import hstack
from sklearn.metrics import accuracy_score

# Load the book review data from the CSV file into a DataFrame
books = pd.read_csv(filepath_or_buffer="data/books.csv")

# Show the first five records as a quick sanity check of the loaded columns
books.head(n=5)

Unnamed: 0,title,price,review/helpfulness,review/summary,review/text,description,authors,categories,popularity
0,We Band of Angels: The Untold Story of America...,10.88,2/3,A Great Book about women in WWII,I have alway been a fan of fiction books set i...,"In the fall of 1941, the Philippines was a gar...",'Elizabeth Norman','History',Unpopular
1,Prayer That Brings Revival: Interceding for Go...,9.35,0/0,Very helpful book for church prayer groups and...,Very helpful book to give you a better prayer ...,"In Prayer That Brings Revival, best-selling au...",'Yong-gi Cho','Religion',Unpopular
2,The Mystical Journey from Jesus to Christ,24.95,17/19,Universal Spiritual Awakening Guide With Some ...,The message of this book is to find yourself a...,THE MYSTICAL JOURNEY FROM JESUS TO CHRIST Disc...,'Muata Ashby',"'Body, Mind & Spirit'",Unpopular
3,Death Row,7.99,0/1,Ben Kincaid tries to stop an execution.,The hero of William Bernhardt's Ben Kincaid no...,"Upon receiving his execution date, one of the ...",'Lynden Harris','Social Science',Unpopular
4,Sound and Form in Modern Poetry: Second Editio...,32.5,18/20,good introduction to modern prosody,There's a lot in this book which the reader wi...,An updated and expanded version of a classic a...,"'Harvey Seymour Gross', 'Robert McDowell'",'Poetry',Unpopular


In [68]:
# Feature selection (text-based features)
# Keep the raw review columns (summary, text, helpfulness) together in X
review_columns = ['review/summary', 'review/text', 'review/helpfulness']
X = books[review_columns]

# Step 1: 'review/helpfulness' holds strings such as "2/3"; break each one at the
# slash and store the two integer parts as good_reviews and total_reviews
helpfulness_parts = books['review/helpfulness'].str.split('/', expand=True)
books[['good_reviews', 'total_reviews']] = helpfulness_parts.astype(int)

# Step 2: standardize the price column.
# Note: the scaler is fitted on every row of `books`, i.e. before any train/test split.
scaler = StandardScaler()
scaler.fit(books[['price']])
books['price_scaled'] = scaler.transform(books[['price']])

# Step 3: build the binary classification target from the popularity label
# ('Popular' -> 1, 'Unpopular' -> 0); any other label maps to NaN
popularity_codes = {'Popular': 1, 'Unpopular': 0}
books['popularity_binary'] = books['popularity'].map(popularity_codes)

In [69]:
# Process review summary and review text using TF-IDF vectorization.
# Each text column gets its OWN vectorizer: refitting a single instance on the
# second column would overwrite the vocabulary learned for the first one, making
# the summary features impossible to inspect or to reproduce on new data.
TFIDF_MAX_FEATURES = 100  # vocabulary size kept per text column

# Missing text is replaced by an empty string so the vectorizer receives only str values
summary_tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
review_summary_tfidf = summary_tfidf.fit_transform(books['review/summary'].fillna(''))

text_tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
review_text_tfidf = text_tfidf.fit_transform(books['review/text'].fillna(''))

# Backward-compatible alias: `tfidf` used to end up fitted on 'review/text'
tfidf = text_tfidf

In [70]:
# Process authors using label encoding to convert categorical data into numerical format
label_encoder = LabelEncoder()
books['author_encoded'] = label_encoder.fit_transform(books['authors'])

# Process categories: create one indicator column per category.
# Category values are quoted labels such as 'History' or 'Body, Mind & Spirit', so a
# plain split on ',' would cut labels that contain a comma into bogus fragments.
# Instead, drop the outer quotes and split only on the "', '" delimiter that
# separates two quoted labels.
# NOTE(review): assumes multiple categories are written as 'A', 'B' (the format seen
# in the authors column) -- confirm against the full dataset.
category_labels = books['categories'].str.strip().str.strip("'")
categories_dummies = category_labels.str.get_dummies(sep="', '")

# Append only the indicator columns that `books` does not have yet, so re-running
# this cell does not create duplicated columns and existing columns are never shadowed.
new_category_columns = [col for col in categories_dummies.columns if col not in books.columns]
books = pd.concat([books, categories_dummies[new_category_columns]], axis=1)

# Select numerical features (scaled price and review counts)
numerical_features = books[['price_scaled', 'good_reviews', 'total_reviews']].values

In [71]:
# Assemble the model input by stacking the feature groups side by side (column-wise)
feature_blocks = [
    numerical_features,             # scaled price and review counts
    review_summary_tfidf,           # TF-IDF matrix of the review summaries
    review_text_tfidf,              # TF-IDF matrix of the review texts
    categories_dummies.to_numpy(),  # category indicator columns
]
combined_features = hstack(feature_blocks)

In [72]:
# Target vector: the binary popularity column (1 = popular, 0 = unpopular)
y = books.loc[:, 'popularity_binary']

In [73]:
# Hold out 20% of the rows for testing and train on the remaining 80%;
# the fixed random_state keeps the partition identical across runs
split_parts = train_test_split(
    combined_features,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [74]:
# Random Forest configuration, gathered in one place for easy tuning
forest_params = {
    'n_estimators': 100,      # number of trees in the forest
    'criterion': 'gini',      # split quality measured by Gini impurity
    'max_depth': None,        # trees may grow without a depth limit
    'min_samples_split': 2,   # a node needs at least two samples to be split
    'random_state': 42,       # fixed seed for reproducible results
}
model = RandomForestClassifier(**forest_params)

# Fit the forest on the training partition
model.fit(X_train, y_train)

In [75]:
# Score the fitted model on the held-out test set: predict labels, then compare
# them with the true labels to obtain the fraction classified correctly
test_predictions = model.predict(X_test)
model_accuracy = accuracy_score(y_test, test_predictions)

# Report the test accuracy rounded to two decimals
print(f"Model Accuracy: {model_accuracy:.2f}")

Model Accuracy: 0.75
