In [6]:
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the books CSV from the archive/cleaned folder; the path is relative,
# so it resolves against the notebook's current working directory.
data = pd.read_csv(
    'archive/cleaned/1_git_books.csv',
)


def rating_to_label(r):
    """Map a numeric rating to the label of its quarter-point bucket.

    Thresholds are checked from highest to lowest, so a rating lands in the
    highest bucket whose lower bound it reaches (4.25 is labelled
    "4.25-4.5", not "4.0-4.25").

    Args:
        r: Numeric rating (int or float).

    Returns:
        str: "4.75-5.0" for ratings of 4.75 and above, one of the
        quarter-point range labels down to "2.5-2.75", or "Low" for anything
        below 2.5. NaN fails every comparison and therefore also maps to
        "Low".
    """
    if r >= 4.75:
        # The top bucket has its own label instead of sharing "4.5-4.75"
        # with the branch below.
        return "4.75-5.0"
    elif r >= 4.5:
        return "4.5-4.75"
    elif r >= 4.25:
        return "4.25-4.5"
    elif r >= 4.0:
        return "4.0-4.25"
    elif r >= 3.75:
        return "3.75-4.0"
    elif r >= 3.5:
        return "3.5-3.75"
    elif r >= 3.25:
        return "3.25-3.5"
    elif r >= 3:
        return "3.0-3.25"
    elif r >= 2.75:
        return "2.75-3.0"
    elif r >= 2.5:
        return "2.5-2.75"
    else:
        # Also reached for NaN, since every comparison above is False for NaN.
        return "Low"

# Bucket every numeric rating into its class label (the prediction target).
data['rating_label'] = data['rating'].apply(rating_to_label)

y = data['rating_label']

# The 'most_frequent' baseline predicts from the training labels alone, so a
# placeholder row per sample is enough; only the row count has to match `y`.
X = [[None]] * len(data)  # dummy inputs

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Baseline: always predict the most frequent label seen in training.
dummy = DummyClassifier(strategy='most_frequent')  # or 'stratified', 'uniform', etc.
dummy.fit(X_train, y_train)


new_books = [['Book X'], ['Book Y'], ['Book Z']]
# One placeholder row per new book, so the number of predictions follows
# `new_books` instead of a hard-coded count.
predictions = dummy.predict([[None]] * len(new_books))

print("Predicted ratings:", predictions)


Predicted ratings: ['4.0-4.25' '4.0-4.25' '4.0-4.25']


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


# Read the books CSV from the archive/cleaned folder; the path is relative,
# so it resolves against the notebook's current working directory.
data = pd.read_csv(
    'archive/cleaned/1_git_books.csv',
)


def rating_to_label(r):
    """Map a numeric rating to the label of its quarter-point bucket.

    Thresholds are checked from highest to lowest, so a rating lands in the
    highest bucket whose lower bound it reaches (4.25 is labelled
    "4.25-4.5", not "4.0-4.25").

    Args:
        r: Numeric rating (int or float).

    Returns:
        str: "4.75-5.0" for ratings of 4.75 and above, one of the
        quarter-point range labels down to "2.5-2.75", or "Low" for anything
        below 2.5. NaN fails every comparison and therefore also maps to
        "Low".
    """
    if r >= 4.75:
        # The top bucket has its own label instead of sharing "4.5-4.75"
        # with the branch below.
        return "4.75-5.0"
    elif r >= 4.5:
        return "4.5-4.75"
    elif r >= 4.25:
        return "4.25-4.5"
    elif r >= 4.0:
        return "4.0-4.25"
    elif r >= 3.75:
        return "3.75-4.0"
    elif r >= 3.5:
        return "3.5-3.75"
    elif r >= 3.25:
        return "3.25-3.5"
    elif r >= 3:
        return "3.0-3.25"
    elif r >= 2.75:
        return "2.75-3.0"
    elif r >= 2.5:
        return "2.5-2.75"
    else:
        # Also reached for NaN, since every comparison above is False for NaN.
        return "Low"

# Derive the class label for each book from its numeric rating.
data['rating_label'] = data['rating'].apply(rating_to_label)

# Integer-encode the two categorical columns; one encoder per column so each
# keeps its own fitted category-to-code mapping.
le_genre, le_author = LabelEncoder(), LabelEncoder()
data['genre_enc'] = le_genre.fit_transform(data['genre'])
data['author_enc'] = le_author.fit_transform(data['author'])

# Target is the rating bucket; features are the encoded genre and author.
y = data['rating_label']
X = data[['genre_enc', 'author_enc']]

# Seeded split so the partition is reproducible. No test_size is passed, so
# scikit-learn's default proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [8]:
# Baseline model: always predicts the most frequent training label, giving a
# floor that the real classifier has to beat.
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_test)

print("=== Dummy Classifier ===")
print("Accuracy:", accuracy_score(y_test, y_pred_dummy))
# The baseline predicts a single class, so precision for every other class is
# 0/0. zero_division=0 reports those as 0.0 explicitly instead of emitting an
# UndefinedMetricWarning; the reported numbers are unchanged.
print("Classification Report:\n", classification_report(y_test, y_pred_dummy, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dummy))


=== Dummy Classifier ===
Accuracy: 0.37570394207562346
Classification Report:
               precision    recall  f1-score   support

    2.75-3.0       0.00      0.00      0.00         3
    3.0-3.25       0.00      0.00      0.00         9
    3.25-3.5       0.00      0.00      0.00        62
    3.5-3.75       0.00      0.00      0.00       269
    3.75-4.0       0.00      0.00      0.00       803
    4.0-4.25       0.38      1.00      0.55       934
    4.25-4.5       0.00      0.00      0.00       369
    4.5-4.75       0.00      0.00      0.00        37

    accuracy                           0.38      2486
   macro avg       0.05      0.12      0.07      2486
weighted avg       0.14      0.38      0.21      2486

Confusion Matrix:
 [[  0   0   0   0   0   3   0   0]
 [  0   0   0   0   0   9   0   0]
 [  0   0   0   0   0  62   0   0]
 [  0   0   0   0   0 269   0   0]
 [  0   0   0   0   0 803   0   0]
 [  0   0   0   0   0 934   0   0]
 [  0   0   0   0   0 369   0   0]
 [  0   0   0   0   0  37   0   0]]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# Train a seeded random forest on the training split, then predict labels for
# the held-out split (fit returns the estimator, so the calls can be chained).
rf = RandomForestClassifier(random_state=42)
y_pred_rf = rf.fit(X_train, y_train).predict(X_test)

# Report accuracy, per-class metrics and the confusion matrix for the
# held-out predictions.
print("\n=== Random Forest Classifier ===")
print(
    "Accuracy:",
    accuracy_score(y_test, y_pred_rf),
)
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred_rf),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_test, y_pred_rf),
)


=== Random Forest Classifier ===
Accuracy: 0.45253419147224455
Classification Report:
               precision    recall  f1-score   support

    2.75-3.0       0.00      0.00      0.00         3
    3.0-3.25       0.00      0.00      0.00         9
    3.25-3.5       0.16      0.10      0.12        62
    3.5-3.75       0.27      0.24      0.25       269
    3.75-4.0       0.46      0.48      0.47       803
    4.0-4.25       0.51      0.54      0.52       934
    4.25-4.5       0.46      0.42      0.44       369
    4.5-4.75       0.27      0.16      0.20        37

    accuracy                           0.45      2486
   macro avg       0.27      0.24      0.25      2486
weighted avg       0.44      0.45      0.45      2486

Confusion Matrix:
 [[  0   0   1   0   1   1   0   0]
 [  0   0   2   1   4   1   1   0]
 [  0   1   6  23  23   9   0   0]
 [  1   3   8  64 121  61  11   0]
 [  0   1  10  87 389 273  42   1]
 [  0   2   7  47 259 505 113   1]
 [  0   0   4  12  48 136 155  1