In [None]:
import getpass
import os
import pickle
from collections import Counter

import mysql.connector
import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from imblearn.combine import SMOTETomek
from scipy.stats import zscore
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier

%matplotlib inline

In [2]:
# Connect to the MySQL server (mysql.connector driver).
# The password is read from the MYSQL_PASSWORD environment variable, with an
# interactive prompt as fallback, so it is never stored in the notebook.
# Host, user and database fall back to the values this notebook used before.
db_password = os.environ.get("MYSQL_PASSWORD")
if db_password is None:
    db_password = getpass.getpass("MySQL password: ")
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=db_password,
    database=os.environ.get("MYSQL_DATABASE", "GooglePlayStore"),
)
del db_password  # do not keep the secret alive in the kernel namespace
# Fetch Data
query = "SELECT * FROM ratings_prediction"
try:
    df = pd.read_sql(query, conn)
finally:
    # Always release the connection, even when the query fails.
    conn.close()
# Drop the row id and the raw text columns 'Category' and 'Content_Rating'.
df = df.drop(columns=['id','Category', 'Content_Rating'])
# Show first few rows
print(df.head())



   Category_Encoded  Size_in_Mb  Content_Rating_Encoded  Ad_Supported  \
0                 0        10.0                       0             0   
1                 1         2.9                       0             1   
2                 2         3.7                       0             0   
3                 3         1.8                       0             1   
4                 1         6.2                       0             0   

   In_App_Purchases  Installs  Free  Rating_Count  Editors_Choice  \
0                 0        10     1             0               0   
1                 0      5000     1            64               0   
2                 0        50     1             0               0   
3                 0        10     1             5               0   
4                 0       100     1             0               0   

   Transformed_Rating  
0                   0  
1                   4  
2                   0  
3                   5  
4                   0  


In [3]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(2312683, 10)

In [4]:
# Number of rows per target class, most frequent first.
df["Transformed_Rating"].value_counts()

0    1066841
4     588290
5     423009
3     189288
2      41028
1       4227
Name: Transformed_Rating, dtype: int64

In [5]:
# Select every row whose target class is 0.
is_zero_rating = df["Transformed_Rating"].eq(0)
zero_rating_rows = df.loc[is_zero_rating]
# Pick a fixed-size, seeded random subset of those rows to discard.
rows_to_drop = zero_rating_rows.sample(random_state=42, n=700000)
# Remove the sampled rows from the frame and renumber the index from 0.
df = df.drop(index=rows_to_drop.index)
df = df.reset_index(drop=True)
print("New DataFrame Shape:", df.shape)

New DataFrame Shape: (1612683, 10)


In [6]:
# Class counts again, now that part of the class-0 rows has been removed.
df["Transformed_Rating"].value_counts()

4    588290
5    423009
0    366841
3    189288
2     41028
1      4227
Name: Transformed_Rating, dtype: int64

In [7]:
# Separate the feature matrix from the target column.
target_col = "Transformed_Rating"
Y = df[target_col]  # Target variable
X = df.drop(columns=target_col)  # Features

print("🔹 Original Class Distribution:", Counter(Y))
print("New DataFrame Shape:", df.shape)

# Stratified 70/30 split so the test set keeps the class proportions of Y.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    stratify=Y,
    random_state=7,
    test_size=0.3,
)

# Resample the training portion only; the test split is left as it is.
smote_tomek = SMOTETomek(random_state=7)
X_train_resampled, Y_train_resampled = smote_tomek.fit_resample(X_train, Y_train)
print("🔹 After SMOTETomek:", Counter(Y_train_resampled))

print(f"X shape: {X_train_resampled.shape}, Y shape: {Y_train_resampled.shape}")

🔹 Original Class Distribution: Counter({4: 588290, 5: 423009, 0: 366841, 3: 189288, 2: 41028, 1: 4227})
New DataFrame Shape: (1612683, 10)
🔹 After SMOTETomek: Counter({0: 411793, 1: 406872, 2: 394347, 3: 382047, 5: 371337, 4: 360302})
X shape: (2326698, 9), Y shape: (2326698,)


In [8]:
# Dimensions of `df` as a (rows, columns) tuple.
df.shape

(1612683, 10)

In [11]:
# Fit a Bernoulli Naive Bayes model on the resampled training data.
# binarize=0.0 thresholds every feature at zero before fitting.
NB = BernoulliNB(binarize=0.0).fit(X_train_resampled, Y_train_resampled)

# Class predictions and class probabilities for the held-out test split.
y_pred = NB.predict(X_test)
y_prob = NB.predict_proba(X_test)

# Test-set accuracy.
nb_ac = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("\nBernoulli Naive Bayes Accuracy:", nb_ac)

# Test-set F1, averaged over classes weighted by their support.
f1 = f1_score(y_true=Y_test, y_pred=y_pred, average="weighted")
print("\nF1 Score:", f1)

# Persist the fitted model to disk.
with open("NBmodel.pkl", mode="wb") as model_file:
    pickle.dump(NB, model_file)



Bernoulli Naive Bayes Accuracy: 0.47844482797821436

F1 Score: 0.4498093448726393


In [None]:
# Fit a random forest (library defaults, fixed seed) on the resampled training data.
rdf_c = RandomForestClassifier(random_state=7).fit(
    X_train_resampled, Y_train_resampled
)

# Class predictions and class probabilities for the held-out test split.
rdf_pred = rdf_c.predict(X_test)
rdf_prob = rdf_c.predict_proba(X_test)

# Test-set accuracy.
rdf_ac = accuracy_score(y_true=Y_test, y_pred=rdf_pred)
print("\nRandomForest Accuracy:", rdf_ac)

# Test-set F1, averaged over classes weighted by their support.
f1 = f1_score(y_true=Y_test, y_pred=rdf_pred, average="weighted")
print("\nF1 Score:", f1)

In [None]:
# Fit a depth-limited, entropy-based decision tree on the resampled training data.
dtree_c = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=7,
).fit(X_train_resampled, Y_train_resampled)

# Class predictions and class probabilities for the held-out test split.
dtree_pred = dtree_c.predict(X_test)
dtree_prob = dtree_c.predict_proba(X_test)

# Test-set accuracy.
dtree_ac = accuracy_score(y_true=Y_test, y_pred=dtree_pred)
print("\nDecisionTreeClassifier Accuracy:", dtree_ac)

# Test-set F1, averaged over classes weighted by their support.
f1 = f1_score(y_true=Y_test, y_pred=dtree_pred, average="weighted")
print("\nF1 Score:", f1)

In [None]:
# Fit a histogram-based gradient boosting classifier on the resampled training data.
hgbc = HistGradientBoostingClassifier(
    learning_rate=0.1,
    max_iter=100,
    random_state=7,
).fit(X_train_resampled, Y_train_resampled)

# Class predictions and class probabilities for the held-out test split.
y_pred = hgbc.predict(X_test)
y_prob = hgbc.predict_proba(X_test)

# Test-set accuracy.
hgbc_ac = accuracy_score(y_true=Y_test, y_pred=y_pred)
print("HistGradientBoosting Classifier Accuracy:", hgbc_ac)

# Test-set F1, averaged over classes weighted by their support.
f1 = f1_score(y_true=Y_test, y_pred=y_pred, average="weighted")
print("\nF1 Score:", f1)

In [10]:
# Initialize and train CatBoost Classifier
# NOTE(review): early_stopping_rounds is set but fit() gets no eval_set.
# CatBoost's overfitting detector is documented as needing a validation set,
# so all 4000 iterations presumably run — confirm, and consider passing an
# eval_set carved out of the training data (not X_test) to shorten training.
Cat_Boost = CatBoostClassifier(
    verbose=300,
    n_estimators=4000,
    learning_rate=0.3,
    depth=10,
    early_stopping_rounds=300,
)
Cat_Boost.fit(X_train_resampled, Y_train_resampled)

# Class predictions and class probabilities for the held-out test split.
y_pred = Cat_Boost.predict(X_test)
y_prob = Cat_Boost.predict_proba(X_test)

# Report accuracy on the test split, the same way the other model cells do.
# Scoring on the resampled training data, as before, measured fit to data the
# model had already seen and was not comparable with nb_ac / rdf_ac / etc.
cb_ac = accuracy_score(Y_test, y_pred)
print("\nCatBoost Accuracy:", cb_ac)

# Test-set F1, averaged over classes weighted by their support.
f1 = f1_score(Y_test, y_pred, average="weighted")
print("\nF1 Score:", f1)

# Save model
with open("model.pkl", "wb") as f:
    pickle.dump(Cat_Boost, f)

0:	learn: 1.4409385	total: 3.28s	remaining: 3h 38m 40s


KeyboardInterrupt: 