In [1]:
import os
import pickle

import pandas as pd
import mysql.connector
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import RFE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import RepeatedKFold, KFold
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
import xgboost as xgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb

%matplotlib inline

In [3]:

# Connect to the MySQL database.
# Connection settings are read from the environment when present; the fallback
# values are the local-development settings this notebook used originally.
conn = mysql.connector.connect(
    host=os.environ.get("MYSQL_HOST", "localhost"),
    user=os.environ.get("MYSQL_USER", "root"),
    password=os.environ.get("MYSQL_PASSWORD", "root"),
    database=os.environ.get("MYSQL_DATABASE", "GooglePlayStore"),
)

# Fetch the whole `rating` table; the `finally` clause closes the connection
# even when the query raises.
query = "SELECT * FROM rating"
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()

# Remove the `id` column from the loaded frame
df = df.drop(columns='id')

# Show first few rows
print(df.head())



  df = pd.read_sql(query, conn)


   Category  Rating_Count  Installs  Free  Size_in_Mb  Content_Rating  \
0   2.24681         262.0   10000.0     1    4.200000         2.18234   
1   2.70581           8.0      10.0     1   36.000000         2.18234   
2   2.45978        2352.0  500000.0     1   26.000000         2.18234   
3   1.93159          70.0   10000.0     1    0.097656         2.18234   
4   1.93159           0.0      10.0     1    2.900000         2.18234   

   Ad_Supported  In_App_Purchases  Editors_Choice  Transformed_Rating  
0             0                 1               0                   5  
1             1                 0               0                   5  
2             0                 0               0                   3  
3             0                 0               0                   3  
4             0                 0               0                   0  


In [4]:
# Number of rows per Transformed_Rating value
df["Transformed_Rating"].value_counts()

Transformed_Rating
0    92553
4    50957
5    36943
3    15689
2     3489
1      369
Name: count, dtype: int64

## Feature Selection

In [5]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(200000, 10)

In [17]:
# Separate the target column from the predictors
Y = df['Transformed_Rating'].astype(int)  # integer class labels
X = df.drop(columns='Transformed_Rating')

# 70/30 train/test split, stratified on the target
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.3,
    random_state=7,
    stratify=Y,
)

# Fit a decision tree and read off its per-feature importances
dt = DecisionTreeClassifier(random_state=7)
dt.fit(X_train, Y_train)

# Keep the features whose importance is strictly above the mean importance
mean_imp = dt.feature_importances_.mean()
above_mean = dt.feature_importances_ > mean_imp
selected_features = X_train.columns[above_mean]
print("Selected Features Based on Decision Tree Importance:")
print(selected_features)

# Recursive Feature Elimination around a fresh decision tree
# (no n_features_to_select is given, so RFE's default applies)
rfe = RFE(estimator=DecisionTreeClassifier(random_state=7))
rfe.fit(X_train, Y_train)

# List every feature with its RFE rank, best rank first
print("\nRFE Feature Rankings:")
ranked = sorted(zip(rfe.ranking_, X_train.columns))
for rank, feature in ranked:
    print(f"{feature}: {rank}")


Selected Features Based on Decision Tree Importance:
Index(['Rating_Count', 'Size_in_Mb'], dtype='object')

RFE Feature Rankings:
Category: 1
Content_Rating: 1
Rating_Count: 1
Size_in_Mb: 1
Installs: 2
In_App_Purchases: 3
Ad_Supported: 4
Free: 5
Editors_Choice: 6


In [18]:
# Name of the target column; used below instead of repeating the literal
target_col = "Transformed_Rating"
class_counts = df[target_col].value_counts()

# Print class distribution
print("Class Distribution:")
print(class_counts)

Class Distribution:
Transformed_Rating
0    92553
4    50957
5    36943
3    15689
2     3489
1      369
Name: count, dtype: int64


In [None]:
# Logistic Regression: fit on the training split, evaluate on the test split

logreg_c = LogisticRegression(
    class_weight='balanced',
    max_iter=500,
    random_state=7,
)
logreg_c.fit(X_train, Y_train)

# Test-set predictions, confusion matrix and accuracy
logreg_pred = logreg_c.predict(X_test)
logreg_cm = confusion_matrix(Y_test, logreg_pred)
logreg_ac = accuracy_score(Y_test, logreg_pred)
print('LogisticRegression_accuracy:',logreg_ac)

In [None]:
# Random Forest: fit on the training split, evaluate on the test split

rdf_c = RandomForestClassifier(random_state=7)
rdf_c.fit(X_train, Y_train)

# Test-set predictions and confusion matrix
rdf_pred = rdf_c.predict(X_test)
rdf_cm = confusion_matrix(Y_test, rdf_pred)

# Arguments are passed as (y_true, y_pred), matching the other model cells
rdf_ac = accuracy_score(Y_test, rdf_pred)
print('RandomForest_Accuracy: ', rdf_ac)

In [None]:
# Decision Tree (entropy criterion, depth-limited): fit on the training split,
# evaluate on the test split

dtree_c = DecisionTreeClassifier(
    random_state=7,
    criterion='entropy',
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
)
dtree_c.fit(X_train, Y_train)

# Test-set predictions and confusion matrix
dtree_pred = dtree_c.predict(X_test)
dtree_cm = confusion_matrix(Y_test, dtree_pred)

# Arguments are passed as (y_true, y_pred), matching the other model cells
dtree_ac = accuracy_score(Y_test, dtree_pred)
print('DecisionTreeClassifier_Accuracy: ',dtree_ac)

In [None]:
# Candidate BernoulliNB settings to search over
param_grid = dict(
    alpha=[0.01, 0.1, 0.5, 1.0],
    binarize=[0.0, 0.5, 1.0],
    fit_prior=[True, False],
)

# Base estimator handed to the search
NB = BernoulliNB()

# 5-fold cross-validated grid search scored on accuracy, using all cores
grid_search = GridSearchCV(
    estimator=NB,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, Y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

In [None]:
# Bernoulli Naive Bayes: fit on the training split, evaluate on the test split

NB = BernoulliNB(binarize=0.0)
NB.fit(X_train, Y_train)

# Test-set predictions and accuracy
y_pred = NB.predict(X_test)
nb_ac = accuracy_score(Y_test, y_pred)
print("Bernoulli Naive Bayes_Accuracy: ", nb_ac)

In [None]:
# HistGradientBoosting: fit on the training split, evaluate on the test split.
# HistGradientBoostingClassifier is already imported in the notebook's import
# cell, so it is not re-imported here.

hgbc = HistGradientBoostingClassifier(max_iter=100, learning_rate=0.1, random_state=7)
hgbc.fit(X_train, Y_train)

# Test-set predictions and accuracy
y_pred = hgbc.predict(X_test)
hgbc_ac = accuracy_score(Y_test, y_pred)

print("HistGradientBoosting Classifier Accuracy:", hgbc_ac)

In [None]:
# AdaBoost: fit on the training split, evaluate on the test split

abc = AdaBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=7,
)
abc.fit(X_train, Y_train)

# Test-set predictions and accuracy
y_pred = abc.predict(X_test)
abc_ac = accuracy_score(Y_test, y_pred)

print("AdaBoost Classifier Accuracy:", abc_ac)


In [None]:
# Candidate XGBoost settings to search over
param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'gamma': [0, 0.1, 0.2]
}

# Base estimator for the search. It is bound to `xgb_estimator` rather than
# `xgb` so this cell does not overwrite the `xgb` alias of the xgboost module
# imported at the top of the notebook.
xgb_estimator = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=7)

# 5-fold cross-validated grid search scored on accuracy, using all cores
grid_search = GridSearchCV(xgb_estimator, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
grid_search.fit(X_train, Y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)



In [None]:
# XGBoost: fit on the training split, evaluate on the test split.
# NOTE(review): binding the model to `xgb` replaces the `xgb` alias of the
# xgboost module imported at the top of the notebook; the name is kept because
# later cells may refer to the fitted model by it - confirm before renaming.

xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=9,
    gamma=0.5,
    random_state=7,
    use_label_encoder=False,
    eval_metric='logloss',
)
xgb.fit(X_train, Y_train)

# Test-set predictions and accuracy
y_pred = xgb.predict(X_test)
xgb_ac = accuracy_score(Y_test, y_pred)

print("XGBoost Classifier Accuracy:", xgb_ac)

In [None]:
# Candidate LightGBM settings to search over
param_grid = dict(
    n_estimators=[100, 200, 500],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    max_depth=[-1, 5, 10],  # -1 means no limit
)

# Base estimator handed to the search
lgb_model = lgb.LGBMClassifier(random_state=7)

# 5-fold cross-validated grid search scored on accuracy, using all cores
grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, Y_train)

# Report the winning combination and its mean cross-validated accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

In [None]:
# LightGBM: fit on the training split, evaluate on the test split
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=7,
)
lgb_model.fit(X_train, Y_train)

# Test-set predictions and accuracy
y_pred = lgb_model.predict(X_test)
lgb_ac = accuracy_score(Y_test, y_pred)
print("LightGBM Classifier Accuracy:", lgb_ac)

In [None]:
# Fine-tuning CatBoost Classifier: candidate settings to search over
param_grid_catboost = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.1, 0.2],
    'depth': [4, 6, 8]
}

# 5-fold cross-validated grid search scored on accuracy, using all cores
catboost = CatBoostClassifier(verbose=0)
grid_search_catboost = GridSearchCV(catboost, param_grid_catboost, cv=5, scoring='accuracy', n_jobs=-1)

# The training labels are named `Y_train` (capital Y) by the train/test split;
# the lowercase `y_train` used here before is never defined.
grid_search_catboost.fit(X_train, Y_train)

# Best parameters and score for CatBoost
print("Best Parameters (CatBoost):", grid_search_catboost.best_params_)
print("Best Accuracy (CatBoost):", grid_search_catboost.best_score_)

In [None]:
# CatBoost: fit on the training split, evaluate on the test split

Cat_Boost = CatBoostClassifier(verbose=0, n_estimators=100)
Cat_Boost.fit(X_train, Y_train)

# Score on the held-out test split, like every other model cell in this
# notebook; scoring on (X_train, Y_train) would report training accuracy.
cb_ac = Cat_Boost.score(X_test, Y_test)
print("CatBoost_Accuracy: ",cb_ac)