In [1]:
import pandas as pd

In [2]:
#Read the training CSV, then separate the target column from the feature columns
raw = pd.read_csv(r"C:\Users\Windows 10\Desktop\mobile_price_classification\dataset\train.csv")
y = raw["price_range"]                # target labels
X = raw.drop(columns=["price_range"])  # every remaining column is a feature

In [3]:
from sklearn.preprocessing import StandardScaler

# Applying feature scaling.
# Features to be scaled:
#     --battery_power
#     --px_height
#     --px_width
#     --ram

#Choosing features for scaling
# .copy() keeps the selection independent of X so X itself is untouched here.
scale = X[["battery_power","px_height","px_width","ram"]].copy()
print(scale.describe())

#Scale features
# fit_transform() learns the per-column statistics and returns the
# standardized array in one pass; the previous code called fit_transform()
# (discarding its result) and then transform() again on the same data.
# NOTE(review): the scaler is fitted on every row, including the rows that
# are later sliced off as the validation set -- consider fitting on the
# training rows only.
scaler = StandardScaler()
scale = scaler.fit_transform(scale)
print(scale)

       battery_power    px_height     px_width          ram
count    2000.000000  2000.000000  2000.000000  2000.000000
mean     1238.518500   645.108000  1251.515500  2124.213000
std       439.418206   443.780811   432.199447  1084.732044
min       501.000000     0.000000   500.000000   256.000000
25%       851.750000   282.750000   874.750000  1207.500000
50%      1226.000000   564.000000  1247.000000  2146.500000
75%      1615.250000   947.250000  1633.000000  3064.500000
max      1998.000000  1960.000000  1998.000000  3998.000000
[[-0.90259726 -1.40894856 -1.14678403  0.39170341]
 [-0.49513857  0.58577791  1.70446468  0.46731702]
 [-1.5376865   1.39268422  1.07496821  0.44149774]
 ...
 [ 1.53077336  0.50238257  0.88056489  0.86013895]
 [ 0.62252745 -0.69670724 -1.34581601 -1.15745352]
 [-1.65833069 -0.3653798  -1.15141268  1.65500399]]


In [4]:
#Overwrite each raw column with its standardized counterpart.
#Column order must match the order used when `scale` was built.
scaled_columns = ["battery_power", "px_height", "px_width", "ram"]
for position, column in enumerate(scaled_columns):
    X[column] = scale[:, position]

print("After scaling:")
X.describe()

After scaling:


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
count,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0,2000.0
mean,2.149392e-16,0.495,1.52225,0.5095,4.3095,0.5215,32.0465,0.50175,140.249,4.5205,9.9165,1.181277e-16,6.084022e-17,-1.811884e-16,12.3065,5.767,11.011,0.7615,0.503,0.507
std,1.00025,0.5001,0.816004,0.500035,4.341444,0.499662,18.145715,0.288416,35.399655,2.287837,6.064315,1.00025,1.00025,1.00025,4.213245,4.356398,5.463955,0.426273,0.500116,0.500076
min,-1.678817,0.0,0.5,0.0,0.0,0.0,2.0,0.1,80.0,1.0,0.0,-1.454027,-1.739251,-1.722711,5.0,0.0,2.0,0.0,0.0,0.0
25%,-0.8804033,0.0,0.7,0.0,1.0,0.0,16.0,0.2,109.0,3.0,5.0,-0.8167289,-0.8719579,-0.8453168,9.0,2.0,6.0,1.0,0.0,0.0
50%,-0.02849593,0.0,1.5,1.0,3.0,1.0,32.0,0.5,141.0,4.0,10.0,-0.1828116,-0.01045034,0.02055123,12.0,5.0,11.0,1.0,1.0,1.0
75%,0.857556,1.0,2.2,1.0,7.0,1.0,48.0,0.8,170.0,7.0,15.0,0.6810064,0.8828792,0.8670548,16.0,9.0,16.0,1.0,1.0,1.0
max,1.728812,1.0,3.0,1.0,19.0,1.0,64.0,1.0,200.0,8.0,20.0,2.963672,1.727608,1.727851,19.0,18.0,20.0,1.0,1.0,1.0


In [5]:
#Positional hold-out split: rows before the cut-off train the model,
#the remaining rows are kept for validation (no shuffling is applied).
n_train = 1800
X_train, X_val = X.iloc[:n_train], X.iloc[n_train:]
y_train, y_val = y.iloc[:n_train], y.iloc[n_train:]

In [6]:
#Defining a Decision Tree
from sklearn.tree import DecisionTreeClassifier
# A fixed random_state makes tree construction repeatable: scikit-learn
# randomly permutes the candidate features at each split (even with
# splitter="best"), so an unseeded tree can differ from run to run.
dt_classifier = DecisionTreeClassifier(random_state=42)

In [7]:
#Hyperparameter grid for the exhaustive search
from sklearn.model_selection import GridSearchCV

# max_depth and max_features are not part of the search (their candidate
# lists were commented out), so they stay at the estimator's defaults.
param_grid = dict(
    criterion=['gini', 'entropy'],
    min_samples_split=[2, 5, 10, 16, 20, 30, 40],
    min_samples_leaf=[1, 2, 4, 8, 10, 16, 20, 30, 40],
    splitter=['best', 'random'],
)

#Wrap the decision tree in a 20-fold cross-validated grid search
grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, cv=20)

In [8]:
#Run the cross-validated search over every grid combination on the training split
grid_search.fit(X=X_train, y=y_train)

In [9]:
#Printing out best parameters and best score
print(f"Best parameters: {grid_search.best_params_}")
# Label corrected: this line reports the search's best cross-validated
# score, not a set of parameters.
print(f"Best score: {grid_search.best_score_}")

Best parameters: {'criterion': 'entropy', 'max_depth': 50, 'min_samples_leaf': 4, 'min_samples_split': 2, 'splitter': 'best'}
Best parameters: 0.8611111111111113


In [15]:
#Training a DecisionTreeClassifier with the best parameters
# Take the winning settings straight from the fitted search instead of
# copying them by hand from printed output, so this cell always agrees
# with whatever grid was actually searched. random_state pins the tree's
# internal randomness so repeated runs build the same model.
dt_classifier = DecisionTreeClassifier(random_state=42, **grid_search.best_params_)
dt_classifier.fit(X_train,y_train)

In [16]:
#Predict a class label for every row of the validation features
y_hat = dt_classifier.predict(X=X_val)

In [12]:
#Converting pandas y_val to numpy y_val
# Guarded so the cell can be re-run safely: after the first run y_val is
# already a NumPy array, which has no .to_numpy() method.
if isinstance(y_val, pd.Series):
    y_val = y_val.to_numpy()

In [17]:
#Measuring accuracy
# Count the positions where prediction and label agree. Pairing by position
# with zip() works whether y_val is still a pandas Series or already a
# NumPy array, whereas y_val[counter] performs a label lookup on a Series
# and fails when its index does not start at 0.
correct = sum(int(predicted == actual) for predicted, actual in zip(y_hat, y_val))

In [18]:
#Report the share of correct validation predictions as a percentage
accuracy_pct = (correct / len(y_val)) * 100
print(f"Accuracy on validation set: {accuracy_pct}%")

Accuracy on validation set: 86.5%


In [19]:
#Tabulate validation predictions against the true labels
from sklearn.metrics import confusion_matrix
# Rows correspond to the true classes (y_val), columns to the predicted ones (y_hat).
cm = confusion_matrix(y_true=y_val, y_pred=y_hat)
print(f"Confusion matrix: \n{cm}")

Confusion matrix: 
[[57  6  0  0]
 [ 5 42  2  0]
 [ 0  4 34  4]
 [ 0  0  6 40]]
