In [None]:
# data: https://www.kaggle.com/datasets/iabhishekofficial/mobile-price-classification

# Classification models

In [81]:
import pandas as pd
import sklearn as sk
from sklearn import model_selection
from sklearn import tree
from sklearn import neighbors
from sklearn import svm

In [67]:
# Mobile phone data
# I want to predict "price_range"
df = pd.read_csv("train.csv")
df["px_screen_area"] = df["px_height"] * df["px_width"]

df.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range,px_screen_area
0,842,0,2.2,0,1,0,7,0.6,188,2,...,756,2549,9,7,19,0,0,1,1,15120
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,1988,2631,17,3,7,1,1,0,2,1799140
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1716,2603,11,2,9,1,1,0,2,2167308
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1786,2769,16,8,11,1,0,0,2,2171776
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1212,1411,8,2,15,1,1,0,1,1464096


In [None]:
# check if all variables are numbers
# df.dtypes # ok

# check if there are no NaN values
# df.isna().sum() # all zeros --> ok

In [74]:
X = df[df.columns.drop("price_range")]

# Normalize the data
X = (X - X.min()) / (X.max() - X.min())

y = df["price_range"]

In [75]:
X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(X, y, train_size=0.8, random_state=42)

In [None]:
# model training

# # TREES
# for criterion in ["entropy", "gini"]:
#     for max_depth in range(3, 20):
#         clf = sk.tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=42)
#         clf.fit(X_train, y_train)

#         y_pred = clf.predict(X_test)
#         print("tree -->", criterion, max_depth, "-->", sk.metrics.accuracy_score(y_test, y_pred))

## KNN
# for weights in ["uniform", "distance"]:
#     for n_neighbors in range(21, 101, 4):
#         clf = sk.neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights)
#         clf.fit(X_train, y_train)

#         y_pred = clf.predict(X_test)
#         print("knn -->", weights, n_neighbors, "-->", sk.metrics.accuracy_score(y_test, y_pred))

## SVM
# for kernel in ["linear", "poly", "rbf", "sigmoid"]:
#     clf = svm.SVC(C=1.0, kernel=kernel, random_state=42)

#     clf.fit(X_train, y_train)

#     y_pred = clf.predict(X_test)
#     print("SVM -->", kernel, "-->", sk.metrics.accuracy_score(y_test, y_pred))

In [None]:
# 0.74 --> original data, sk.tree.DecisionTreeClassifier(max_depth=3, random_state=42)
# 0.735 --> px_screen_area, sk.tree.DecisionTreeClassifier(max_depth=3, random_state=42)
# 0.75 --> criterion='entropy'
# 0.875 --> criterion='entropy' max_depth=8
criterion = "entropy"
max_depth = 8
clf = sk.tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth, random_state=42)
clf.fit(X_train,y_train)
sk.tree.plot_tree(clf, feature_names=X.columns);


In [95]:
param_grid = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
  {'C': [1, 10, 100, 1000], 'kernel': ['poly']},
  {'C': [1, 10, 100, 1000], 'kernel': ['sigmoid']},
 ]

clf = svm.SVC(random_state=42)
grid = sk.model_selection.GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1)

grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("SVM -->", sk.metrics.accuracy_score(y_test, y_pred))

print(grid.best_params_)

SVM --> 0.97
{'C': 1000, 'kernel': 'linear'}


In [98]:
param_grid = [{
    'criterion': ["entropy", "gini"],
    'max_depth': range(3, 20),
    "splitter": ["best", "random"],
    "min_samples_leaf": range(5, 20, 5),
    "min_samples_split": range(10, 50, 5)
}]

clf = sk.tree.DecisionTreeClassifier(random_state=42)
grid = sk.model_selection.GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1)

grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("TREE -->", sk.metrics.accuracy_score(y_test, y_pred))
print(grid.best_params_)

TREE --> 0.865
{'criterion': 'entropy', 'max_depth': 9, 'min_samples_leaf': 5, 'min_samples_split': 10, 'splitter': 'best'}


In [104]:
param_grid = [{
    'weights': ["uniform", "distance"],
    'n_neighbors': (21, 101, 4),
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "metric": ["cityblock", "euclidean", "l1", "l2", "manhattan"]
}]

clf = sk.neighbors.KNeighborsClassifier()
grid = sk.model_selection.GridSearchCV(clf, param_grid, scoring="accuracy", n_jobs=-1)

grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)
print("KNN -->", sk.metrics.accuracy_score(y_test, y_pred))
print(grid.best_params_)

KNN --> 0.6975
{'algorithm': 'auto', 'metric': 'cityblock', 'n_neighbors': 101, 'weights': 'distance'}
