In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.experimental import enable_iterative_imputer
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler
from sklearn.impute import IterativeImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
df = pd.read_csv("phones-v1.csv")

In [3]:
df.head()

Unnamed: 0,Brand,Model,Storage (GB),RAM (GB),Launch,Dimensions,Weight (g),Display Type,Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range
0,Apple,Apple iPhone X,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,high price
1,Apple,Apple iPhone X,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,low price
2,Samsung,Samsung Galaxy S9+,64,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,low price
3,Samsung,Samsung Galaxy S9+,128,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,medium price
4,Samsung,Samsung Galaxy S9+,256,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,high price


In [4]:
df["Brand"].unique()

array(['Apple', 'Samsung', 'Xiaomi'], dtype=object)

In [5]:
df = pd.concat([df, pd.get_dummies(df["Brand"])], axis=1)
df = df.drop(columns=["Brand"])

In [6]:
df.head()

Unnamed: 0,Model,Storage (GB),RAM (GB),Launch,Dimensions,Weight (g),Display Type,Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi
0,Apple iPhone X,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,high price,True,False,False
1,Apple iPhone X,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,low price,True,False,False
2,Samsung Galaxy S9+,64,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,low price,False,True,False
3,Samsung Galaxy S9+,128,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,medium price,False,True,False
4,Samsung Galaxy S9+,256,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,high price,False,True,False


In [7]:
df["Model"].unique()

array(['Apple iPhone X', 'Samsung Galaxy S9+', 'Samsung Galaxy S9', nan,
       'Apple iPhone XS Max', 'Apple iPhone XS', 'Apple iPhone XR',
       'Samsung Galaxy S10e', 'Samsung Galaxy S10+', 'Samsung Galaxy S10',
       'Samsung Galaxy Note10+', 'Apple iPhone 11', 'Apple iPhone 11 Pro',
       'Apple iPhone 11 Pro Max', 'Samsung Galaxy A51',
       'Samsung Galaxy A71', 'Xiaomi Mi 10 5G', 'Samsung Galaxy S20 5G',
       'Samsung Galaxy S20 Ultra 5G', 'Samsung Galaxy S20+ 5G',
       'Apple iPhone SE (2020)', 'Samsung Galaxy A31',
       'Samsung Galaxy A51 5G', 'Xiaomi Redmi Note 9S',
       'Samsung Galaxy M11', 'Xiaomi Redmi Note 9 Pro',
       'Xiaomi Mi Note 10 Lite', 'Xiaomi Redmi Note 9',
       'Xiaomi Poco F2 Pro', 'Xiaomi Redmi 9', 'Samsung Galaxy A71 5G',
       'Xiaomi Redmi 9A', 'Xiaomi Redmi 9C', 'Samsung Galaxy Note20 5G',
       'Samsung Galaxy Note20 Ultra 5G', 'Xiaomi Poco X3 NFC',
       'Samsung Galaxy S20 FE 5G', 'Xiaomi Mi 10T Pro 5G',
       'Apple iPhone 12', 

In [8]:
df = df.drop(columns=["Model"])

In [9]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Launch,Dimensions,Weight (g),Display Type,Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi
0,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,high price,True,False,False
1,256,3,2017-11-03,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,low price,True,False,False
2,64,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,low price,False,True,False
3,128,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,medium price,False,True,False
4,256,6,2018-03-01,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,high price,False,True,False


In [10]:
df = df.drop(columns=["Launch"])

In [11]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Dimensions,Weight (g),Display Type,Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi
0,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,high price,True,False,False
1,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,OLED,5.8,iOS,True,2716.0,Hexa-Core,2017,458,low price,True,False,False
2,64,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,low price,False,True,False
3,128,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,medium price,False,True,False
4,256,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,AMOLED,6.2,Android,True,3500.0,Octa-Core,2018,529,high price,False,True,False


In [12]:
df.isnull().sum()

Storage (GB)      0
RAM (GB)          0
Dimensions        0
Weight (g)        0
Display Type      0
Display Size      0
OS                0
NFC               0
Battery (mAh)    48
CPU               0
Year              0
PPI Density       0
Price Range      15
Apple             0
Samsung           0
Xiaomi            0
dtype: int64

In [13]:
df["Display Type"].unique()

array(['OLED', 'AMOLED', 'LCD'], dtype=object)

In [14]:
df = pd.concat([df, pd.get_dummies(df["Display Type"])], axis=1)
df = df.drop(columns=["Display Type"])

In [15]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Dimensions,Weight (g),Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi,AMOLED,LCD,OLED
0,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,iOS,True,2716.0,Hexa-Core,2017,458,high price,True,False,False,False,False,True
1,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,iOS,True,2716.0,Hexa-Core,2017,458,low price,True,False,False,False,False,True
2,64,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,Android,True,3500.0,Octa-Core,2018,529,low price,False,True,False,True,False,False
3,128,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,Android,True,3500.0,Octa-Core,2018,529,medium price,False,True,False,True,False,False
4,256,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,Android,True,3500.0,Octa-Core,2018,529,high price,False,True,False,True,False,False


In [16]:
df["OS"].unique()

array(['iOS', 'Android'], dtype=object)

In [17]:
df["OS"] = df["OS"].map({"iOS": 0, "Android": 1})

In [18]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Dimensions,Weight (g),Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi,AMOLED,LCD,OLED
0,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,Hexa-Core,2017,458,high price,True,False,False,False,False,True
1,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,Hexa-Core,2017,458,low price,True,False,False,False,False,True
2,64,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,Octa-Core,2018,529,low price,False,True,False,True,False,False
3,128,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,Octa-Core,2018,529,medium price,False,True,False,True,False,False
4,256,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,Octa-Core,2018,529,high price,False,True,False,True,False,False


In [19]:
df["CPU"].unique()

array(['Hexa-Core', 'Octa-Core', 'Quad-Core'], dtype=object)

In [20]:
df["CPU"] = df["CPU"].map({"Quad-Core": 4, "Hexa-Core": 6, "Octa-Core": 8})

In [21]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Dimensions,Weight (g),Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi,AMOLED,LCD,OLED
0,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,6,2017,458,high price,True,False,False,False,False,True
1,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,6,2017,458,low price,True,False,False,False,False,True
2,64,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,low price,False,True,False,True,False,False
3,128,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,medium price,False,True,False,True,False,False
4,256,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,high price,False,True,False,True,False,False


In [22]:
df["Price Range"].unique()

array(['high price', 'low price', 'medium price', nan], dtype=object)

In [23]:
ordinal_encoder = OrdinalEncoder(
    categories=[["low price", "medium price", "high price"]],
    handle_unknown="use_encoded_value",
    unknown_value=np.nan,
)

In [24]:
df[["Price Range"]] = ordinal_encoder.fit_transform(df[["Price Range"]])

In [25]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Dimensions,Weight (g),Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi,AMOLED,LCD,OLED
0,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,6,2017,458,2.0,True,False,False,False,False,True
1,256,3,143.6 x 70.9 x 7.7 mm (5.65 x 2.79 x 0.30 in),174.0,5.8,0,True,2716.0,6,2017,458,0.0,True,False,False,False,False,True
2,64,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,0.0,False,True,False,True,False,False
3,128,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,1.0,False,True,False,True,False,False
4,256,6,158.1 x 73.8 x 8.5 mm (6.22 x 2.91 x 0.33 in),189.0,6.2,1,True,3500.0,8,2018,529,2.0,False,True,False,True,False,False


In [26]:
def preprocess_dimensions(value):
    # 143.6 x 70.9 x 7.7 mm
    if not pd.isna(value):
        values = value.split(" mm")[0].split(" x ")
        return [float(value) for value in values]
    return value

In [27]:
df[["Height", "Width", "Thickness"]] = df["Dimensions"].apply(preprocess_dimensions).tolist()

In [28]:
df = df.drop(columns=["Dimensions"])

In [29]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Weight (g),Display Size,OS,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,Apple,Samsung,Xiaomi,AMOLED,LCD,OLED,Height,Width,Thickness
0,256,3,174.0,5.8,0,True,2716.0,6,2017,458,2.0,True,False,False,False,False,True,143.6,70.9,7.7
1,256,3,174.0,5.8,0,True,2716.0,6,2017,458,0.0,True,False,False,False,False,True,143.6,70.9,7.7
2,64,6,189.0,6.2,1,True,3500.0,8,2018,529,0.0,False,True,False,True,False,False,158.1,73.8,8.5
3,128,6,189.0,6.2,1,True,3500.0,8,2018,529,1.0,False,True,False,True,False,False,158.1,73.8,8.5
4,256,6,189.0,6.2,1,True,3500.0,8,2018,529,2.0,False,True,False,True,False,False,158.1,73.8,8.5


In [30]:
df.isnull().sum()

Storage (GB)      0
RAM (GB)          0
Weight (g)        0
Display Size      0
OS                0
NFC               0
Battery (mAh)    48
CPU               0
Year              0
PPI Density       0
Price Range      15
Apple             0
Samsung           0
Xiaomi            0
AMOLED            0
LCD               0
OLED              0
Height            0
Width             0
Thickness         0
dtype: int64

In [31]:
df["Battery (mAh)"].unique()

array([2716., 3500., 3000., 3174., 2658., 2942.,   nan, 3100., 3400.,
       4100., 4300., 3110., 3046., 3969., 4000., 4500., 4780., 5000.,
       1821., 5020., 5260., 4700., 5160., 2815., 2227., 3687., 6000.,
       4600., 4800., 4520., 4250., 5100., 4400., 3300., 2438., 4352.,
       3240., 3095., 3700., 2018., 5080., 3279., 3200., 4323., 4820.,
       3900., 3349., 4441., 4383., 3274., 8000., 4610., 5030., 3582.,
       4685., 3561., 4674.])

In [32]:
cols = ["Battery (mAh)", "Display Size", "CPU", "Apple", "Height", "Thickness", "OLED", "Android", "iOS"]

In [33]:
iterative_imputer = IterativeImputer()

In [36]:
df["OS"] = df["OS"].map({0: "iOS", 1: "Android"})

In [37]:
df = pd.concat([df, pd.get_dummies(df["OS"])], axis=1)
df = df.drop(columns=["OS"])

In [38]:
df.head()

Unnamed: 0,Storage (GB),RAM (GB),Weight (g),Display Size,NFC,Battery (mAh),CPU,Year,PPI Density,Price Range,...,Samsung,Xiaomi,AMOLED,LCD,OLED,Height,Width,Thickness,Android,iOS
0,256,3,174.0,5.8,True,2716.0,6,2017,458,2.0,...,False,False,False,False,True,143.6,70.9,7.7,False,True
1,256,3,174.0,5.8,True,2716.0,6,2017,458,0.0,...,False,False,False,False,True,143.6,70.9,7.7,False,True
2,64,6,189.0,6.2,True,3500.0,8,2018,529,0.0,...,True,False,True,False,False,158.1,73.8,8.5,True,False
3,128,6,189.0,6.2,True,3500.0,8,2018,529,1.0,...,True,False,True,False,False,158.1,73.8,8.5,True,False
4,256,6,189.0,6.2,True,3500.0,8,2018,529,2.0,...,True,False,True,False,False,158.1,73.8,8.5,True,False


In [39]:
df[cols] = iterative_imputer.fit_transform(df[cols])

In [40]:
df = df.dropna(subset=["Price Range"], axis=0)

In [41]:
df.isnull().sum()

Storage (GB)     0
RAM (GB)         0
Weight (g)       0
Display Size     0
NFC              0
Battery (mAh)    0
CPU              0
Year             0
PPI Density      0
Price Range      0
Apple            0
Samsung          0
Xiaomi           0
AMOLED           0
LCD              0
OLED             0
Height           0
Width            0
Thickness        0
Android          0
iOS              0
dtype: int64

In [42]:
X, y = df.drop(columns=["Price Range"]), df["Price Range"]

In [43]:
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.2)

In [44]:
scaler = MinMaxScaler()

In [45]:
train_X = scaler.fit_transform(train_X)

In [46]:
test_X = scaler.transform(test_X)

In [47]:
classifier = LogisticRegression().fit(train_X, train_y)

In [48]:
pred_y = classifier.predict(test_X)

In [49]:
print(classification_report(test_y, pred_y))

              precision    recall  f1-score   support

         0.0       0.54      0.29      0.38        24
         1.0       0.83      0.92      0.87       159
         2.0       0.36      0.24      0.29        17

    accuracy                           0.79       200
   macro avg       0.58      0.48      0.51       200
weighted avg       0.76      0.79      0.76       200



In [50]:
classifier = DecisionTreeClassifier().fit(train_X, train_y)

In [51]:
classifier_lr = LogisticRegression().fit(train_X, train_y)
pred_y_lr = classifier_lr.predict(test_X)

In [54]:
print("=== Logistic Regression ===")
print(f"Accuracy: {accuracy_score(test_y, pred_y_lr):.4f}")
print(f"Precision: {precision_score(test_y, pred_y_lr, average='weighted'):.4f}")
print(f"Recall: {recall_score(test_y, pred_y_lr, average='weighted'):.4f}")
print(f"F1-Score: {f1_score(test_y, pred_y_lr, average='weighted'):.4f}")
print(classification_report(test_y, pred_y_lr))

=== Logistic Regression ===
Accuracy: 0.7850
Precision: 0.7550
Recall: 0.7850
F1-Score: 0.7626
              precision    recall  f1-score   support

         0.0       0.54      0.29      0.38        24
         1.0       0.83      0.92      0.87       159
         2.0       0.36      0.24      0.29        17

    accuracy                           0.79       200
   macro avg       0.58      0.48      0.51       200
weighted avg       0.76      0.79      0.76       200



In [55]:
classifier_dt = DecisionTreeClassifier().fit(train_X, train_y)
pred_y_dt = classifier_dt.predict(test_X)

In [56]:
print("\n=== Decision Tree ===")
print(f"Accuracy: {accuracy_score(test_y, pred_y_dt):.4f}")
print(f"Precision: {precision_score(test_y, pred_y_dt, average='weighted'):.4f}")
print(f"Recall: {recall_score(test_y, pred_y_dt, average='weighted'):.4f}")
print(f"F1-Score: {f1_score(test_y, pred_y_dt, average='weighted'):.4f}")
print(classification_report(test_y, pred_y_dt))


=== Decision Tree ===
Accuracy: 0.7750
Precision: 0.7814
Recall: 0.7750
F1-Score: 0.7780
              precision    recall  f1-score   support

         0.0       0.50      0.54      0.52        24
         1.0       0.87      0.86      0.86       159
         2.0       0.33      0.35      0.34        17

    accuracy                           0.78       200
   macro avg       0.57      0.58      0.58       200
weighted avg       0.78      0.78      0.78       200



In [57]:
results = pd.DataFrame({
    'Logistic Regression': [accuracy_score(test_y, pred_y_lr), f1_score(test_y, pred_y_lr, average='weighted')],
    'Decision Tree': [accuracy_score(test_y, pred_y_dt), f1_score(test_y, pred_y_dt, average='weighted')]
}, index=['Accuracy', 'F1-Score'])

print("\n=== Model Comparison ===")
print(results)
print(f"\nBest Model: {results.loc['F1-Score'].idxmax()}")


=== Model Comparison ===
          Logistic Regression  Decision Tree
Accuracy             0.785000       0.775000
F1-Score             0.762646       0.778019

Best Model: Decision Tree
