In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
url =  "https://drive.google.com/file/d/1pjDDlI4kJ75GLOOj_HkFqqeV8Of_c6H1/view?usp=share_link"
url='https://drive.google.com/uc?id=' + url.split('/')[-2]

In [3]:
data = pd.read_csv(url)

In [4]:
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   battery_power  2000 non-null   int64  
 1   blue           2000 non-null   int64  
 2   clock_speed    2000 non-null   float64
 3   dual_sim       2000 non-null   int64  
 4   fc             2000 non-null   int64  
 5   four_g         2000 non-null   int64  
 6   int_memory     2000 non-null   int64  
 7   m_dep          2000 non-null   float64
 8   mobile_wt      2000 non-null   int64  
 9   n_cores        2000 non-null   int64  
 10  pc             2000 non-null   int64  
 11  px_height      2000 non-null   int64  
 12  px_width       2000 non-null   int64  
 13  ram            2000 non-null   int64  
 14  sc_h           2000 non-null   int64  
 15  sc_w           2000 non-null   int64  
 16  talk_time      2000 non-null   int64  
 17  three_g        2000 non-null   int64  
 18  touch_sc

In [6]:
data.shape

(2000, 21)

# **1. Remove handle null values (if any).**

In [7]:
data_with_nonull_values = data.dropna()

In [8]:
data_with_nonull_values.shape

(2000, 21)

In [9]:
data_with_nonull_values.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [10]:
data_with_nonull_values.corr()['price_range']

battery_power    0.200723
blue             0.020573
clock_speed     -0.006606
dual_sim         0.017444
fc               0.021998
four_g           0.014772
int_memory       0.044435
m_dep            0.000853
mobile_wt       -0.030302
n_cores          0.004399
pc               0.033599
px_height        0.148858
px_width         0.165818
ram              0.917046
sc_h             0.022986
sc_w             0.038711
talk_time        0.021859
three_g          0.023611
touch_screen    -0.030411
wifi             0.018785
price_range      1.000000
Name: price_range, dtype: float64

## **2)Split data into training and test data**

In [11]:
y = data_with_nonull_values.iloc[:,-1].values

In [12]:
y.shape

(2000,)

In [13]:
X = data_with_nonull_values.drop(columns=['price_range']).iloc[:,:].values

In [14]:
X.shape

(2000, 20)

In [65]:
from sklearn.model_selection import train_test_split

In [16]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25)

## **Logistic Regression**

In [63]:
from sklearn.linear_model import LogisticRegression

In [64]:
model = LogisticRegression(solver='lbfgs', max_iter=1000, verbose = 1, n_jobs=20)

In [19]:
model.fit(X_train,y_train)

[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done   1 out of   1 | elapsed:    6.3s finished


LogisticRegression(max_iter=1000, n_jobs=20, verbose=1)

In [20]:
y_pred = model.predict(X_test)

In [21]:
from  sklearn.metrics import accuracy_score

In [22]:
accuracy_score(y_test,y_pred)

0.7740458015267175

In [23]:
from sklearn.metrics import confusion_matrix

In [24]:
cnf_matrix = confusion_matrix(y_test, y_pred)

In [25]:
cnf_matrix

array([[110,  19,   2,   0],
       [ 21,  77,  23,   1],
       [  0,  19,  63,  31],
       [  0,   0,  23, 111]])

In [26]:
from sklearn.metrics import classification_report

In [27]:
target_names = ['(low cost', 'medium cost', 'high cost', 'very high cost']

In [28]:
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

     (low cost       0.84      0.84      0.84       131
   medium cost       0.67      0.63      0.65       122
     high cost       0.57      0.56      0.56       113
very high cost       0.78      0.83      0.80       134

      accuracy                           0.72       500
     macro avg       0.71      0.71      0.71       500
  weighted avg       0.72      0.72      0.72       500



# **KNN Classification**

In [29]:
from sklearn.neighbors import KNeighborsClassifier

In [30]:
np.sqrt(X_train.shape[0])

38.72983346207417

In [31]:
#we know that k=sqrt(No. of data in training set)
#So,here we have used k as 39 because in previous we have calculated the square root of the number of data in training set
# We also converted the 38.72983346207417 to integer 39 because K must be always an integer

k = 39

In [32]:
knn = KNeighborsClassifier(n_neighbors=k)

In [33]:
knn.fit(X_train,y_train)

KNeighborsClassifier(n_neighbors=39)

In [34]:
y_pred = knn.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score

In [36]:
accuracy_score(y_test,y_pred)

0.93

In [37]:
cnf_matrix = confusion_matrix(y_test, y_pred)

In [38]:
cnf_matrix

array([[128,   3,   0,   0],
       [  8, 111,   3,   0],
       [  0,   7, 103,   3],
       [  0,   0,  11, 123]])

In [39]:
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

     (low cost       0.94      0.98      0.96       131
   medium cost       0.92      0.91      0.91       122
     high cost       0.88      0.91      0.90       113
very high cost       0.98      0.92      0.95       134

      accuracy                           0.93       500
     macro avg       0.93      0.93      0.93       500
  weighted avg       0.93      0.93      0.93       500



## **SVM Classification**

In [40]:
from sklearn.svm import SVC

**linear kernel**

In [45]:
svc = SVC(kernel='linear')

In [46]:
svc.fit(X_train,y_train)

SVC(kernel='linear')

In [47]:
y_pred= svc.predict(X_test)

In [48]:
accuracy_score(y_test,y_pred)

0.98

In [49]:
cnf_matrix = confusion_matrix(y_test, y_pred)

In [50]:
cnf_matrix

array([[130,   1,   0,   0],
       [  0, 119,   3,   0],
       [  0,   0, 108,   5],
       [  0,   0,   1, 133]])

In [51]:
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

     (low cost       1.00      0.99      1.00       131
   medium cost       0.99      0.98      0.98       122
     high cost       0.96      0.96      0.96       113
very high cost       0.96      0.99      0.98       134

      accuracy                           0.98       500
     macro avg       0.98      0.98      0.98       500
  weighted avg       0.98      0.98      0.98       500



**RBF kernel**

In [56]:
svc = SVC(kernel= 'rbf', C=100.0)

In [57]:
svc.fit(X_train,y_train)

SVC(C=100.0)

In [58]:
y_pred= svc.predict(X_test)

In [59]:
accuracy_score(y_test,y_pred)

0.982

In [60]:
cnf_matrix = confusion_matrix(y_test, y_pred)

In [61]:
cnf_matrix

array([[130,   1,   0,   0],
       [  0, 120,   2,   0],
       [  0,   2, 109,   2],
       [  0,   0,   2, 132]])

In [62]:
print(classification_report(y_test, y_pred, target_names=target_names))

                precision    recall  f1-score   support

     (low cost       1.00      0.99      1.00       131
   medium cost       0.98      0.98      0.98       122
     high cost       0.96      0.96      0.96       113
very high cost       0.99      0.99      0.99       134

      accuracy                           0.98       500
     macro avg       0.98      0.98      0.98       500
  weighted avg       0.98      0.98      0.98       500

