## Import Libraries and Data


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training and validation splits from CSV files in the working directory.
train = pd.read_csv("data_train.csv")
valid = pd.read_csv("data_validation.csv")

# Lift pandas' display limits so printed frames are never truncated
# (None = no limit on the number of columns / rows shown).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Define the column groups shared by the rest of the notebook.

# All column names of the training frame, in file order.
cols = train.columns.values.tolist()

# Feature columns: every column except the last one.
# The printed output below shows the last column is the target, 'price_range'.
var_cols = cols[:-1]

# Feature columns that hold boolean values (only 0 / 1).
bool_cols = ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']

# Numeric feature columns = feature columns minus the boolean ones.
# list.remove raises ValueError if a boolean column is missing from the data.
num_cols = var_cols.copy()
for col in bool_cols:
  num_cols.remove(col)

print("All column names: ", cols)
print("Variable column: ", var_cols)
print()
print("Boolean contained column: ", bool_cols)
print("Numerical contained column: ", num_cols)

All column names:  ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi', 'price_range']
Variable column:  ['battery_power', 'blue', 'clock_speed', 'dual_sim', 'fc', 'four_g', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time', 'three_g', 'touch_screen', 'wifi']

Boolean contained column:  ['blue', 'dual_sim', 'four_g', 'three_g', 'touch_screen', 'wifi']
Numerical contained column:  ['battery_power', 'clock_speed', 'fc', 'int_memory', 'm_dep', 'mobile_wt', 'n_cores', 'pc', 'px_height', 'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time']


## Pre-Processing

In [3]:
# Drop duplicated rows.
# Rows count as duplicates when all feature columns match (the target is not
# compared); the first occurrence of each duplicate group is kept.
train = train.drop_duplicates(subset=var_cols, keep="first")
valid = valid.drop_duplicates(subset=var_cols, keep="first")

In [4]:
# Drop rows with missing values, then keep an independent snapshot of each
# cleaned split. The snapshots are taken before the outlier-removal cell below
# and are what the KNN cells use.
# .copy() is used instead of pd.DataFrame(...) because the DataFrame
# constructor does not copy DataFrame input by default, which would leave the
# snapshot sharing its data with `train` / `valid`.
train = train.dropna()
train_not_normalized = train.copy()
valid = valid.dropna()
valid_not_normalized = valid.copy()

In [5]:
# Drop Outliers Data
def getOutlierRows(df, columns=None, threshold=1.5):
  """Return the index labels of rows that are IQR outliers in at least one column.

  A value is an outlier for a column when it lies below
  ``q1 - threshold * IQR`` or above ``q3 + threshold * IQR``, where q1 / q3
  are that column's 25th / 75th percentiles computed on ``df`` itself.

  Parameters:
    df: DataFrame to inspect.
    columns: iterable of column names to check. Defaults to the module-level
      ``num_cols`` list, which is what the original one-argument call used.
    threshold: IQR multiplier that sets the fence width (default 1.5).

  Returns:
    List of index labels, each appearing once, ordered by first detection
    (column by column, then row order inside a column).
  """
  if columns is None:
    columns = num_cols

  outliers_res = []
  seen = set()  # O(1) membership test; the list alone made de-duplication quadratic

  for col in columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    IQR = q3 - q1

    lower_fence = q1 - threshold * IQR
    upper_fence = q3 + threshold * IQR

    outliers = df[(df[col] < lower_fence) | (df[col] > upper_fence)]
    for i in outliers.index.tolist():
      if i not in seen:
        seen.add(i)
        outliers_res.append(i)

  return outliers_res


# Each split's outliers are computed from that split's own quartiles.
train_outliers = getOutlierRows(train)
valid_outliers = getOutlierRows(valid)

# drop() returns new frames, so the *_not_normalized snapshots taken in the
# previous cell still contain the outlier rows.
train = train.drop(train_outliers)
valid = valid.drop(valid_outliers)

In [6]:
# Split each outlier-filtered frame into its feature columns and its target column.

train_vars = train[var_cols]
train_target = train['price_range']

valid_vars = valid[var_cols]
valid_target = valid['price_range']


In [7]:
# Test the numerical columns for normality using the Shapiro-Wilk test
from scipy import stats

def displaySaphiroTest(column):
  """Run the Shapiro-Wilk normality test on `column` and print its result.

  Prints the test statistic, the p-value and a trailing blank line; returns
  nothing. Reading the output: a p-value below 0.05 rejects the null
  hypothesis, i.e. the data is not normally distributed.
  """
  shapiro_result = stats.shapiro(column)
  report_lines = (
    ("Statistics: ", shapiro_result.statistic),
    ("P-value: ", shapiro_result.pvalue),
  )
  for label, value in report_lines:
    print(label, value)
  print()

# Run the test on every numeric column of the outlier-filtered training set.
for i in num_cols:
  print('Saphiro Wilk Test | Column', i)
  displaySaphiroTest(train[i])

Saphiro Wilk Test | Column battery_power
Statistics:  0.9574270844459534
P-value:  1.060794641360873e-19

Saphiro Wilk Test | Column clock_speed
Statistics:  0.9102329015731812
P-value:  7.294491923693224e-28

Saphiro Wilk Test | Column fc
Statistics:  0.8755258321762085
P-value:  6.4722148841515e-32

Saphiro Wilk Test | Column int_memory
Statistics:  0.9518002867698669
P-value:  5.913443517204487e-21

Saphiro Wilk Test | Column m_dep
Statistics:  0.9267380833625793
P-value:  1.7198614800541072e-25

Saphiro Wilk Test | Column mobile_wt
Statistics:  0.9530051350593567
P-value:  1.073764526404149e-20

Saphiro Wilk Test | Column n_cores
Statistics:  0.9263514280319214
P-value:  1.4977612667151081e-25

Saphiro Wilk Test | Column pc
Statistics:  0.9522290825843811
P-value:  7.302858549771738e-21

Saphiro Wilk Test | Column px_height
Statistics:  0.946509063243866
P-value:  4.871374366791771e-22

Saphiro Wilk Test | Column px_width
Statistics:  0.9575198292732239
P-value:  1.1150424384018077

## Implementation 1 : **KNN Algorithm**

### From Scratch Development

In [10]:
import numpy as np

# Compute the distance between two data points using the Euclidean distance
def euclidean_distance(trainX, test):
    """Return the Euclidean distance between two 1-D feature vectors.

    Parameters:
        trainX: 1-D array-like (pandas Series, list, ndarray) of numeric values.
        test: 1-D array-like of numeric values, at most as long as `trainX`.

    Returns:
        The distance as a numpy float.

    Raises:
        ValueError: if `trainX` has fewer components than `test`.

    Only the first ``len(test)`` components of `trainX` are compared, matching
    the earlier element-by-element loop, so a trailing extra column on the
    training row is ignored.
    """
    train_array = np.asarray(trainX, dtype=float)
    test_array = np.asarray(test, dtype=float)
    n_features = len(test_array)
    if len(train_array) < n_features:
        raise ValueError(
            "trainX has fewer components (%d) than test (%d)"
            % (len(train_array), n_features)
        )
    # Vectorized: one numpy expression instead of a Python loop per feature.
    diff = train_array[:n_features] - test_array
    return np.sqrt(np.sum(diff ** 2))

# Compute the distance between two data points using the Manhattan distance
def manhattan_distance(trainX,test):
  """Return the Manhattan (L1) distance between two 1-D feature vectors.

  Parameters:
    trainX: 1-D array-like (pandas Series, list, ndarray) of numeric values.
    test: 1-D array-like of numeric values, at most as long as `trainX`.

  Returns:
    The sum of absolute component differences as a numpy float.

  Raises:
    ValueError: if `trainX` has fewer components than `test`.

  Only the first ``len(test)`` components of `trainX` are compared, matching
  the earlier element-by-element loop.
  """
  train_array = np.asarray(trainX, dtype=float)
  test_array = np.asarray(test, dtype=float)
  n_features = len(test_array)
  if len(train_array) < n_features:
    raise ValueError(
      "trainX has fewer components (%d) than test (%d)"
      % (len(train_array), n_features)
    )
  # Vectorized: one numpy expression instead of a Python loop per feature.
  return np.sum(np.abs(train_array[:n_features] - test_array))

# Compute the distance between two data points using the Minkowski distance
def minkowski_distance(trainX, test, p=2):
  """Return the Minkowski distance of order `p` between `trainX` and `test`.

  Computed as ``(sum(|trainX - test| ** p)) ** (1 / p)``; the default p=2
  gives the Euclidean distance and p=1 the Manhattan distance.
  """
  abs_gap = np.abs(trainX - test)
  powered_total = np.sum(abs_gap ** p)
  return powered_total ** (1 / p)


In [11]:
def calculate_distance(method, trainX, target):
  """Compute the distance from every row of `trainX` to `target`.

  Parameters:
    method: one of "euclidean", "manhattan" or "minkowski".
    trainX: DataFrame whose rows are the training points.
    target: the query point, passed unchanged to the distance function.

  Returns:
    List with one distance per row of `trainX`, in row order.

  Raises:
    ValueError: if `method` is not a supported name. Previously an unknown
      name left the per-row distance unassigned, which crashed with
      UnboundLocalError or silently returned [] for an empty frame.
  """
  # Validate before doing any work so a typo fails fast with a clear message.
  if method not in ("euclidean", "manhattan", "minkowski"):
    raise ValueError(
      "Unknown distance method %r; expected 'euclidean', 'manhattan' or 'minkowski'"
      % (method,)
    )

  if method == "euclidean":
    distance_fn = euclidean_distance
  elif method == "manhattan":
    distance_fn = manhattan_distance
  else:
    distance_fn = minkowski_distance

  return [distance_fn(trainX.iloc[i], target) for i in range(len(trainX))]

In [12]:
def get_majority(dataset):
  """Return the most frequent `price_range` value in `dataset`.

  When several values tie, the first entry of pandas' mode() result is
  returned.
  """
  most_common = dataset["price_range"].mode()
  return most_common[0]

In [13]:
def get_prediction(trainX, validY, k, method):
  """Predict the `price_range` of one query row by k-nearest-neighbour vote.

  Parameters:
    trainX: training DataFrame; must contain a "price_range" label column.
    validY: the query row as a pandas Series, with or without "price_range".
    k: number of nearest neighbours that vote.
    method: distance name forwarded to calculate_distance.

  Returns:
    The majority `price_range` among the k training rows closest to the query.
  """
  # drop() returns a new object. The result used to be discarded, so the label
  # column silently took part in the distance computation (label leakage).
  train_without_label = trainX.drop(columns="price_range")
  # Remove the label from the query as well, if it is present.
  query = validY.drop("price_range", errors="ignore")

  distances = calculate_distance(method, train_without_label, query)

  # Attach the distances to a copy that still carries the labels for the vote.
  final_result = trainX.copy()
  final_result["distance"] = distances
  sorted_result = final_result.sort_values(by="distance", ascending=True)
  nearest = sorted_result[:k]
  return get_majority(nearest)


In [14]:
def k_neighbors(trainX, validY,k, method):
  """Predict `price_range` for every row of `validY` with k-nearest neighbours.

  Parameters:
    trainX: labelled training DataFrame (contains "price_range").
    validY: DataFrame of rows to classify; a "price_range" column is optional
      and is never used as a feature.
    k: number of neighbours that vote.
    method: distance name ("euclidean", "manhattan" or "minkowski").

  Returns:
    A copy of `validY` with an added "predictions" column.
  """
  result = validY.copy()
  # drop() returns a new frame. The result used to be discarded, so the label
  # column was still passed on as a feature. errors="ignore" also lets
  # unlabelled frames through without a placeholder column.
  valid_without_label = validY.drop(columns="price_range", errors="ignore")
  predictions = [
    get_prediction(trainX, valid_without_label.iloc[i], k, method)
    for i in range(len(valid_without_label))
  ]
  result["predictions"] = predictions
  return result

In [15]:
# Work on copies of the snapshots taken before outlier removal.
x = train_not_normalized.copy()
y = valid_not_normalized.copy()
# Feature Selection
# x = x[['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram','sc_h', 'sc_w', 'three_g','price_range']]
# y = y[['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram','sc_h', 'sc_w', 'three_g','price_range']]

# Classify every validation row with the from-scratch KNN (k=19, Euclidean).
final_result = k_neighbors(x,y,19, "euclidean")
# diff counts misclassified rows; check collects their positional indices.
diff = 0
check = []
for i in range(len(final_result)):
  if(final_result.iloc[i]["predictions"] != final_result.iloc[i]["price_range"]):
    check.append(i)
    diff+=1
# print(diff)
# Accuracy as a percentage of correctly classified validation rows.
accuracy = (len(final_result)-diff)/len(final_result) * 100
print(accuracy)

93.83333333333333


#### Train Data for Kaggle Test

In [16]:
test = pd.read_csv("test.csv")
# Train on the training and validation snapshots combined.
initial_train = pd.concat([train_not_normalized, valid_not_normalized], ignore_index=True)
# Placeholder label column; presumably added because k_neighbors calls
# .drop("price_range") on the frame it classifies — TODO confirm.
test["price_range"] = 0
# Keep the ids aside so they are not used as a feature.
test_no_id = test.drop("id", axis=1)
test_id = test["id"]

# Feature Selection
# initial_train = initial_train[['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram','sc_h', 'sc_w', 'three_g','price_range']]
# test_no_id = test_no_id[['battery_power', 'int_memory', 'mobile_wt', 'px_height', 'px_width', 'ram','sc_h', 'sc_w', 'three_g','price_range']]

# Note: k=7 here, whereas the validation run above used k=19.
res = k_neighbors(initial_train,test_no_id,7, "euclidean")

# Pair each id with its prediction (concatenated column-wise on the index).
result_df = pd.concat([test_id, res["predictions"]], axis=1)
# Set column names
result_df.columns = ["id", "price_range"]

# Save the result to a CSV file
result_df.to_csv("result.csv", index=False)

### Libraries Usage Development

In [24]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.metrics import classification_report, confusion_matrix

# Two reference classifiers: k=19 (same k as the from-scratch validation run) and k=1.
knn19 = KNeighborsClassifier(n_neighbors = 19)
knn1 = KNeighborsClassifier(n_neighbors=1)

# Features / target taken from the snapshots made before outlier removal.
X_train = train_not_normalized.drop("price_range",axis=1)
y_train = train_not_normalized["price_range"]
X_test = valid_not_normalized.drop("price_range",axis=1)
y_test = valid_not_normalized["price_range"]

# k="all" asks SelectKBest to keep every feature. The selector is fitted on
# the training data only and then applied to the validation data.
selector = SelectKBest(k="all")
X_train = selector.fit_transform(X_train, y_train)
X_test = selector.transform(X_test)

# Fit the KNN models on the training data
knn19.fit(X_train, y_train)
knn1.fit(X_train, y_train)

# Predictions on the validation set
preds_knn19 = knn19.predict(X_test)
preds_knn1 = knn1.predict(X_test)

# Calculate accuracy scores
accuracy_knn19 = knn19.score(X_test, y_test)
accuracy_knn1 = knn1.score(X_test, y_test)

# Print the accuracy scores
print("Accuracy for KNN (k=19):", accuracy_knn19)
print("Accuracy for KNN (k=1):", accuracy_knn1)

# Per-class metrics and confusion matrix for the k=19 model only.
print(classification_report(y_test,preds_knn19))
print(confusion_matrix(y_test,preds_knn19))

Accuracy for KNN (k=19): 0.9383333333333334
Accuracy for KNN (k=1): 0.9183333333333333
              precision    recall  f1-score   support

           0       0.96      0.97      0.97       142
           1       0.91      0.93      0.92       144
           2       0.91      0.92      0.91       155
           3       0.99      0.93      0.96       159

    accuracy                           0.94       600
   macro avg       0.94      0.94      0.94       600
weighted avg       0.94      0.94      0.94       600

[[138   4   0   0]
 [  6 134   4   0]
 [  0  10 143   2]
 [  0   0  11 148]]


## Implementation 2 : **Naive Bayes**

### From Scratch Development

In [18]:
# Count the rows for each target value and store the counts in a dictionary

def seperateByTarget(df) -> dict:
  """Count how many rows of `df` belong to each `price_range` value.

  Returns a dict mapping each target value to its row count, with keys in the
  order produced by pandas' value_counts().
  """
  class_counts = df['price_range'].value_counts()
  labels = class_counts.index.tolist()
  sizes = class_counts.tolist()
  return dict(zip(labels, sizes))


# Class counts of the outlier-filtered training set, printed one class per line.
targetDict = seperateByTarget(train)
for i in (targetDict):
  print(i, ": ", targetDict[i])

0 :  357
1 :  352
2 :  343
3 :  337


In [19]:
# Build a dictionary of per-class mean and standard deviation for each column
# (only used for the numeric columns)

def makeColumnsStatisticsDictionary(df, columns=None):
  """Compute the per-class mean and standard deviation of each numeric column.

  Parameters:
    df: DataFrame containing the feature columns and a 'price_range' column.
    columns: iterable of column names to summarise. Defaults to the
      module-level ``num_cols`` list, which is what the original one-argument
      call used.

  Returns:
    Nested dict: ``result[column][target_value] = {'mean': ..., 'std': ...}``.
    'std' is the pandas groupby standard deviation, which is NaN for a class
    that has a single row.
  """
  if columns is None:
    columns = num_cols

  dictionary = dict()
  for col in columns:
    # Group once and reuse the groupby for both statistics.
    grouped = df.groupby('price_range')[col]
    mean = grouped.mean()
    std = grouped.std()

    # mean and std come from the same groupby, so their rows are aligned.
    dictionary[col] = {
      target: {'mean': m, 'std': s}
      for target, m, s in zip(mean.index.tolist(), mean.values.tolist(), std.values.tolist())
    }
  return dictionary

# Per-class statistics of the outlier-filtered training set.
statisticsDict = makeColumnsStatisticsDictionary(train)


# For display: column -> target value -> statistic name : value
for columnDict in statisticsDict:
  print(columnDict)
  for targetVal in statisticsDict[columnDict]:
    print("  ", targetVal)
    for data in statisticsDict[columnDict][targetVal]:
      print("    ", data,":", statisticsDict[columnDict][targetVal][data])


battery_power
   0
     mean : 1122.6218487394958
     std : 408.7939508363507
   1
     mean : 1237.7755681818182
     std : 425.2187882289822
   2
     mean : 1220.597667638484
     std : 441.2731728690964
   3
     mean : 1367.394658753709
     std : 409.85361247872686
clock_speed
   0
     mean : 1.5137254901960784
     std : 0.8408174612152033
   1
     mean : 1.5059659090909092
     std : 0.8046472419760622
   2
     mean : 1.5262390670553936
     std : 0.8107895372252784
   3
     mean : 1.538872403560831
     std : 0.8095794975401495
fc
   0
     mean : 4.221288515406163
     std : 4.141974169458039
   1
     mean : 4.153409090909091
     std : 4.351937679282823
   2
     mean : 4.244897959183674
     std : 3.9990600399353258
   3
     mean : 4.0474777448071215
     std : 4.186585847653323
int_memory
   0
     mean : 31.77310924369748
     std : 18.304894495638568
   1
     mean : 32.0625
     std : 17.354739001551305
   2
     mean : 29.87463556851312
     std : 18.58689166856

In [20]:
# Count the occurrences of each (column value, target value) pair
# (only used for the boolean columns)

def makeColumnProbabilityDictionary(df, targetDict, columns=None):
  """Count rows per (column value, target value) pair for each boolean column.

  Parameters:
    df: DataFrame containing the feature columns and a 'price_range' column.
    targetDict: unused; kept so existing two-argument calls keep working.
    columns: iterable of column names to count. Defaults to the module-level
      ``bool_cols`` list.

  Returns:
    Nested dict: ``result[column][column_value][target_value] = row count``.
    Pairs that never occur in `df` have no entry.
  """
  if columns is None:
    columns = bool_cols

  dictionary = dict()
  for col in columns:
    counts = df.groupby([col, 'price_range'])['price_range'].count()

    per_value = dict()
    # setdefault creates the inner dict on first sight of a column value, so
    # the result no longer depends on groups arriving in sorted order.
    for (colValue, targetValue), n in zip(counts.index.tolist(), counts.values.tolist()):
      per_value.setdefault(colValue, dict())[targetValue] = n
    dictionary[col] = per_value

  return dictionary


# Value/target co-occurrence counts of the outlier-filtered training set.
countDict = makeColumnProbabilityDictionary(train, targetDict)


# For display: column -> column value -> target value : count
for col in countDict:
  print(col)
  for colVal in countDict[col]:
    print("  ", colVal,":")
    for targetVal in countDict[col][colVal]:
      print("    ", targetVal, ":", countDict[col][colVal][targetVal])

blue
   0 :
     0 : 188
     1 : 186
     2 : 171
     3 : 161
   1 :
     0 : 169
     1 : 166
     2 : 172
     3 : 176
dual_sim
   0 :
     0 : 171
     1 : 175
     2 : 183
     3 : 162
   1 :
     0 : 186
     1 : 177
     2 : 160
     3 : 175
four_g
   0 :
     0 : 167
     1 : 162
     2 : 173
     3 : 151
   1 :
     0 : 190
     1 : 190
     2 : 170
     3 : 186
three_g
   0 :
     0 : 99
     1 : 74
     2 : 81
     3 : 79
   1 :
     0 : 258
     1 : 278
     2 : 262
     3 : 258
touch_screen
   0 :
     0 : 178
     1 : 170
     2 : 185
     3 : 179
   1 :
     0 : 179
     1 : 182
     2 : 158
     3 : 158
wifi
   0 :
     0 : 185
     1 : 180
     2 : 181
     3 : 156
   1 :
     0 : 172
     1 : 172
     2 : 162
     3 : 181


In [21]:
import math
# Calculate probability
# For numerical column use normal distribution

def calculateNumerical(x, mean, std):
    """Return the normal-distribution density of `x` for the given mean and std.

    Evaluates ``1 / (std * sqrt(2*pi)) * exp(-0.5 * ((x - mean) / std) ** 2)``.
    """
    z_score = (x - mean) / std
    normaliser = 1 / (std * math.sqrt(2 * math.pi))
    return normaliser * math.exp((-1/2) * (z_score ** 2))


### Execute

In [22]:
# Testing

def executeTraining(df):
  """Classify every row of `df` with the from-scratch Naive Bayes model and report errors.

  The model is always fitted on the module-level `train` frame. For each row
  of `df` the score of a class is its prior multiplied by, for every feature
  column, either the Gaussian density (numeric columns) or the observed
  frequency of the value within that class (boolean columns).

  Parameters:
    df: DataFrame holding every column in `var_cols` plus 'price_range'.

  Prints one line per row ("<index>   <predicted>   <actual>") followed by the
  total error count. Returns nothing.

  Raises:
    KeyError: if a boolean value / class pair in `df` never occurs in `train`.
  """

  # Use Dictionary above
  statisticsDict = makeColumnsStatisticsDictionary(train)
  targetDict = seperateByTarget(train)
  countDict = makeColumnProbabilityDictionary(train, targetDict)

  # Classes come from the training data instead of a hard-coded list of four.
  classes = sorted(targetDict)
  # Priors are relative to the training-set size. Dividing by len(df), the
  # evaluated frame, gave values that were not probabilities.
  totalTrain = sum(targetDict.values())

  # For printing
  error_count = 0
  indexList = df.index.tolist()
  actualTarget = df['price_range'].values.tolist()

  for i in range(len(df)):
    rowData = df.iloc[i]
    prob = []  # one score per entry of `classes`, in the same order

    for k in classes:
      # Probability of target column
      score = targetDict[k] / totalTrain

      # Probability of other dependent column.
      # Values are looked up by column label: positional [] access on a
      # label-indexed Series is deprecated in pandas.
      for column in var_cols:
        x = rowData[column]
        if (column in num_cols):
          mean = statisticsDict[column][k]['mean']
          std = statisticsDict[column][k]['std']
          score = score * calculateNumerical(x, mean, std)
        elif (column in bool_cols):
          countColGivenTarget = countDict[column][x][k]
          countTarget = targetDict[k]
          score = score * (countColGivenTarget / countTarget)

      prob.append(score)

    # Arg-max with ties resolved toward the lowest class. A local name is used
    # so the builtin `max` is not shadowed.
    best = 0
    for p in range(len(prob)):
      if (prob[p] > prob[best]):
        best = p
    predicted = classes[best]

    print(indexList[i], " ", predicted, " ", actualTarget[i])
    if (predicted != actualTarget[i]):
      error_count +=1

  print("Error count: ", error_count)


# Score the outlier-filtered validation set with the from-scratch Naive Bayes classifier.
executeTraining(valid)

0   2   1
1   2   2
2   3   3
3   0   0
4   3   3
5   1   1
6   3   3
7   0   0
8   3   3
9   1   2
10   3   3
11   2   2
12   3   3
13   0   0
14   3   3
15   0   0
16   2   2
17   1   1
18   0   1
19   2   2
20   3   3
21   1   2
22   0   0
23   1   1
24   1   2
25   1   0
26   2   3
27   2   1
28   0   0
29   2   3
30   2   1
31   3   3
32   3   3
33   0   0
34   2   2
35   3   3
36   2   1
37   3   3
38   1   2
39   1   1
40   0   1
41   3   2
42   0   0
43   2   2
44   2   3
46   1   1
47   2   2
48   1   2
49   3   3
50   1   2
51   2   2
52   3   3
53   0   0
54   1   1
55   3   3
56   2   1
57   3   3
58   3   3
59   2   2
60   2   2
61   3   3
62   3   3
63   1   1
64   3   3
65   2   2
66   2   3
67   2   2
69   3   3
70   2   2
71   3   3
72   1   1
73   0   0
74   1   1
75   2   2
76   0   0
77   3   3
78   1   1
79   0   0
80   3   3
81   3   3
82   0   1
83   2   2
84   3   3
85   1   2
86   3   3
87   3   3
88   0   0
89   2   2
90   1   1
91   1   1
92   1   1
93   2   

139   0   0
140   1   1
141   3   3
142   3   3
143   1   1
144   2   2
145   3   3
146   1   1
147   2   2
148   1   1
149   2   1
150   3   3
151   1   0
152   0   0
153   2   2
154   1   1
155   0   1
156   3   2
157   0   0
158   0   1
159   3   3
160   3   3
161   3   3
162   0   0
163   2   2
164   3   3
165   0   0
166   0   0
167   1   2
168   0   1
169   2   2
170   3   2
171   1   1
172   0   0
173   1   1
174   1   0
175   1   1
176   3   3
177   0   1
178   2   2
179   0   0
180   3   3
181   1   1
182   1   1
183   2   2
184   0   0
185   2   2
186   0   0
187   3   3
188   2   2
189   0   0
190   3   3
191   2   2
192   0   0
193   1   0
194   2   2
195   0   0
196   1   1
197   3   3
198   3   3
199   1   1
200   2   1
201   2   2
202   2   2
203   3   3
204   2   2
205   3   3
206   3   3
207   3   3
208   0   0
209   2   2
210   0   1
211   2   2
212   3   3
213   1   1
214   2   1
215   3   2
216   2   3
217   0   0
218   0   0
219   2   2
220   2   2
221   2   1
222 

### Libraries Usage Development

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Reference model: scikit-learn's Gaussian Naive Bayes on the outlier-filtered frames.
# Features are every column but the last; the target is the last column.
model = GaussianNB()
X_train = train.iloc[:, :-1]
y_train = train.iloc[:, -1]
model.fit(X_train, y_train)
X_test = valid.iloc[:, :-1]
y_test = valid.iloc[:, -1]
y_pred = model.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# classification_report is not imported in this cell; it comes from the
# sklearn.metrics import in the KNN library cell above.
print(classification_report(y_test,y_pred))

Accuracy: 0.7783417935702199
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       139
           1       0.66      0.64      0.65       143
           2       0.69      0.71      0.70       153
           3       0.90      0.89      0.90       156

    accuracy                           0.78       591
   macro avg       0.78      0.78      0.78       591
weighted avg       0.78      0.78      0.78       591

