## Import 3rd party dependencies

In [88]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.io as pio
from sklearn.preprocessing import normalize
from sklearn.model_selection import train_test_split

## Load training data

In [89]:
# Read the original training table from the local archive folder
data = pd.read_csv(filepath_or_buffer="archive/train.csv")

## Data Preprocessing

#### Convert labels into two classes: low (0, 1) and high (2, 3)

In [90]:
# Binary label: 0 when price_range is 0 or 1, otherwise 1
is_low_price = data["price_range"] <= 1
data["price_classification"] = np.where(is_low_price, 0, 1)

#### Split the original ‘train.csv’ into ‘train.csv’, ‘valid.csv’ and ‘test.csv’ with the ratio of 0.8 : 0.1 : 0.1

In [91]:
# The full frame (labels included) is split so each part can be written back to disk as-is;
# stratifying on the four-level price_range keeps the class balance in every part.
x, y = data, data["price_range"]
split_options = {"random_state": 101}

# 80 % train, 20 % held out ...
x_train, x_tmp, y_train, y_tmp = train_test_split(
    x, y, test_size=0.2, stratify=y, **split_options
)
# ... and the held-out part is halved into test and validation (10 % each)
x_test, x_valid, y_test, y_valid = train_test_split(
    x_tmp, y_tmp, test_size=0.5, stratify=y_tmp, **split_options
)

#### Test whether the split worked correctly

In [92]:
# Row/column counts of the train, test and validation parts, in that order
for split_frame in (x_train, x_test, x_valid):
    print(split_frame.shape)

(1600, 22)
(200, 22)
(200, 22)


#### Write the data back to the csv file

In [93]:
# Persist each part under its own file name (the row index is written as the first column)
split_files = (
    ("train.csv", x_train),
    ("test.csv", x_test),
    ("valid.csv", x_valid),
)
for file_name, split_frame in split_files:
    split_frame.to_csv(file_name)

## Model Implementation

### The Sigmoid function

In [94]:
def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, evaluated without overflow.

    The textbook formula calls ``exp(-x)``, which overflows (and emits a
    RuntimeWarning) for large negative ``x``. Here ``exp`` is only ever applied
    to a non-positive number, and the algebraically equivalent form
    ``exp(x) / (1 + exp(x))`` is used for negative inputs.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    numpy scalar or numpy.ndarray
        Element-wise sigmoid, same shape as ``x``; a scalar input yields a scalar.
    """
    z = np.asarray(x)
    if not np.issubdtype(z.dtype, np.floating):
        # Integer / boolean input: promote so the division below is done in floats
        z = z.astype(float)

    # -|z| <= 0, so this exponential can underflow to 0 but never overflow
    decay = np.exp(-np.abs(z))
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    # np.where returns a 0-d array for scalar input; unwrap it to a scalar
    return result[()] if result.ndim == 0 else result

### Logistic Regression

In [95]:
class LogisticRegression():
    """Binary logistic-regression classifier trained with batch gradient descent.

    Inputs are passed through ``normalize(X, norm='max')`` and prefixed with a
    constant column of ones, so ``self.w`` has shape ``(n_features + 1, 1)``
    with the bias term in row 0.

    NOTE(review): with scikit-learn's default ``axis=1``, ``normalize`` rescales
    each *sample row* by its own largest absolute value, not each feature
    column. If per-feature scaling was intended, that needs a fitted scaler --
    confirm before changing, since it alters the trained model.

    Parameters
    ----------
    learning_rate : float
        Step size applied to the gradient at every iteration.
    n_iterations : int
        Number of gradient-descent steps performed by ``fit``.
    random_state : int or None
        Seed for the weight initialisation. ``None`` (the default) keeps the
        previous behaviour of drawing from NumPy's global random state.
    """

    def __init__(self, learning_rate=.1, n_iterations=6000, random_state=None):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.random_state = random_state

    def initialize_weights(self, n_features):
        """Draw the initial weights uniformly from ``[-1/sqrt(n), 1/sqrt(n)]``.

        Stores a ``(n_features + 1, 1)`` array in ``self.w``; row 0 is the bias
        and starts at 0.
        """
        limit = np.sqrt(1 / n_features)
        if self.random_state is None:
            # Global generator: still honours an earlier np.random.seed(...) call
            w = np.random.uniform(-limit, limit, (n_features, 1))
        else:
            rng = np.random.default_rng(self.random_state)
            w = rng.uniform(-limit, limit, (n_features, 1))
        # Insert 0 as w_0
        self.w = np.insert(w, 0, 0, axis=0)

    def _design_matrix(self, X):
        """Normalise ``X`` and prepend the column of ones that multiplies the bias."""
        normal_X = normalize(X, norm='max')
        # Insert a column of 1 as x_0
        return np.insert(normal_X, 0, 1, axis=1)

    def fit(self, X, y):
        """Train on ``X`` (m_samples x n_features) with 0/1 labels ``y``.

        ``y`` may be flat or a column vector; it must hold one label per row of
        ``X``. Returns the learned weights, which are also kept in ``self.w``.
        """
        design = self._design_matrix(X)
        m_samples = design.shape[0]
        self.initialize_weights(design.shape[1] - 1)
        y = np.asarray(y).reshape(m_samples, 1)
        for _ in range(self.n_iterations):
            y_pred = sigmoid(design.dot(self.w))
            # Gradient of the summed (not averaged) log-loss, so the effective
            # step size grows with the number of samples.
            w_grad = design.T.dot(y_pred - y)
            self.w = self.w - self.learning_rate * w_grad
        return self.w

    def predict(self, X):
        """Return the predicted 0/1 labels for ``X`` as an ``(m_samples, 1)`` int array."""
        h_x = self._design_matrix(X).dot(self.w)
        return np.round(sigmoid(h_x)).astype(int)

    def test(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``.

        ``y`` may be flat or a column vector with one label per row of ``X``.
        """
        y_pred = self.predict(X)
        m_samples = y_pred.shape[0]
        y_true = np.asarray(y).reshape(m_samples, 1)
        right_count = int(np.count_nonzero(y_pred == y_true))
        return right_count / m_samples

### Naive Bayes

In [96]:
class NaiveBayes():
    """Gaussian naive Bayes classifier.

    ``fit`` estimates, for every class label, the per-feature mean and variance
    and the class prior. ``predict`` returns the label with the largest
    log-posterior (up to a constant shared by all classes).
    """

    def fit(self, X, y):
        """Estimate per-class statistics from ``X`` (m_samples x n_features) and labels ``y``.

        ``y`` may be flat or a column vector. Results are stored in
        ``self.parameters`` under the key ``"class" + str(label)``.
        """
        self.X = X
        self.y = y
        features = np.asarray(X)
        # Flatten the labels: with a column vector, np.where(y == c) would return
        # (row, column) index pairs and pick single elements instead of whole rows.
        labels = np.ravel(y)
        self.classes = np.unique(labels)
        self.parameters = {}
        for c in self.classes:
            # Calculate the mean value, variance and prior probability of every class
            rows_of_c = features[labels == c]
            self.parameters["class" + str(c)] = {
                "mean": np.mean(rows_of_c, axis=0, keepdims=True),
                "var": np.var(rows_of_c, axis=0, keepdims=True),
                "prior": rows_of_c.shape[0] / features.shape[0],
            }

    def _pdf(self, X, classes):
        """Log-likelihood of every row of ``X`` under the Gaussians of class ``classes``.

        Returns an array of shape ``(1, m_samples)``.
        """
        # eps keeps the denominators away from zero for zero-variance features
        eps = 1e-4
        stats = self.parameters["class" + str(classes)]
        mean, var = stats["mean"], stats["var"]

        # Work in log space from the start. Evaluating exp(...) first and taking
        # the log afterwards underflows to log(0) = -inf for points far from the mean.
        # Both terms have shape [m_samples, n_features] (after broadcasting).
        log_numerator = -(X - mean) ** 2 / (2 * var + eps)
        log_denominator = 0.5 * np.log(2 * np.pi * var + eps)

        # Naive Bayes assumption (features are conditionally independent):
        # P(x1,x2,x3|Y) = P(x1|Y)*P(x2|Y)*P(x3|Y); in log space the product is a sum.
        # result.shape = [m_samples, 1]
        result = np.sum(log_numerator - log_denominator, axis=1, keepdims=True)

        return result.T

    def _predict(self, X):
        """Return one ``(1, m_samples)`` array of log-scores per class, in ``self.classes`` order."""
        # log P(Y|x1,x2,x3) = log P(Y) + log P(x1|Y) + log P(x2|Y) + log P(x3|Y) + const
        output = []
        # Iterate over the actual labels so the lookup matches the keys written by fit
        for c in self.classes:
            prior = np.log(self.parameters["class" + str(c)]["prior"])
            output.append(prior + self._pdf(X, c))
        return output

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        features = np.asarray(X)
        output = np.reshape(
            self._predict(features), (self.classes.shape[0], features.shape[0])
        )
        # argmax gives a position in self.classes; map it back to the label itself
        return self.classes[np.argmax(output, axis=0)]

## Train models

### Decide what fields we want to process

In [97]:
# Target column for the binary models, and the original four-level label it was derived from
output_param_name, price_range = 'price_classification', 'price_range'

### Split training set and test set into input and output

In [98]:
# Reload the train/test parts from the CSV files written earlier in this notebook, so the
# cell works on a fresh kernel and can be re-run safely (x_train / x_test are rebound below).
# index_col=0: to_csv stored the row index as the first, unnamed column.
train_data = pd.read_csv("train.csv", index_col=0)
test_data = pd.read_csv("test.csv", index_col=0)

# Neither label column may leak into the feature matrix
label_columns = [output_param_name, price_range]

x_train = train_data.drop(columns=label_columns).values
y_train = train_data[[output_param_name]].values  # column vector, shape (m, 1)
x_test = test_data.drop(columns=label_columns).values
y_test = test_data[[output_param_name]].values

#### Check whether the split works correctly

In [99]:
# Show the feature matrices and label vectors, train first, then test
for split_array in (x_train, y_train, x_test, y_test):
    print(split_array)

[[1.130e+03 1.000e+00 2.500e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [6.330e+02 1.000e+00 1.400e+00 ... 1.000e+00 1.000e+00 0.000e+00]
 [1.576e+03 1.000e+00 2.000e+00 ... 1.000e+00 0.000e+00 1.000e+00]
 ...
 [5.010e+02 0.000e+00 2.300e+00 ... 1.000e+00 0.000e+00 1.000e+00]
 [7.190e+02 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 1.000e+00]
 [6.150e+02 1.000e+00 5.000e-01 ... 1.000e+00 0.000e+00 0.000e+00]]
[[0]
 [0]
 [0]
 ...
 [0]
 [1]
 [0]]
[[6.150e+02 1.000e+00 2.500e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [1.821e+03 1.000e+00 1.200e+00 ... 1.000e+00 1.000e+00 0.000e+00]
 [9.560e+02 0.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]
 ...
 [1.483e+03 1.000e+00 2.200e+00 ... 1.000e+00 0.000e+00 0.000e+00]
 [1.467e+03 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 1.000e+00]
 [7.940e+02 1.000e+00 5.000e-01 ... 1.000e+00 1.000e+00 0.000e+00]]
[[1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [1]
 [1]
 [1]
 [0]
 [1]
 [1]
 [1]
 [1]
 [0]
 [1]
 [0]
 [0]
 [1

### Train Logistic Regression Model

In [100]:
# Default hyper-parameters, spelled out for the reader
logistic_regression = LogisticRegression(learning_rate=.1, n_iterations=6000)
# fit returns the learned weight vector (bias first)
theta = logistic_regression.fit(X=x_train, y=y_train)

#### Print model parameters

In [101]:
# One row per weight; row 0 is the bias term
theta_table = pd.DataFrame(theta.flatten(), columns=['Model Parameters'])
print(theta_table)

    Model Parameters
0        -144.631006
1          12.762937
2          -9.087355
3         -23.561154
4          -9.748242
5         -54.191296
6          -8.195112
7        -474.453166
8          -9.731858
9       -1244.062972
10        -70.876019
11       -146.039302
12         48.570463
13        -28.216827
14        283.735354
15       -200.736174
16        -70.411975
17       -209.759570
18        -10.596741
19         -9.785571
20         -8.690528


## Test models

### Test Logistic Regression Model

In [102]:
# Share of test rows whose predicted class matches the true class
accuracy_result = logistic_regression.test(x_test, y_test)
accuracy_message = 'Accuracy: {:.2f}'.format(accuracy_result)
print(accuracy_message)

Accuracy: 0.83
