In [16]:
import pandas as pd
import numpy as np

# 数据读取及预处理

In [17]:
#读取数据
data = pd.read_csv(r'D:\mobile_price_predict\train.csv')
print(data.shape)
data.head()

(2000, 21)


Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1


In [18]:
labels = data['price_range']
data = data[data.columns[:20]]
#为消除量纲影响，归一化处理
data = (data - data.min()) / (data.max() - data.min())
data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi
0,0.227789,0.0,0.68,0.0,0.052632,0.0,0.080645,0.555556,0.9,0.142857,0.1,0.010204,0.170895,0.612774,0.285714,0.388889,0.944444,0.0,0.0,1.0
1,0.347361,1.0,0.0,1.0,0.0,1.0,0.822581,0.666667,0.466667,0.285714,0.3,0.461735,0.993324,0.634687,0.857143,0.166667,0.277778,1.0,1.0,0.0
2,0.041416,1.0,0.0,1.0,0.105263,1.0,0.629032,0.888889,0.541667,0.571429,0.3,0.644388,0.811749,0.627205,0.428571,0.111111,0.388889,1.0,1.0,0.0
3,0.076152,1.0,0.8,0.0,0.0,0.0,0.129032,0.777778,0.425,0.714286,0.45,0.620408,0.858478,0.671566,0.785714,0.444444,0.5,1.0,0.0,0.0
4,0.881764,1.0,0.28,0.0,0.684211,1.0,0.677419,0.555556,0.508333,0.142857,0.7,0.616327,0.4753,0.308658,0.214286,0.111111,0.722222,1.0,1.0,0.0


In [19]:
#随机七三划分数据集为训练集、测试集
np.random.seed(0)
data = np.array(data)
n = data.shape[0]
labels = np.array(labels)
labels = (labels >= 2).astype(int)
rand_index = np.random.permutation(n)
train_index = rand_index[:int(0.7 * n)]
test_index = rand_index[int(0.7 * n):]
train_data = data[train_index,:]
train_labels = labels[train_index]
test_data = data[test_index,:]
test_labels = labels[test_index]

# 模型定义

In [20]:
def sigmoid(x):
    return (np.exp(x) / (1 + np.exp(x)))


def binary_entropy(y,ypred):
    entropy = -(y * np.log(ypred) + (1-y) * np.log(1 - ypred)).mean()
    return entropy

In [28]:
class LR():
    def __init__(self, data, labels, iterations, learning_rate=1e-1):
        '''
        #learning_rate:学习率
        data,labels,训练数据集，标签
        iterations:训练迭代次数
        w,b：参数
        '''
        self.learning_rate = learning_rate
        self.data = data
        self.labels = labels.reshape(self.data.shape[0],1)
        self.iterations = iterations
        self.w = np.random.randn(data.shape[1],1) * 0.01
        self.b = 0
        
    def train(self):
        for i in range(self.iterations):
            ypred = sigmoid(np.dot(self.data,self.w) + self.b)
            loss = binary_entropy(self.labels,ypred)
            dw = np.dot(self.data.T, ypred-self.labels) / data.shape[0]
            db = (ypred-self.labels).mean()
            self.w -= self.learning_rate * dw
            self.b -= self.learning_rate * db
            if i % 2000 == 1999:
                print('Iterations:%d,Loss:%f' %(i+1, loss))
            
    def predict(self,x):
        ypred = sigmoid(np.dot(x, self.w) + self.b)
        if ypred >= 0.5:
            return 1
        else:
            return 0
        
    def test(self,data,labels):
        ypred = sigmoid(np.dot(data, self.w) + self.b)
        ypred = (ypred >= 0.5).astype(int)
        correct = (ypred == labels.reshape(data.shape[0],1)).sum()
        num_data = data.shape[0]
        print('测试集正确率: %f %%' %(correct * 100/ num_data))
        
        

# 模型评估

In [29]:
train_data = np.array(train_data)
train_labels = np.array(train_labels)
num_iterations = 20000
model = LR(train_data,train_labels,num_iterations)
model.train()

model.test(test_data,test_labels)

Iterations:2000,Loss:0.258565
Iterations:4000,Loss:0.200004
Iterations:6000,Loss:0.172559
Iterations:8000,Loss:0.155533
Iterations:10000,Loss:0.143558
Iterations:12000,Loss:0.134504
Iterations:14000,Loss:0.127326
Iterations:16000,Loss:0.121440
Iterations:18000,Loss:0.116491
Iterations:20000,Loss:0.112247
测试集正确率: 98.666667 %
