# 2：正则化逻辑回归
### **by:MLZZY**
**实验描述：加入正则项以改进逻辑回归算法。
数据集记录了一批芯片在两次测试中的结果，需要根据这两次测试结果判断芯片应被接受还是被抛弃。**

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.optimize as opt

In [2]:
# Load the data set. The file has no header row (header=None), so the three
# column names are supplied explicitly.
path='/home/mw/input/andrew_ml_ex22391/ex2data2.txt'
data_init=pd.read_csv(path,header=None,names=['Test 1','Test 2','Accepted'])
# Show the first rows as a quick sanity check.
data_init.head()

Unnamed: 0,Test 1,Test 2,Accepted
0,0.051267,0.69956,1
1,-0.092742,0.68494,1
2,-0.21371,0.69225,1
3,-0.375,0.50219,1
4,-0.51325,0.46564,1


In [4]:
# Split the samples by label: rows with Accepted == 1 and rows with Accepted == 0.
positive2=data_init[data_init['Accepted'].isin([1])]
negative2 = data_init[data_init['Accepted'].isin([0])]
# Scatter plot of 'Test 1' against 'Test 2', one colour/marker per class.
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
plt.show()

In [5]:
# The scatter plot shows that the two classes cannot be separated by a straight
# line, and logistic regression on the raw features only gives a linear boundary.
# To obtain a non-linear boundary, every sample (x1, x2) is expanded into all
# polynomial terms up to degree 6: x1, x2, x1^2, x1*x2, x2^2, ...
degree=6
# Work on a copy: the insert/drop calls below modify the frame in place, and
# without the copy they would also strip 'Test 1'/'Test 2' from data_init and
# make this cell fail when it is run a second time.
data2=data_init.copy()
x1=data2['Test 1']
x2=data2['Test 2']
# Constant column for the intercept term, placed right after 'Accepted'.
data2.insert(3,'Ones',1)
for i in range(1,degree+1):
    for j in range(0,i+1):
        # Column 'F<a><b>' holds x1^a * x2^b with a + b == i.
        data2['F'+str(i-j)+str(j)]=np.power(x1,i-j)*np.power(x2,j)
# The raw scores are now represented by columns F10 and F01.
data2.drop(['Test 1', 'Test 2'], axis=1, inplace=True)
data2.head()

Unnamed: 0,Accepted,Ones,F10,F01,F20,F11,F02,F30,F21,F12,...,F23,F14,F05,F60,F51,F42,F33,F24,F15,F06
0,1,1,0.051267,0.69956,0.002628,0.035864,0.489384,0.000135,0.001839,0.025089,...,0.0009,0.012278,0.167542,1.81563e-08,2.477505e-07,3e-06,4.6e-05,0.000629,0.008589,0.117206
1,1,1,-0.092742,0.68494,0.008601,-0.063523,0.469143,-0.000798,0.005891,-0.043509,...,0.002764,-0.020412,0.150752,6.362953e-07,-4.699318e-06,3.5e-05,-0.000256,0.001893,-0.013981,0.103256
2,1,1,-0.21371,0.69225,0.045672,-0.147941,0.47921,-0.009761,0.031616,-0.102412,...,0.015151,-0.049077,0.15897,9.526844e-05,-0.0003085938,0.001,-0.003238,0.010488,-0.033973,0.110047
3,1,1,-0.375,0.50219,0.140625,-0.188321,0.252195,-0.052734,0.07062,-0.094573,...,0.01781,-0.023851,0.03194,0.002780914,-0.003724126,0.004987,-0.006679,0.008944,-0.011978,0.01604
4,1,1,-0.51325,0.46564,0.263426,-0.23899,0.216821,-0.135203,0.122661,-0.111283,...,0.026596,-0.024128,0.02189,0.0182799,-0.01658422,0.015046,-0.01365,0.012384,-0.011235,0.010193


In [10]:
def sigmoid(z):
    """Return the logistic function 1 / (1 + e^(-z)), applied element-wise to z."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [6]:
# Regularised cost function
def costReg(theta,X,y,lamda):
    """Return the L2-regularised logistic-regression cost.

    With z = X @ theta and m = number of rows of X, the value is

        (1/m) * sum(log(1 + e^z) - y * z) + lamda / (2m) * sum(theta[1:] ** 2)

    which is algebraically the usual cross-entropy
    -y*log(h) - (1-y)*log(1-h) with h = sigmoid(z).

    Parameters:
        theta: parameter vector of length n; any array-like, 1-D or (1, n).
        X: design matrix of shape (m, n); column 0 is the intercept column.
        y: labels (0/1), either 1-D of length m or a column of shape (m, 1).
        lamda: regularisation strength; theta[0] is not penalised.

    Returns:
        The cost as a Python float.
    """
    # Plain ndarrays with explicit shapes: a 1-D y no longer broadcasts
    # against a column of predictions into an (m, m) matrix.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    m = X.shape[0]
    z = X @ theta
    # logaddexp(0, z) == log(1 + e^z) without overflow, so the loss stays
    # finite even where sigmoid(z) would round to exactly 0 or 1.
    loss = np.logaddexp(0, z) - y * z
    reg = (lamda / (2 * m)) * np.sum(theta[1:] ** 2)
    return float(np.sum(loss) / m + reg)

In [7]:
# Regularised gradient function
def gradientReg(theta, X, y, lamda):
    """Return the gradient of the L2-regularised logistic-regression cost.

    With h = sigmoid(X @ theta) and m = number of rows of X:

        grad[0] = (1/m) * sum((h - y) * X[:, 0])
        grad[j] = (1/m) * sum((h - y) * X[:, j]) + (lamda / m) * theta[j],  j >= 1

    Parameters:
        theta: parameter vector of length n; any array-like, 1-D or (1, n).
        X: design matrix of shape (m, n); column 0 is the intercept column.
        y: labels (0/1), either 1-D of length m or a column of shape (m, 1).
        lamda: regularisation strength; theta[0] is not penalised.

    Returns:
        A 1-D float ndarray of length n.
    """
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    # Flatten y so that (h - y) stays a length-m vector for both 1-D and
    # column-shaped labels.
    y = np.asarray(y, dtype=float).ravel()
    m = X.shape[0]
    z = X @ theta
    # sigmoid(z) written as exp(-log(1 + e^(-z))); logaddexp keeps the
    # intermediate finite for large |z|.
    h = np.exp(-np.logaddexp(0, -z))
    # One matrix-vector product replaces the per-parameter Python loop.
    grad = X.T @ (h - y) / m
    grad[1:] += (lamda / m) * theta[1:]
    return grad

In [8]:
# Build the optimiser inputs from data2: X2 is every column after the first,
# y2 is the first column kept two-dimensional (one column), and theta2 is an
# all-zero start vector with one entry per column of X2.
cols = data2.shape[1]
X2 = np.array(data2.iloc[:, 1:cols].values)
y2 = np.array(data2.iloc[:, 0:1].values)
theta2 = np.zeros(cols - 1)
# Regularisation strength lambda, set to 1.
lamda = 1

In [11]:
# Evaluate the cost at the all-zero starting point theta2.
costReg(theta2, X2, y2, lamda)

0.6931471805599454

In [13]:
# Minimise costReg with SciPy's fmin_tnc (a truncated Newton optimiser), using
# gradientReg as the analytic gradient. The returned tuple's first element is
# the optimised parameter vector.
result2 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, lamda))
result2

(array([ 1.27271026,  0.62529965,  1.18111686, -2.01987399, -0.91743189,
        -1.43166928,  0.12393228, -0.36553118, -0.35725404, -0.17516291,
        -1.45817009, -0.05098418, -0.61558554, -0.27469165, -1.19271299,
        -0.2421784 , -0.20603299, -0.04466178, -0.27778951, -0.29539514,
        -0.45645982, -1.04319154,  0.02779373, -0.2924487 ,  0.0155576 ,
        -0.32742405, -0.1438915 , -0.92467488]), 32, 1)

In [14]:
def predict(theta,X):
    """Return the predicted class (0 or 1) for every row of X.

    Parameters:
        theta: parameter vector of length n; an ndarray, a (1, n) np.matrix
            or any other array-like is accepted.
        X: feature matrix of shape (m, n).

    Returns:
        A list of m Python ints, 1 where the predicted probability is at
        least 0.5 and 0 otherwise.
    """
    # Flatten theta and use an explicit matrix product so the result does not
    # depend on np.matrix operator overloading.
    weights = np.asarray(theta, dtype=float).ravel()
    z = np.asarray(X, dtype=float) @ weights
    # sigmoid(z) >= 0.5 exactly when z >= 0, so thresholding z is sufficient.
    return (z >= 0).astype(int).tolist()

In [19]:
# Accuracy on the data the model was fitted to: the share of samples whose
# predicted label equals the recorded label.
theta_min = np.matrix(result2[0])
predictions = predict(theta_min, X2)
correct = [1 if a == b else 0 for a, b in zip(predictions, y2.ravel())]
accuracy = sum(correct) / len(correct)
accuracy

0.8305084745762712

In [20]:
def hfunc2(theta, x1, x2):
    """Evaluate the fitted polynomial (the linear score, before the sigmoid) at (x1, x2).

    `theta` is indexed as theta[0][k], so it is expected to be a sequence
    whose first element holds the coefficients (e.g. the tuple returned by
    the optimiser). Terms are visited in the same order as the feature
    columns were generated: for each total degree 1..degree, x1^(d-p) * x2^p
    for p = 0..d. The module-level `degree` sets the highest total degree.
    """
    coeffs = theta[0]
    total = coeffs[0]
    index = 1
    for order in range(1, degree + 1):
        for x2_power in range(order + 1):
            total += np.power(x1, order - x2_power) * np.power(x2, x2_power) * coeffs[index]
            index += 1
    return total

In [21]:
def find_decision_boundary(theta, lo=-1, hi=1.5, num=1000, tol=2 * 10**-3):
    """Return the grid points that lie (approximately) on the decision boundary.

    A num x num grid over [lo, hi] x [lo, hi] is evaluated with hfunc2, and
    the points whose score is within `tol` of zero are kept.

    Parameters:
        theta: passed unchanged to hfunc2.
        lo, hi: lower and upper bound of the grid on both axes.
        num: number of grid points per axis.
        tol: absolute threshold on |hfunc2| for a point to count as lying on
            the boundary.

    Returns:
        A pair of pandas Series (x1, x2) with the coordinates of the selected
        points.
    """
    axis = np.linspace(lo, hi, num)
    # indexing='ij' followed by ravel() enumerates the grid with x1 as the
    # outer and x2 as the inner variable, without building num*num tuples.
    grid_x1, grid_x2 = np.meshgrid(axis, axis, indexing='ij')
    h_val = pd.DataFrame({'x1': grid_x1.ravel(), 'x2': grid_x2.ravel()})
    h_val['hval'] = hfunc2(theta, h_val['x1'], h_val['x2'])
    decision = h_val[np.abs(h_val['hval']) < tol]
    return decision.x1, decision.x2

In [23]:
# Training samples together with the decision boundary found from result2.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result2)
# Draw the boundary on this Axes explicitly instead of relying on pyplot's
# implicit "current axes".
ax.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()

In [24]:
# Change lamda again: set it to 0 and refit from the same starting point.
lamda = 0
result3 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, lamda))

In [25]:
# Training samples together with the decision boundary found from result3.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result3)
# Draw the boundary on this Axes explicitly instead of relying on pyplot's
# implicit "current axes".
ax.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()

In [26]:
# The plot above shows over-fitting.
# Change lamda again: set it to 100 and refit from the same starting point.
lamda = 100
result4 = opt.fmin_tnc(func=costReg, x0=theta2, fprime=gradientReg, args=(X2, y2, lamda))

In [27]:
# Training samples together with the decision boundary found from result4.
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(positive2['Test 1'], positive2['Test 2'], s=50, c='b', marker='o', label='Accepted')
ax.scatter(negative2['Test 1'], negative2['Test 2'], s=50, c='r', marker='x', label='Rejected')
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')
x, y = find_decision_boundary(result4)
# Draw the boundary on this Axes explicitly instead of relying on pyplot's
# implicit "current axes".
ax.scatter(x, y, c='y', s=10, label='Prediction')
ax.legend()
plt.show()

In [None]:
#由上图可知出现了欠拟合现象