# 1：逻辑回归
### **by:MLZZY**
**实验描述：构建一个逻辑回归模型来预测某个学生是否被学校录取。
数据集中包含学生的两次测试成绩和录取的结果。**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the comma-separated data file (it has no header row) and name its
# three columns explicitly.
path = '/home/kesci/input/andrew_ml_ex22391/ex2data1.txt'
data = pd.read_csv(
    path,
    names=['Exam 1', 'Exam 2', 'Admitted'],
    header=None,
)
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,Exam 1,Exam 2,Admitted
0,34.62366,78.024693,0
1,30.286711,43.894998,0
2,35.847409,72.902198,0
3,60.182599,86.308552,1
4,79.032736,75.344376,1


In [3]:
# Split the rows by label so each class can be drawn with its own marker.
positive = data[data['Admitted'] == 1]
negative = data[data['Admitted'] == 0]

# Scatter plot of the two exam scores, one series per class.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(
    positive['Exam 1'],
    positive['Exam 2'],
    label='Admitted',
    marker='o',
    c='b',
    s=50,
)
ax.scatter(
    negative['Exam 1'],
    negative['Exam 2'],
    label='Not Admitted',
    marker='x',
    c='r',
    s=50,
)
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

In [4]:
# Sigmoid (logistic) function
def sigmoid(z):
    """Return the element-wise logistic sigmoid ``1 / (1 + exp(-z))``.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s). Plain Python lists/tuples are accepted as well as
        NumPy arrays and matrices (array subclasses are preserved).

    Returns
    -------
    NumPy scalar or array
        The sigmoid of ``z``, with the same shape as ``z``.
    """
    # Coerce lists/tuples so that unary minus works; asanyarray keeps
    # ndarray subclasses (e.g. np.matrix) untouched.
    z = np.asanyarray(z)
    # For very negative z, exp(-z) overflows to inf. The quotient
    # 1 / (1 + inf) is still the correct limit 0.0, so only the overflow
    # warning is silenced here.
    with np.errstate(over='ignore'):
        return 1 / (1 + np.exp(-z))

In [5]:
# Cost function
def cost(theta, X, y):
    """Return the mean logistic-regression (cross-entropy) cost.

    Computes ``mean(-y * log(h) - (1 - y) * log(1 - h))`` where
    ``h = 1 / (1 + exp(-X @ theta))``.

    Parameters
    ----------
    theta : array-like
        Parameter vector with one entry per column of ``X``; any shape that
        flattens to that length is accepted.
    X : array-like, shape (m, n)
        Design matrix, one row per sample.
    y : array-like
        The m labels (0 or 1); both shape (m,) and shape (m, 1) are accepted.

    Returns
    -------
    float
        The cost averaged over the m rows of ``X``.
    """
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    # Flatten the labels so they align element-wise with the 1-D vector z
    # instead of broadcasting into an (m, m) matrix.
    y = np.asarray(y, dtype=float).ravel()
    z = X.dot(theta)
    # -log(h)     = log(1 + exp(-z)) = logaddexp(0, -z)
    # -log(1 - h) = log(1 + exp(z))  = logaddexp(0, z)
    # Writing the terms this way avoids log(0) when the sigmoid saturates.
    per_sample = y * np.logaddexp(0, -z) + (1 - y) * np.logaddexp(0, z)
    return np.sum(per_sample) / len(X)

In [6]:
# Add a constant column of ones (the intercept term). The guard keeps the
# cell re-runnable: inserting a column name that already exists raises.
if 'Ones' not in data.columns:
    data.insert(0, 'Ones', 1)
# Split the table: every column but the last is a feature, the last is the label.
cols = data.shape[1]
X = data.iloc[:, 0:cols - 1]
y = data.iloc[:, cols - 1:cols]
# One parameter per feature column (including the intercept), starting at zero.
theta = np.zeros(cols - 1)
# Convert X and y from DataFrames to NumPy arrays.
X = np.array(X.values)
y = np.array(y.values)

In [7]:
# Display the shapes of the design matrix, parameter vector and label array.
X.shape,theta.shape,y.shape

((100, 3), (3,), (100, 1))

In [8]:
# Cost evaluated at the initial (all-zero) parameter vector.
cost(theta,X,y)

0.6931471805599453

In [12]:
def gradient(theta, X, y):
    """Return the gradient of the logistic-regression cost w.r.t. ``theta``.

    Computes ``X.T @ (sigmoid(X @ theta) - y) / m`` where m is the number of
    rows of ``X``.

    Parameters
    ----------
    theta : array-like
        Parameter vector with one entry per column of ``X``; any shape that
        flattens to that length is accepted.
    X : array-like, shape (m, n)
        Design matrix, one row per sample.
    y : array-like
        The m labels; both shape (m,) and shape (m, 1) are accepted.

    Returns
    -------
    numpy.ndarray, shape (n,)
        One partial derivative per parameter.
    """
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.asarray(X, dtype=float)
    # Flatten the labels so the residual stays a length-m vector instead of
    # broadcasting into an (m, m) matrix.
    y = np.asarray(y, dtype=float).ravel()
    residual = sigmoid(X.dot(theta)) - y
    # A single matrix product replaces the per-parameter loop:
    # entry i is sum(residual * X[:, i]) / m.
    return X.T.dot(residual) / len(X)

In [13]:
# Minimise the cost with SciPy's truncated Newton (TNC) routine instead of a
# hand-written gradient descent, so no iteration count or step size has to
# be chosen manually.
import scipy.optimize as opt

# func: objective to minimise; x0: initial guess; fprime: gradient of func;
# args: tuple of extra arguments passed on to func and fprime.
result = opt.fmin_tnc(
    func=cost,
    x0=theta,
    fprime=gradient,
    args=(X, y),
)
result

(array([-25.16131861,   0.20623159,   0.20147149]), 36, 0)

In [14]:
# Cost evaluated at the parameters returned by the optimizer (first element of result).
cost(result[0],X,y)

0.2034977015894746

In [16]:
# Decision boundary: the line where theta0 + theta1 * x1 + theta2 * x2 == 0,
# solved for x2 over a range of Exam 1 scores.
plotting_x1 = np.linspace(30, 100, 100)
plotting_h1 = (
    (-result[0][0] - result[0][1] * plotting_x1)
    / result[0][2]
)

# Draw the boundary first, then the two classes of samples on the same axes.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(plotting_x1, plotting_h1, 'y', label='Prediction')
ax.scatter(
    positive['Exam 1'],
    positive['Exam 2'],
    label='Admitted',
    marker='o',
    c='b',
    s=50,
)
ax.scatter(
    negative['Exam 1'],
    negative['Exam 2'],
    label='Not Admitted',
    marker='x',
    c='r',
    s=50,
)
ax.legend()
ax.set_xlabel('Exam 1 Score')
ax.set_ylabel('Exam 2 Score')
plt.show()

In [23]:
# Hypothesis function h_theta
def hfunc1(theta, X):
    """Return the sigmoid of the dot product of ``theta.T`` and ``X``."""
    weighted_sum = np.dot(theta.T, X)
    return sigmoid(weighted_sum)


# Predicted admission probability for a student with exam scores 45 and 85
# (the leading 1 is the intercept term).
hfunc1(result[0], [1, 45, 85])

0.7762906228157445

In [24]:
def predict(theta, X):
    """Return the predicted class (0 or 1) for every row of ``X``.

    A sample is labelled 1 when its sigmoid output is at least 0.5.

    Parameters
    ----------
    theta : array-like
        Parameter vector with one entry per column of ``X``. 1-D arrays,
        row/column vectors and ``np.matrix`` inputs are all accepted.
    X : array-like, shape (m, n) or (n,)
        Design matrix; a single 1-D sample is treated as one row.

    Returns
    -------
    list of int
        One 0/1 prediction per row of ``X``.
    """
    # Work on plain ndarrays so the product below is a dot product no matter
    # which array type the caller passes in.
    theta = np.asarray(theta, dtype=float).ravel()
    X = np.atleast_2d(np.asarray(X, dtype=float))
    probability = sigmoid(X.dot(theta))
    return [1 if p >= 0.5 else 0 for p in probability]

In [26]:
# Measure the prediction accuracy on the data the model was fitted to.
theta_min = np.matrix(result[0])
predictions = predict(theta_min, X)
# A prediction is correct when it matches the label (both 1 or both 0).
correct = [1 if a == b else 0 for a, b in zip(predictions, y)]
accuracy = sum(correct) / len(correct)
accuracy

0.89