In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import sklearn.metrics
from sklearn import linear_model

%matplotlib inline

In [None]:
import os

# The first exercise file has no header row, so column names are supplied
# explicitly: two exam scores followed by the 0/1 admission label.
path = os.getcwd() + '/data/ex2data1.txt'
data = pd.read_csv(
    path,
    names=['Exam1', 'Exam2', 'Admitted'],
    header=None,
)
data.head()

In [None]:
# Split the rows by label so each class can be drawn with its own marker.
positive = data.loc[data['Admitted'].isin([1])]
negative = data.loc[data['Admitted'].isin([0])]

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive['Exam1'], positive['Exam2'],
           label='Admitted', c='g', marker='o', s=50)
ax.scatter(negative['Exam1'], negative['Exam2'],
           label='Not Admitted', c='r', marker='x', s=50)
ax.set(xlabel='Exam 1', ylabel='Exam 2')
ax.legend()

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)), applied element-wise.

    Works on scalars, ndarrays and np.matrix values; the result has the
    same container type as numpy produces for `np.exp(-x)`.
    """
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Visual sanity check: evaluate the sigmoid at 100 evenly spaced points in [-10, 10].
sigmoid_x = np.linspace(-10, 10, num=100)
plt.plot(sigmoid_x, [sigmoid(point) for point in sigmoid_x])

In [None]:
# Prepend a constant column of ones so the intercept can be learned as theta[0].
# DataFrame.insert raises ValueError if the column already exists, so the guard
# keeps this cell safe to re-run without restarting the kernel.
if 'Bias' not in data.columns:
    data.insert(0, 'Bias', 1)

In [None]:
# Design matrix: every column except the last one (the label).
X = np.matrix(data.iloc[:, :-1].values)
# Labels: the last column, shaped as an (m, 1) column matrix.
y = np.matrix(data.iloc[:, -1:].values)

In [None]:
def costFunction(X,y,theta):
    """Unregularized logistic-regression cost (mean cross-entropy).

    The notebook calls this with np.matrix arguments: X as (m, n), y as
    (m, 1) and theta as a (1, n) row, so `X*theta.T` is a matrix product.
    Returns the scalar cost averaged over the len(X) rows.
    """
    hypothesis = sigmoid(X*theta.T)
    when_one = np.multiply(-y, np.log(hypothesis))
    when_zero = np.multiply((1-y), np.log(1-hypothesis))
    return 1/len(X) * np.sum(when_one - when_zero)

In [None]:
# Start from an all-zero (1, 3) parameter row: bias plus the two exam weights.
theta = np.matrix([0, 0, 0])
costFunction(X, y, theta)

In [None]:
def gradient(X, y, theta):
    """Gradient of the logistic cost, computed one parameter at a time.

    Returns a 1-D float array with one entry per column of X. Each entry is
    the mean over rows of (sigmoid(X*theta.T) - y) times that column.
    """
    grad = np.zeros(X.shape[1])

    residual = sigmoid(X*theta.T) - y
    for col, _ in enumerate(grad):
        weighted = np.multiply(residual, X[:,col])
        grad[col] = 1/len(X) * np.sum(weighted)

    return grad

In [None]:
def gradientVectorized(X, y, theta):
    """Gradient of the logistic cost computed with a single matrix product.

    With the np.matrix inputs used in this notebook (X is (m, n), y is
    (m, 1), theta is a (1, n) row), `*` is matrix multiplication and the
    result is an (n, 1) column matrix. Note that this differs in shape from
    the 1-D array returned by `gradient`, although the values agree.
    """
    # The original pre-allocated a zeros array that was immediately
    # overwritten; it is dropped because it had no effect on the result.
    error = sigmoid(X*theta.T) - y
    return 1/len(X) * X.T * error

In [None]:
# Evaluate the loop-based gradient at the current theta.
gradient(X, y, theta)

In [None]:
# Same inputs through the vectorized version, for comparison with the loop-based result.
gradientVectorized(X, y, theta)

In [None]:
#import scipy.optimize as opt
#result = opt.fmin_tnc(func=costFunction, x0=theta, fprime=gradient, args=(X, y))
#result

Scikit-learn Solution

In [None]:
# Default-configured scikit-learn logistic regression estimator (not yet fitted).
logistic = linear_model.LogisticRegression()

In [None]:
# scikit-learn expects plain ndarrays and a 1-D label vector; newer releases
# raise TypeError when handed np.matrix, so convert at the boundary.
logistic.fit(np.asarray(X), np.asarray(y).ravel())
# accuracy_score takes (y_true, y_pred) in that order.
sklearn.metrics.accuracy_score(np.asarray(y).ravel(), logistic.predict(np.asarray(X)))

In [None]:
# Show the coefficients of the fitted estimator.
logistic.coef_

In [None]:
from sklearn.tree import DecisionTreeClassifier
import sklearn.linear_model
import sklearn.svm

def plot_decision_surface(clf, X_train, Y_train, positive, negative, plot_step=1):
    """Fit `clf` on two features and draw its decision surface with the data.

    Parameters
    ----------
    clf : estimator exposing `fit` and either `predict_proba` or `predict`.
        It is (re)fitted in place on `X_train` / `Y_train`.
    X_train : array-like with exactly two columns (np.matrix is accepted).
    Y_train : labels; an (m, 1) column (including np.matrix) is flattened to 1-D.
    positive, negative : DataFrames whose first two columns are scattered
        on top of the surface as green circles and red crosses.
    plot_step : grid spacing and margin around the data range (default 1,
        the value that was previously hard-coded).

    Raises
    ------
    ValueError
        If `X_train` does not have exactly two columns.
    """
    # Newer scikit-learn releases reject np.matrix, and a column-vector label
    # triggers a conversion warning, so normalise both before fitting.
    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    if Y_train.ndim == 2 and Y_train.shape[1] == 1:
        Y_train = Y_train.ravel()

    if X_train.shape[1] != 2:
        raise ValueError("X_train should have exactly 2 columnns!")

    # Evaluation grid covering the data range, padded by one step on each side.
    x_min, x_max = X_train[:, 0].min() - plot_step, X_train[:, 0].max() + plot_step
    y_min, y_max = X_train[:, 1].min() - plot_step, X_train[:, 1].max() + plot_step
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))
    grid_points = np.c_[xx.ravel(), yy.ravel()]

    clf.fit(X_train, Y_train)
    if hasattr(clf, 'predict_proba'):
        # Column 1 of predict_proba: presumably the positive class for 0/1 labels.
        Z = clf.predict_proba(grid_points)[:, 1]
    else:
        Z = clf.predict(grid_points)
    Z = Z.reshape(xx.shape)

    plt.contourf(xx, yy, Z, cmap=plt.cm.Greens)
    plt.scatter(positive.iloc[:, 0].values, positive.iloc[:, 1].values, s=50, marker='o', c='g')
    plt.scatter(negative.iloc[:, 0].values, negative.iloc[:, 1].values, s=50, marker='x', c='r')

    plt.show()

In [None]:
# Pass the labels as a flat ndarray: `y` is an (m, 1) np.matrix, which newer
# scikit-learn releases refuse to fit on.
plot_decision_surface(logistic, data[['Exam1','Exam2']].values, np.asarray(y).ravel(), positive, negative)

Logistic Regression with Regularization

In [None]:
# Second exercise file: two test scores and a 0/1 acceptance label, no header row.
path = os.getcwd() + '/data/ex2data2.txt'
data2 = pd.read_csv(
    path,
    names=['Test 1', 'Test 2', 'Accepted'],
    header=None,
)
# Keep an untouched copy; data2 itself is reshaped by a later cell.
orig_data2 = data2.copy()
data2.head()

In [None]:
# Separate accepted and rejected rows so each class gets its own marker.
positive2 = data2.loc[data2['Accepted'].isin([1])]
negative2 = data2.loc[data2['Accepted'].isin([0])]

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(positive2['Test 1'], positive2['Test 2'],
           label='Accepted', marker='o', c='g', s=50)
ax.scatter(negative2['Test 1'], negative2['Test 2'],
           label='Rejected', marker='x', c='r', s=50)
ax.legend()
ax.set_xlabel('Test 1 Score')
ax.set_ylabel('Test 2 Score')

In [None]:
degree = 5

# Rebuild from the pristine copy so this cell can be re-run: inserting 'Ones'
# or dropping the raw columns a second time on an already-transformed frame
# would raise.
data2 = orig_data2.copy()
x1 = data2['Test 1']
x2 = data2['Test 2']

# Constant column for the intercept, placed right after the label column.
data2.insert(3, 'Ones', 1)

# Polynomial terms x1^(i-j) * x2^j, stored as columns named F<i><j>.
# NOTE(review): with these bounds i runs 1..degree-1 and j runs 0..i-1, so
# pure powers of x2 (j == i) and terms of total degree `degree` are never
# generated. Left unchanged because the parameter vector created downstream
# is sized for the resulting columns - confirm whether this is intended.
for i in range(1, degree):
    for j in range(0, i):
        data2['F' + str(i) + str(j)] = np.power(x1, i-j) * np.power(x2, j)

# Drop the raw scores without mutating in place; the label is now column 0.
data2 = data2.drop(['Test 1', 'Test 2'], axis=1)

data2.head()

In [None]:
# set X and y: after the raw test columns were dropped, the label ('Accepted')
# sits in column 0 and every remaining column is a feature
X2 = np.matrix(data2.iloc[:, 1:].values)
y2 = np.matrix(data2.iloc[:, 0:1].values)

# initialize the parameter row with one zero per feature column, rather than
# a hard-coded length that silently breaks if the feature set changes
theta2 = np.matrix(np.zeros(X2.shape[1]))

In [None]:
def costReg(X, y, theta, lam):
    """L2-regularized logistic-regression cost.

    Parameters
    ----------
    X : (m, n) np.matrix of features (first column is the intercept term).
    y : (m, 1) np.matrix of 0/1 labels.
    theta : (1, n) np.matrix of parameters.
    lam : regularization strength applied to every parameter except the first.

    Returns the mean cross-entropy plus lam / (2m) * sum(theta[1:]^2).
    """
    hypothesis = sigmoid(X*theta.T)
    cost = np.sum(np.multiply(-y, np.log(hypothesis)) - np.multiply((1-y), np.log(1-hypothesis)))
    # theta is a (1, n) row matrix, so `theta[1:]` slices *rows* and is empty,
    # which made the penalty always zero. Flatten first, then skip the intercept.
    penalized = np.asarray(theta).ravel()[1:]
    reg = lam / 2 / len(X) * np.sum(np.power(penalized, 2))
    return cost / len(X) + reg

In [None]:
# Regularized cost at the zero-initialized theta2, with lam = 1.
costReg(X2, y2, theta2, 1)

In [None]:
def costReg2(X, y, theta, learningRate):
    """Alternative L2-regularized logistic cost.

    Despite its name, `learningRate` is used as the regularization
    coefficient: it scales the sum of squared parameters, excluding the
    first column of the (1, n) theta row. Returns the mean cross-entropy
    plus that penalty.
    """
    m = len(X)
    hypothesis = sigmoid(X * theta.T)
    first = np.multiply(-y, np.log(hypothesis))
    second = np.multiply((1 - y), np.log(1 - hypothesis))
    # All parameters except the intercept take part in the penalty.
    penalized = theta[:, 1:]
    reg = (learningRate / 2 / m) * np.sum(np.power(penalized, 2))
    return np.sum(first - second) / m + reg

In [None]:
# Same inputs through the second implementation, for comparison with costReg.
costReg2(X2, y2, theta2, 1)

**Sklearn Regularized Logistic Regression Solution**

In [None]:
from sklearn import linear_model
# L2-penalized logistic regression; C is the inverse regularization strength.
model = linear_model.LogisticRegression(penalty='l2', C=1.0)
# Hand scikit-learn plain ndarrays and a 1-D label vector: newer releases
# raise TypeError on np.matrix input.
model.fit(np.asarray(X2), np.asarray(y2).ravel())
model.coef_