<h1 align="center"> 
DATS 6103 - 10, Summer 2018, Homework_4_solution
</h1> 

<h1 align="center"> 
Due June 19, 11:59 PM
</h1> 

<h4 align="center"> 
Author: Yuxiao Huang ([yuxiaohuang@gwu.edu](mailto:yuxiaohuang@gwu.edu))
</h4>

# Note
- Complete the missing parts indicated by # Implement me
- Submit an ipynb file named Homework_4.ipynb to [blackboard](https://blackboard.gwu.edu) folder /Assignments/Homework_4/
- We expect you to follow a reasonable programming style. While we do not mandate a specific style, we require your code to be neat, clear, **documented/commented** and, above all, consistent. **Marks will be deducted if these requirements are not met.**

# Overview
- Apply logistic regression on the [Breast Cancer Wisconsin (Original)](http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data) dataset
- The goal is to compare the prediction accuracy of two logistic regression models: the one implemented by you and the one provided by [sklearn](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)
- Implement the parts indicated by # Implement me

# Load data

In [1]:
import pandas as pd

# Location of the raw Breast Cancer Wisconsin data file
data_url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'

# Read the file without treating any row as a header, then preview the first rows
df = pd.read_csv(data_url, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


# Remove the first column 

In [2]:
# Discard the column labelled 0 in place, then preview the remaining columns
df.drop(columns=[0], inplace=True)
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


# Removing rows with missing values

In [3]:
import numpy as np

# Report the row count before cleaning
print('Number of rows before removing rows with missing values: ' + str(df.shape[0]))

# Turn every '?' placeholder into NaN so that pandas recognises it as missing.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df.replace('?', np.nan, inplace=True)

# Drop every row that contains at least one NaN
df.dropna(how='any', inplace=True)

# Report the row count after cleaning
print('Number of rows after removing rows with missing values: ' + str(df.shape[0]))

Number of rows before removing rows with missing values: 699
Number of rows after removing rows with missing values: 683


# Get the data of the features and target

In [4]:
# Feature matrix: every column except the last one
X = df[df.columns[:-1]].values

# Target vector: the last column, taken as a 1-D array
y = df[df.columns[-1]].values

# Divide the data into training and testing

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; random_state=0 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Standardize the features

In [6]:
from sklearn.preprocessing import StandardScaler

# Create the scaler and fit it on the training set only
std_scaler = StandardScaler()
std_scaler.fit(X_train)

# Apply the scaling learned from the training set to both splits
X_train, X_test = std_scaler.transform(X_train), std_scaler.transform(X_test)



# The logistic regression class

In [7]:
class MyLogisticRegression():
    """
    The logistic regression class (one-vs-rest, batch gradient updates)

    One weight vector is learned per class label: the samples of that class
    are the positives (target 1) and all other samples are the negatives
    (target 0). Prediction picks the class whose model outputs the largest
    probability.

    Parameters
    ----------
    eta : the learning rate
    n_iter : the number of passes over the training set

    Attributes
    ----------
    w_ : dictionary mapping each class label to its weight vector, where
         entry 0 is the bias and entries 1.. are the feature weights
         (created by fit)
    """

    def __init__(self, eta=0.01, n_iter=10):
        self.eta = eta
        self.n_iter = n_iter

    def fit(self, X, y):
        """
        The fit function

        Parameters
        ----------
        X : the feature matrix (n_samples x n_features); anything that
            np.asarray can turn into a 2-D float array is accepted
        y : the target vector (one class label per sample)

        Raises
        ----------
        ValueError : if X is not 2-D or X and y disagree on the number of samples
        """

        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()

        if X.ndim != 2:
            raise ValueError('X must be 2-D (n_samples x n_features), got %d-D' % X.ndim)
        if y.shape[0] != X.shape[0]:
            raise ValueError('X and y must have the same number of samples, got %d and %d'
                             % (X.shape[0], y.shape[0]))

        # One zero-initialised weight vector (bias + feature weights) per class label
        self.w_ = {}
        for class_ in np.unique(y):
            self.w_[class_] = np.zeros(1 + X.shape[1])

        # The 0/1 target of each one-vs-rest problem does not change between
        # iterations, so it is computed once up front
        targets = {class_: (y == class_).astype(float) for class_ in self.w_}

        for _ in range(self.n_iter):
            # For each class label
            for class_, w in self.w_.items():
                # Probabilities of all samples under the current weights
                prob = self.activation(self.net_input(X, class_))

                # Get the errors of all samples
                errors = targets[class_] - prob

                # Batch update: the contributions of all samples are summed
                # before the weights change (w is updated in place, so the
                # dictionary entry changes with it)
                w[1:] += self.eta * np.dot(X.T, errors)
                w[0] += self.eta * errors.sum()

    def net_input(self, X, class_, i=None):
        """
        Calculate the net input

        Parameters
        ----------
        X : the feature matrix (n_samples x n_features)
        class_ : a class label of the target
        i : the row index of the sample to evaluate; when None (the default)
            the net input of every sample is computed

        Returns
        ----------
        The net input of sample i (a scalar), or a 1-D array holding the net
        input of every sample when i is None
        """

        X = np.asarray(X, dtype=float)
        w = self.w_[class_]

        # Either one row or the whole matrix, depending on i
        samples = X if i is None else X[i]

        # Weighted sum of the features plus the bias
        return np.dot(samples, w[1:]) + w[0]

    def activation(self, z):
        """
        Compute logistic sigmoid activation
        Reference: the function is from the "Python Machine Learning (2nd edition)" book code repository and info resource
        https://github.com/rasbt/python-machine-learning-book-2nd-edition
        """
        # The net input is clipped to [-250, 250] before it is passed to np.exp
        return 1. / (1. + np.exp(-np.clip(z, -250, 250)))

    def predict(self, X):
        """
        The predict function

        Parameters
        ----------
        X : the feature matrix (n_samples x n_features)

        Returns
        ----------
        The list of predicted class labels (one per sample)
        """

        X = np.asarray(X, dtype=float)

        # Class labels in the order in which their weight vectors are stored
        classes = list(self.w_.keys())

        # Column k holds the probability of every sample under the model of classes[k]
        probs = np.column_stack([self.activation(self.net_input(X, class_))
                                 for class_ in classes])

        # Index of the largest probability per sample; on ties argmax keeps
        # the first class, exactly as a stable descending sort would
        best = np.argmax(probs, axis=1)

        # Translate the column indices back into class labels
        return [classes[k] for k in best]

# Train the logistic regression model implemented by you

In [8]:
# Instantiate the hand-written model, spelling out its default hyper-parameters
lr = MyLogisticRegression(eta=0.01, n_iter=10)

# Fit it on the training set
lr.fit(X_train, y_train)

# Test the model

In [9]:
from sklearn.metrics import precision_recall_fscore_support

# Predict the class label of every test sample
y_pred = lr.predict(X_test)

# Micro-averaged precision, recall and F-score (shown as the cell output)
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='micro')

(0.95609756097560972, 0.95609756097560972, 0.95609756097560972, None)

# Train the logistic regression model implemented in sklearn

In [10]:
from sklearn.linear_model import LogisticRegression

# Build the sklearn model with its default hyper-parameters
# (the values in effect are listed in the repr shown as this cell's output)
lr = LogisticRegression()

# Train the model
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

# Test the model

In [11]:
# Predict the class label of every test sample
y_pred = lr.predict(X_test)

# Micro-averaged precision, recall and F-score (shown as the cell output)
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred, average='micro')

(0.95609756097560972, 0.95609756097560972, 0.95609756097560972, None)