Implement and train Softmax Regression with mini-batch SGD and early stopping.

The expected outcome.
* Implement Softmax Regression Model.
* Implement mini-batch SGD.
* The training should support early stopping.
* Train and evaluate the model with cross-validation. The evaluation metric is the *accuracy*.
* Retrain the model with early stopping.


**DO NOT USE SKLEARN**

In [181]:
import numpy as np
import pandas as pd 

from sklearn import datasets
from sklearn.model_selection import StratifiedShuffleSplit

np.random.seed(42)

In [182]:
iris = datasets.load_iris()
X = iris["data"]
y = iris["target"]
df = pd.DataFrame({fname: values for fname, values in zip(iris["feature_names"], X.T)})
df["target"] = y

df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


## Your Code
You can start writing your code from here. Please don't modify any of the previous code.

---
# Exploraring Data:


In [183]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   target             150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [184]:
features = df.columns.tolist()
features.remove('target')
print(features)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [185]:
# Check Unique
df["target"].unique()

array([0, 1, 2])

So, it's a multi-class Classification, so one-hot encoding is required

---
# Splitting Data:

Split Data randomly to training set, validiation set and Testing set.

### Required Libararies:

In [186]:
# First we need to fix random seed:
import random

random.seed(42)

In [187]:
def train_test_split(dataset, test_size = 0.2):
    """ 
    Split dataset into train-test sets
    
    Parameters:
    ----------
    dataset : list
              Vector size (m, 1)
    
    test_size : float
                Specify the ratio of test set.
                
    Returns:
    -------
    train : list
            The training set.
            
    dataset_copy : list.
                   The test set.
                    
    """
    
    train = list()
    train_size = (1-test_size) * len(dataset)
    dataset_copy = list(dataset)
    while len(train) < train_size:
        index = random.randrange(len(dataset_copy))
        train.append(dataset_copy.pop(index))
    return train, dataset_copy


In [188]:
# Firstly Split all dataset into Training and testing set.
X_ , X_test = train_test_split(X, test_size = 0.2)
y_ , y_test = train_test_split(y, test_size = 0.2)

# Secondly split the training set and validation set.
X_train , X_valid = train_test_split(X_, test_size = 0.2)
y_train , y_valid = train_test_split(y_, test_size = 0.2)

# Output sets are of type List

# Printing
print("shape X_train = ", pd.DataFrame(X_train).shape,"\tX_valid = ", pd.DataFrame(X_valid).shape,"\tX_test = ", pd.DataFrame(X_test).shape)
print("shape y_train = ", pd.DataFrame(y_train).shape,"\ty_valid = ", pd.DataFrame(y_valid).shape,"\ty_test = ", pd.DataFrame(y_test).shape)

shape X_train =  (96, 4) 	X_valid =  (24, 4) 	X_test =  (30, 4)
shape y_train =  (96, 1) 	y_valid =  (24, 1) 	y_test =  (30, 1)


---
# One-Hot Encoding Implementation:

In [189]:
def oneHotEncoding(col):
    """ 
    Encode column into encoded matrix corresponds to the input col.
    
    Parameters:
    ----------
    col : List
          Vector size (m, 1)
    
    Returns:
    -------
    softmax_arr : list
                  The output will be a sparse matrix where each column corresponds to one possible value of one feature.
    """
    
    col_len =len(col)
    n = max(col) + 1 # +1 : At least one column
    
    # Create Zeros Matrix of rows = size, columns = n
    oneHot_x = np.zeros((col_len, n)) 
    
    # Assigning 1 for each Category and let others to 0.
    oneHot_x[np.arange(col_len), col] = 1
    
    return oneHot_x

In [190]:
# Testing:
oneHotEncoding(y)

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0

---
## Implement Softmax Regression Model.

$$\mathrm{softmax}(\mathbf{X})_{ij} = \frac{\exp(\mathbf{X}_{ij})}{\sum_k \exp(\mathbf{X}_{ik})}.$$

In [191]:
def softmax(x):
    """ 
    Compute softmax values for each sets of scores in x.
    
    Parameters:
    ----------
    X : List
        the dataset size (m, n).
    
    Returns:
    -------
    softmax_arr : List
                  The softmax values for the input array list.
                  
    Note:
    ----
    Subtracting max(x) from x leaves a vector that has only non-positive entries, 
    ruling out overflow and at least one element that is zero ruling out a vanishing denominator.
    """
    # subtracting the max of x for numerical stability.
    exp = np.exp(x - np.max(x))

    # Calculating softmax for all examples.
    for i in range(len(x)):
        exp[i] /= np.sum(exp[i])
        
    return exp    

---
# Cross-Validation Implementation

Split a dataset into k folds

In [192]:
def cross_validation_split(dataset, folds=3):
    """ 
    Split dataset into K-folds.
    
    Parameters:
    ----------
    dataset : List
              The dataset size (m, n).
    
    folds : Integer
            The number of folds to split the dataset.
            
    Returns:
    -------
    dataset_split : List
                    The splitted datasets.
                  
    """
    
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = random.randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
        dataset_split.append(fold)
    return dataset_split
  

In [193]:
# test cross validation split
dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
folds = cross_validation_split(dataset, 4)
print(folds)  

[[[3], [5]], [[4], [10]], [[2], [8]], [[1], [6]]]


---
## Implement mini-batch SGD.

In [194]:
def SGDMiniBatch(x, y, epochs, batch_size = 4, lr = 0.1, percision = 0.1):
    
    """ 
    Stocastic mini batch Gradient decent implementation to get the best coefficients.
    
    Parameters:
    ----------
    x : List
        Feature list, size (m, n).
    
    y : List
        Target list.
    
    epochs : Integer
             Number of maximum iteration.
    
    batch_size : Integer
                 Number of batches to split into.
                 
    lr : Float
         Learning Rate factor. Default value = 0.1
         
    percision : Float
                Percision to stop the gradient decent
    
    Returns:
    -------
    theta : List
            Coefficient List.
                  
    """
    
    # First define theta Matrix, rows : coef of each feature of x. cols : coef for each output y encoded.
    theta_rows = np.array(X_train).shape[1]
    #theta_cols = np.array(y_train).shape[1]
    theta = np.zeros((theta_rows + 1, 1),dtype = np.float64) 
    
    y_batch = np.array_split(y, batch_size)
    
    # Adding First col of ones(Bias)
    X = np.column_stack((np.ones(len(x)),x))
    X_batchs = np.array_split(X,batch_size)
    
    best_loss = np.infty
    
    for i in range(epochs): 
        # Looping over each barch.
        for miniIndex in range(batch_size):
            # Selecting mini Batch
            mini_x, mini_y = X_batchs[miniIndex], y_batch[miniIndex]
    
            # Encode the target batch
            y_oneHot = oneHotEncoding(mini_y)
            
            # Calculate hypothes.
            hypoth = mini_x.dot(theta)
            
            # get Softmax of hypoth
            y_proba = softmax(hypoth)
 
            cost = -np.mean(np.log1p(y_proba))
            
            error = y_proba - y_oneHot
            
            gradient = (1/(len(mini_x))) * (mini_x.T.dot(error))
            
            theta = theta - lr*gradient
            
            if cost < best_loss:
                best_loss = cost
                if cost < percision: 
                    print("Iteration #",i, cost, "early stopping!")
                    break
             
    return theta

In [195]:
# Testing:
theta = SGDMiniBatch(X_train,y_train,500,5,0.1,0.1)

Iteration # 0 -0.6931471805599453 early stopping!


In [196]:
theta

array([[ 2.55992187, -0.85002504, -1.90989683],
       [-1.56298807,  0.23318396,  0.20280411],
       [ 0.63951006, -1.0842149 , -0.20529517],
       [ 0.49787864, -0.65169692, -0.43918172],
       [ 0.49673609, -0.35734685, -0.30638925]])

# Prediction

In [197]:
def predict(x, theta):
    """ 
    predict target for samples
    
    Parameters:
    ----------
    x : List
        Feature list, size (m, n).
    
    theta : List
            Coeficients list.
    
    Returns:
    -------
    y_predict : List
                Predicted Target List.
                  
    """
     # Adding First col of ones(Bias)
    X = np.column_stack((np.ones(len(x)),x))
    
    s = X.dot(theta)
    
    y_proba = softmax(s)
    y_predict = np.argmax(y_proba, axis=1)
    
    return y_predict

In [198]:
# Test Predict:
y_predict = predict(X_valid,theta)
y_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2,
       0, 2], dtype=int64)

# Accuracy Score

In [199]:
def score_accuracy(y_predict, y_true):
    """ 
    compute Accuracy between predicted and true values.
    
    Parameters:
    ----------
    y_predict : List
                Perdicted list, size (m, n).
    
    y_true : List
             Target list.
    
    Returns:
    -------
    accuracy_score : float
                     Accuracy Value.
                  
    """
    accuracy_score = np.mean(y_predict == y_true)
    return accuracy_score

In [200]:
# Test Score:
score = score_accuracy(y_predict, y_valid)
score

0.4166666666666667

In [201]:
y_test_predict = predict(X_test,theta)
score = score_accuracy(y_test_predict, y_test)
score

0.2

In [202]:
y_test_predict

array([2, 0, 0, 0, 2, 0, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

Using the following cell to train and evaluate your model.

In [203]:
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

for train_index, test_index in split.split(df, df["target"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
    
    # Use strat_train_set and strat_test_set to train and evaluate your model
    xtrain=strat_train_set.iloc[:,:4].values
    ytrain=strat_train_set.iloc[:,4].values

    xtest=strat_test_set.iloc[:,:4].values
    ytest=strat_test_set.iloc[:,4].values
    
    # Fit to model : choose parameters
    epochs, batch_size, learning_rate, percision = 1000, 4, 0.01, 0.001
    theta = SGDMiniBatch(xtrain, ytrain, epochs, batch_size, learning_rate, percision)
    
    # Prediction
    y_predict = predict(xtest,theta)
    
    # Accuracy Score
    score = score_accuracy(y_predict, ytest)
    
    print(y_predict)
    print("Accuracy =", score)
    print("-"*20)

Iteration # 0 -0.693147180559945 early stopping!
[0 2 1 1 0 1 0 0 2 2 2 2 2 1 0 0 0 1 1 2 0 2 1 2 2 1 1 0 2 0]
Accuracy = 0.9666666666666667
--------------------
Iteration # 0 -0.693147180559945 early stopping!
[2 2 1 2 0 2 0 1 0 2 0 0 2 1 2 1 0 0 1 2 0 2 1 0 1 2 2 0 2 2]
Accuracy = 0.9
--------------------
Iteration # 0 -0.693147180559945 early stopping!
[2 0 2 2 1 0 1 0 0 2 0 1 2 0 0 2 2 2 1 2 1 0 0 1 2 1 2 1 0 2]
Accuracy = 0.9333333333333333
--------------------


---
# Testing SoftMax Regression Using Sklearn

In [204]:
from sklearn.linear_model import LogisticRegression

# Fit softmax classifier
"""
    if multi_class is set to be “multinomial”,
    the softmax function is used to find the predicted probability of each class
"""
lr_mn = LogisticRegression(multi_class = "multinomial")
lr_mn.fit(pd.DataFrame(X_train), pd.DataFrame(y_train))

print("Softmax training accuracy:", lr_mn.score(X_train, y_train))
print("Softmax validation accuracy:", lr_mn.score(X_valid, y_valid))
print("Softmax test accuracy    :", lr_mn.score(X_test, y_test))

Softmax training accuracy: 0.5208333333333334
Softmax validation accuracy: 0.4583333333333333
Softmax test accuracy    : 0.13333333333333333


  return f(*args, **kwargs)


In [205]:
split = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)
ahmed = 1
for train_index, test_index in split.split(df, df["target"]):
    strat_train_set = df.loc[train_index]
    strat_test_set = df.loc[test_index]
    
    # Use strat_train_set and strat_test_set to train and evaluate your model
    xtrain=strat_train_set.iloc[:,:4].values
    ytrain=strat_train_set.iloc[:,4].values

    xtest=strat_test_set.iloc[:,:4].values
    ytest=strat_test_set.iloc[:,4].values
    
    lr_mn = LogisticRegression(multi_class = "multinomial")
    lr_mn.fit(pd.DataFrame(xtrain), pd.DataFrame(ytrain))

    print("Softmax training accuracy:", lr_mn.score(xtrain, ytrain))
    #print("Softmax validation accuracy:", lr_mn.score(X_valid, y_valid))
    print("Softmax test accuracy    :", lr_mn.score(xtest, ytest))

Softmax training accuracy: 0.975
Softmax test accuracy    : 0.9666666666666667
Softmax training accuracy: 0.975
Softmax test accuracy    : 0.9666666666666667
Softmax training accuracy: 0.9833333333333333
Softmax test accuracy    : 0.9666666666666667


  return f(*args, **kwargs)
  return f(*args, **kwargs)
  return f(*args, **kwargs)
