In [4]:
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn import datasets

In [40]:
iris = datasets.load_iris()
X = iris.data[:, :2]
y = (iris.target != 0) * 1

In [17]:
class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True, verbose=False):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
        self.verbose = verbose
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            if(self.verbose == True and i % 10000 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y)} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

In [18]:
model = LogisticRegression(lr=0.1, num_iter=300000)
%time model.fit(X, y)

CPU times: user 3 s, sys: 0 ns, total: 3 s
Wall time: 3 s


In [20]:
preds = model.predict(X, 0.5)
# accuracy
(preds == y).mean()

1.0

In [19]:
from random import randrange
 
# Split a dataset into k folds

def cross_validation_split(dataset, folds=3):
    dataset_split = list()
    dataset_copy = list(dataset)
    fold_size = int(len(dataset) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(dataset_copy))
            fold.append(dataset_copy.pop(index))
            dataset_split.append(fold)
    return dataset_split
 
# test cross validation split
#seed(1)
#dataset = [[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]]
folds = cross_validation_split(iris, 5)
print(folds)

[['feature_names'], ['target'], ['target_names'], ['DESCR'], ['data']]


In [20]:
x=cross_validation_split(np.array([[1,2],[2,3],[4,5],[5,6]]), folds=3)

In [21]:
x

[[array([5, 6])], [array([1, 2])], [array([2, 3])]]

In [22]:
folds

[['feature_names'], ['target'], ['target_names'], ['DESCR'], ['data']]

In [54]:
def k_fold_cv(k,x):
    split=[]
    m=len(x)
    size=int(m/k)
    for alpha in range(k):
        x_new=x[alpha*size:(alpha+1)*size,:]
        split.append(x_new)
        x_new=0
    if m%k==0:
        return split
    else:
        split[0]=np.vstack((split[0],x[m-1,:]))
        return split    

In [55]:
t=k_fold_cv(2,X)

> <ipython-input-54-522578b82363>(8)k_fold_cv()
-> x_new=x[alpha*size:(alpha+1)*size,:]
(Pdb) c
> <ipython-input-54-522578b82363>(7)k_fold_cv()
-> pdb=pdb.set_trace()
(Pdb) c


In [None]:
import pdb
        pdb=pdb.set_trace()

In [46]:
t=t[0]==t[1]

In [77]:
from numpy import array
from sklearn.model_selection import KFold
# data sample
iris = datasets.load_iris()
X = iris.data[:, :2]
y = (iris.target != 0) * 1
# prepare cross validation
kfold = KFold(3, True, 1)
# enumerate splits
for train, test in kfold.split(X):
	print('train: %s, test: %s' % (X[train], X[test]))

train: [[5.1 3.5]
 [4.9 3. ]
 [4.7 3.2]
 [4.6 3.1]
 [4.6 3.4]
 [5.  3.4]
 [4.4 2.9]
 [4.9 3.1]
 [5.4 3.7]
 [4.8 3.4]
 [4.8 3. ]
 [4.3 3. ]
 [5.7 4.4]
 [5.4 3.4]
 [5.1 3.7]
 [4.6 3.6]
 [5.1 3.3]
 [4.8 3.4]
 [5.  3. ]
 [5.  3.4]
 [5.2 3.5]
 [4.8 3.1]
 [5.2 4.1]
 [4.9 3.1]
 [5.5 3.5]
 [4.9 3.6]
 [4.4 3. ]
 [5.1 3.4]
 [4.5 2.3]
 [5.  3.5]
 [5.1 3.8]
 [4.6 3.2]
 [5.  3.3]
 [7.  3.2]
 [6.9 3.1]
 [5.7 2.8]
 [4.9 2.4]
 [5.  2. ]
 [5.9 3. ]
 [6.  2.2]
 [6.1 2.9]
 [5.6 2.9]
 [6.7 3.1]
 [5.8 2.7]
 [6.2 2.2]
 [5.9 3.2]
 [6.1 2.8]
 [6.3 2.5]
 [6.4 2.9]
 [6.8 2.8]
 [5.7 2.6]
 [5.5 2.4]
 [5.5 2.4]
 [5.8 2.7]
 [6.  2.7]
 [6.  3.4]
 [6.7 3.1]
 [6.3 2.3]
 [5.6 3. ]
 [5.5 2.5]
 [5.  2.3]
 [5.7 3. ]
 [5.7 2.9]
 [6.2 2.9]
 [6.3 3.3]
 [5.8 2.7]
 [6.5 3. ]
 [7.6 3. ]
 [4.9 2.5]
 [7.3 2.9]
 [7.2 3.6]
 [6.5 3.2]
 [6.4 2.7]
 [5.7 2.5]
 [6.4 3.2]
 [6.5 3. ]
 [7.7 3.8]
 [5.6 2.8]
 [7.7 2.8]
 [6.3 2.7]
 [6.7 3.3]
 [6.2 2.8]
 [6.1 3. ]
 [7.2 3. ]
 [7.4 2.8]
 [6.4 2.8]
 [6.3 2.8]
 [6.1 2.6]
 [6.3 3.4]
 [6.4 3.1]
 [6

In [71]:
# enumerate splits
for train, test in kfold.split(X):
	print('train: %s, test: %s' % (train, test))

train: [  0   1   2   3   6   7   8   9  10  11  12  13  15  20  21  22  23  24
  25  26  27  30  32  34  36  37  38  39  41  43  46  47  49  50  52  55
  57  60  61  62  63  64  65  67  68  70  71  72  74  76  79  80  81  82
  83  85  86  87  88  89  93  95  96  97 100 101 104 105 106 107 109 110
 111 113 115 116 117 121 122 123 124 126 127 129 130 132 133 134 136 137
 138 139 140 142 143 144 145 147 148 149], test: [  4   5  14  16  17  18  19  28  29  31  33  35  40  42  44  45  48  51
  53  54  56  58  59  66  69  73  75  77  78  84  90  91  92  94  98  99
 102 103 108 112 114 118 119 120 125 128 131 135 141 146]
train: [  0   1   3   4   5   7   8  13  14  15  16  17  18  19  20  21  22  24
  25  26  28  29  30  31  33  35  37  40  41  42  43  44  45  47  48  49
  50  51  52  53  54  56  57  58  59  60  61  63  66  68  69  70  71  72
  73  75  76  77  78  79  80  81  82  84  86  88  90  91  92  94  96  98
  99 101 102 103 106 108 112 114 115 118 119 120 121 125 128 129 131 133
 13

In [62]:
from sklearn.model_selection import StratifiedKFold
import numpy as np
 
n_splits = 3
 
#X = np.ones(10)
y = np.arange(1,11,dtype=float)
 
# binning to make StratifiedKFold work
yc = np.outer(y[::n_splits],np.ones(n_splits)).flatten()[:len(y)]
yc[-n_splits:]=yc[-n_splits]*np.ones(n_splits)
 
skf = StratifiedKFold(n_splits=n_splits)
for train, test in skf.split(X, yc):
    print("train: %s test: %s" % (train, test))

ValueError: Found input variables with inconsistent numbers of samples: [150, 10]

In [78]:
1- Randomly split D into k subsets of size m/k
   for each model Mi
    for each j= 1,...,k
    Train Mi on  D-D_j
    Test Mij on D_j -----> ld_j(Mij)
    generalize error of Mi-----> Average of ld_j(Mij)
    
2- def k_fold_cv(k,x):
    split=[]
    m=len(x)
    size=int(m/k)
    for i in range(k):
        x_new=x[i*size:(i+1)*size,:]
        split.append(x_new)
        x_new=0
    if m%k=0:
        return split
    else:
        split[0]=np.vstack((split[0],x[m-1,:]))
        return split

SyntaxError: invalid syntax (<ipython-input-78-820371e971a3>, line 1)

In [79]:
def backward_elimination(data,target,significance_level=0.05):
    feature=data.columns.tolist()
    while(len(feature)>0):
        feature_with_constant=sm.add_contant(data[feature])
        p_value=sm.OLS(target,feature_with_constant).fit().p.value[1:]
        max_p_value=p.value.max()
        if max_p_value>=significance_level:
            exited_feature=p_value.idXmax()
            feature.remove(exited_feature)
        else:
            break
            return feature