In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd 
import math
from patsy import dmatrices


import statsmodels.discrete.discrete_model as sm
import statsmodels.formula.api as smf
from statsmodels.graphics.regressionplots import *

from sklearn import preprocessing
from sklearn import datasets, linear_model
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier as KNN

4.6.1

In [2]:
Smarket = pd.read_csv('./Smarket.csv', header=0)
print(Smarket.head())
print(list(Smarket))
print(Smarket.shape)

   Year   Lag1   Lag2   Lag3   Lag4   Lag5  Volume  Today Direction
0  2001  0.381 -0.192 -2.624 -1.055  5.010  1.1913  0.959        Up
1  2001  0.959  0.381 -0.192 -2.624 -1.055  1.2965  1.032        Up
2  2001  1.032  0.959  0.381 -0.192 -2.624  1.4112 -0.623      Down
3  2001 -0.623  1.032  0.959  0.381 -0.192  1.2760  0.614        Up
4  2001  0.614 -0.623  1.032  0.959  0.381  1.2057  0.213        Up
['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today', 'Direction']
(1250, 9)


In [3]:
print(Smarket.corr())

            Year      Lag1      Lag2      Lag3      Lag4      Lag5    Volume  \
Year    1.000000  0.029700  0.030596  0.033195  0.035689  0.029788  0.539006   
Lag1    0.029700  1.000000 -0.026294 -0.010803 -0.002986 -0.005675  0.040910   
Lag2    0.030596 -0.026294  1.000000 -0.025897 -0.010854 -0.003558 -0.043383   
Lag3    0.033195 -0.010803 -0.025897  1.000000 -0.024051 -0.018808 -0.041824   
Lag4    0.035689 -0.002986 -0.010854 -0.024051  1.000000 -0.027084 -0.048414   
Lag5    0.029788 -0.005675 -0.003558 -0.018808 -0.027084  1.000000 -0.022002   
Volume  0.539006  0.040910 -0.043383 -0.041824 -0.048414 -0.022002  1.000000   
Today   0.030095 -0.026155 -0.010250 -0.002448 -0.006900 -0.034860  0.014592   

           Today  
Year    0.030095  
Lag1   -0.026155  
Lag2   -0.010250  
Lag3   -0.002448  
Lag4   -0.006900  
Lag5   -0.034860  
Volume  0.014592  
Today   1.000000  


4.6.2

In [4]:
y, x = dmatrices('Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume', Smarket, return_type = 'dataframe')
print(y)

      Direction[Down]  Direction[Up]
0                 0.0            1.0
1                 0.0            1.0
2                 1.0            0.0
3                 0.0            1.0
4                 0.0            1.0
...               ...            ...
1245              0.0            1.0
1246              1.0            0.0
1247              0.0            1.0
1248              1.0            0.0
1249              1.0            0.0

[1250 rows x 2 columns]


In [5]:
logit = sm.Logit(y.iloc[:, 1], x)
print(logit.fit().summary())

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          Direction[Up]   No. Observations:                 1250
Model:                          Logit   Df Residuals:                     1243
Method:                           MLE   Df Model:                            6
Date:                Sun, 18 Oct 2020   Pseudo R-squ.:                0.002074
Time:                        18:40:36   Log-Likelihood:                -863.79
converged:                       True   LL-Null:                       -865.59
Covariance Type:            nonrobust   LLR p-value:                    0.7319
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1260      0.241     -0.523      0.601      -0.598       0.346
Lag1          -0.0731      0.

In [6]:
print(logit.fit().params)

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
Intercept   -0.126000
Lag1        -0.073074
Lag2        -0.042301
Lag3         0.011085
Lag4         0.009359
Lag5         0.010313
Volume       0.135441
dtype: float64


In [7]:
print(logit.fit().predict()[0:11])

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
[0.50708413 0.48146788 0.48113883 0.51522236 0.51078116 0.50695646
 0.49265087 0.50922916 0.51761353 0.48883778 0.4965211 ]


In [8]:
predict_label = pd.DataFrame(np.zeros(shape=(1250, 1)), columns = ["label"])
predict_label.iloc[logit.fit().predict()>0.5] = 1
print(confusion_matrix(y.iloc[:,1], predict_label.iloc[:,0]))

Optimization terminated successfully.
         Current function value: 0.691034
         Iterations 4
[[145 457]
 [141 507]]


In [9]:
print(np.mean(y.iloc[:, 1] == predict_label.iloc[:, 0]) * 100, "%")

52.16 %


In [10]:
Smarket_2005 = Smarket.query('Year >= 2005')
Smarket_train = Smarket.query('Year < 2005')
y_train, x_train = dmatrices('Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume', Smarket_train, return_type = 'dataframe')
y_test, x_test = dmatrices('Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume', Smarket_2005, return_type = 'dataframe')
logit = sm.Logit(y_train.iloc[:,1], x_train)
print(logit.fit().summary())

Optimization terminated successfully.
         Current function value: 0.691936
         Iterations 4
                           Logit Regression Results                           
Dep. Variable:          Direction[Up]   No. Observations:                  998
Model:                          Logit   Df Residuals:                      991
Method:                           MLE   Df Model:                            6
Date:                Sun, 18 Oct 2020   Pseudo R-squ.:                0.001562
Time:                        18:40:36   Log-Likelihood:                -690.55
converged:                       True   LL-Null:                       -691.63
Covariance Type:            nonrobust   LLR p-value:                    0.9044
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.1912      0.334      0.573      0.567      -0.463       0.845
Lag1          -0.0542      0.

In [11]:
preds = logit.fit().predict(x_test)
predict_label = pd.DataFrame(np.zeros(shape=(x_test.shape[0],1)), columns = ['label'])
threshold = 0.5
mark = (preds > threshold).reset_index(drop=True)
predict_label.iloc[mark] = 1
print(confusion_matrix(y_test.iloc[:,1], predict_label.iloc[:,0]))

Optimization terminated successfully.
         Current function value: 0.691936
         Iterations 4
[[77 34]
 [97 44]]


In [12]:
print(np.mean(y_test.iloc[:,1].reset_index(drop=True) == predict_label.iloc[:,0].reset_index(drop=True)) * 100, "%")

48.01587301587302 %


In [13]:
preds = logit.fit().predict(x_test)
predict_label = pd.DataFrame(np.zeros(shape=(x_test.shape[0],1)), columns = ['label'])
threshold = 0.45
predict_label.iloc[(preds > threshold).reset_index(drop=True)] = 1
confusion_matrix(y_test.iloc[:,1], predict_label.iloc[:,0])
print(np.mean(y_test.iloc[:,1].reset_index(drop=True) == predict_label.iloc[:,0].reset_index(drop=True)))

Optimization terminated successfully.
         Current function value: 0.691936
         Iterations 4
0.5674603174603174


In [14]:
y_train, x_train = dmatrices('Direction~Lag1+Lag2', Smarket_train, return_type = 'dataframe')
y_test, x_test = dmatrices('Direction~Lag1+Lag2', Smarket_2005, return_type = 'dataframe')
logit = sm.Logit(y_train.iloc[:,1], x_train)
preds = logit.fit().predict(x_test)
predict_label = pd.DataFrame(np.zeros(shape=(x_test.shape[0],1)), columns = ['label'])
threshold = 0.5
confusion_matrix(y_test.iloc[:,1], predict_label.iloc[:,0])
print(np.mean(y_test.iloc[:,1].reset_index(drop=True) == predict_label.iloc[:,0].reset_index(drop=True)))

Optimization terminated successfully.
         Current function value: 0.692085
         Iterations 3
0.44047619047619047


4.6.3

In [15]:
sklearn_lda = LDA()
lda = sklearn_lda.fit(x_train.iloc[:,1:3], y_train.iloc[:,1])
x_lda = lda.transform(x_train.iloc[:,1:3])
x_labels = lda.predict(x_train.iloc[:,1:3])
x_prob = lda.predict_proba(x_train.iloc[:,1:3])
x_test_labels=lda.predict(x_test.iloc[:,1:3])
x_test_prob = lda.predict_proba(x_test.iloc[:,1:3])
print(np.mean(y_test.iloc[:, 1] == x_test_labels) * 100, "%")

55.952380952380956 %


In [16]:
print("0.5: ", np.mean(y_test.iloc[:,1] == (x_test_prob[:,1]>=0.5)) * 100, "%\n0.48:",np.mean(y_test.iloc[:,1] == (x_test_prob[:,1]>=0.48)) * 100, "%")

0.5:  55.952380952380956 %
0.48: 56.34920634920635 %


4.6.4

In [17]:
sklearn_qda = QDA(priors=None, store_covariance=True)
qda = sklearn_qda.fit(x_train.iloc[:,1:3], y_train.iloc[:,1])
x_labels = qda.predict(x_train.iloc[:,1:3])
x_prob = qda.predict_proba(x_train.iloc[:,1:3])
x_test_labels = qda.predict(x_test.iloc[:,1:3])
x_test_prob = qda.predict_proba(x_test.iloc[:,1:3]) 
print(np.mean(y_test.iloc[:,1] == x_test_labels) * 100, "%")

59.92063492063492 %


4.6.4

In [18]:
print(dir(qda))
print(qda.means_)
print(qda.store_covariance)

['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_n_features', '_decision_function', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', 'classes_', 'covariance_', 'decision_function', 'fit', 'get_params', 'means_', 'n_features_in_', 'predict', 'predict_log_proba', 'predict_proba', 'priors', 'priors_', 'reg_param', 'rotations_', 'scalings_', 'score', 'set_params', 'store_covariance', 'tol']
[[ 0.04279022  0.03389409]
 [-0.03954635 -0.03132544]]
True


4.6.5

In [19]:
neigh = KNN(n_neighbors= 3)
KNN_fit = neigh.fit(x_train.iloc[:,1:3], y_train.iloc[:,1])
x_test_labels=KNN_fit.predict(x_test.iloc[:,1:3])
x_test_prob = KNN_fit.predict_proba(x_test.iloc[:,1:3]) 
print(np.mean(y_test.iloc[:,1] == x_test_labels))
print(dir(neigh))

0.5317460317460317
['__abstractmethods__', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_abc_impl', '_check_algorithm_metric', '_check_n_features', '_estimator_type', '_fit', '_fit_X', '_fit_method', '_get_param_names', '_get_tags', '_kneighbors_reduce_func', '_more_tags', '_pairwise', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_tree', '_validate_data', '_y', 'algorithm', 'classes_', 'effective_metric_', 'effective_metric_params_', 'fit', 'get_params', 'kneighbors', 'kneighbors_graph', 'leaf_size', 'metric', 'metric_params', 'n_features_in_', 'n_jobs', 'n_neighbors', 'n_samples_fit_', 'outputs_2d_', 'p', 'predict', 'predict_proba', 'radius', 'sco

In [20]:
Caravan = pd.read_csv('Caravan.csv', header=0)
print(Caravan.shape)
print(Caravan.head())
print(Caravan.describe())

(5822, 86)
   MOSTYPE  MAANTHUI  MGEMOMV  MGEMLEEF  MOSHOOFD  MGODRK  MGODPR  MGODOV  \
0       33         1        3         2         8       0       5       1   
1       37         1        2         2         8       1       4       1   
2       37         1        2         2         8       0       4       2   
3        9         1        3         3         3       2       3       2   
4       40         1        4         2        10       1       4       1   

   MGODGE  MRELGE  ...  APERSONG  AGEZONG  AWAOREG  ABRAND  AZEILPL  APLEZIER  \
0       3       7  ...         0        0        0       1        0         0   
1       4       6  ...         0        0        0       1        0         0   
2       4       3  ...         0        0        0       1        0         0   
3       4       5  ...         0        0        0       1        0         0   
4       4       7  ...         0        0        0       1        0         0   

   AFIETS  AINBOED  ABYSTAND  Purchase 

In [29]:
predict_label = pd.DataFrame(np.zeros(shape=(Caravan.shape[0],1)), columns = ['label'])
predict_label.iloc[Caravan['Purchase'] == 'Yes'] = 1
Caravan_drop = Caravan.drop(labels='Purchase', axis=1)
train_size = 1000
train_index = range(0, train_size)
x_validate = Caravan_drop.iloc[train_index, ]
y_validate = predict_label.iloc[train_index, ]
x_train = Caravan_drop.iloc[train_size:, ]
y_train = predict_label.iloc[train_size:, ]

x_train_scaled = preprocessing.scale(x_train)
scaler = preprocessing.StandardScaler().fit(x_train)
x_validate_scaled = scaler.transform(x_validate)

neigh = KNN(n_neighbors=1)
KNN_fit = neigh.fit(x_train_scaled, y_train.iloc[:,0])
x_validate_labels = KNN_fit.predict(x_validate_scaled)
x_validate_prob = KNN_fit.predict_proba(x_validate_scaled) 

print(np.mean(y_validate.iloc[:, 0] == x_validate_labels))
print(confusion_matrix(y_validate.iloc[:, 0], x_validate_labels))

0.883
[[874  67]
 [ 50   9]]
