In [52]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec, summarize)

from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
    (LinearDiscriminantAnalysis as LDA,
     QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [53]:
dataset = load_data("Smarket")
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1250 entries, 0 to 1249
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   Year       1250 non-null   int64   
 1   Lag1       1250 non-null   float64 
 2   Lag2       1250 non-null   float64 
 3   Lag3       1250 non-null   float64 
 4   Lag4       1250 non-null   float64 
 5   Lag5       1250 non-null   float64 
 6   Volume     1250 non-null   float64 
 7   Today      1250 non-null   float64 
 8   Direction  1250 non-null   category
dtypes: category(1), float64(7), int64(1)
memory usage: 79.6 KB


In [54]:
dataset.corr(numeric_only=True)

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


In [55]:
dataset.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.01,1.1913,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.2965,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.4112,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.276,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.2057,0.213,Up


In [56]:
allvars = dataset.columns.drop(["Today", "Direction", "Year"])
design = ModelSpec(allvars)
X = design.fit_transform(dataset)
Y = dataset.Direction == "Up"
glm = sm.GLM(Y, X, family=sm.families.Binomial())
results = glm.fit()
results.summary()

0,1,2,3
Dep. Variable:,Direction,No. Observations:,1250.0
Model:,GLM,Df Residuals:,1243.0
Model Family:,Binomial,Df Model:,6.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-863.79
Date:,"Sat, 01 Nov 2025",Deviance:,1727.6
Time:,17:59:06,Pearson chi2:,1250.0
No. Iterations:,4,Pseudo R-squ. (CS):,0.002868
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
intercept,-0.1260,0.241,-0.523,0.601,-0.598,0.346
Lag1,-0.0731,0.050,-1.457,0.145,-0.171,0.025
Lag2,-0.0423,0.050,-0.845,0.398,-0.140,0.056
Lag3,0.0111,0.050,0.222,0.824,-0.087,0.109
Lag4,0.0094,0.050,0.187,0.851,-0.089,0.107
Lag5,0.0103,0.050,0.208,0.835,-0.087,0.107
Volume,0.1354,0.158,0.855,0.392,-0.175,0.446


In [57]:
probs = results.predict()
labels = np.array(["Down"] * dataset.shape[0])
labels[probs > 0.5] = "Up"
confusion_table(labels, dataset.Direction)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,145,141
Up,457,507


In [58]:
train = dataset.Year < 2005
train_dataset = dataset.loc[train]
test_dataset = dataset.loc[~train]
test_dataset.shape

(252, 9)

In [59]:
X_train, X_test = X.loc[train], X.loc[~train]
Y_train, Y_test = Y.loc[train], Y.loc[~train]
glm_train = sm.GLM(Y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
probs.shape

(252,)

In [60]:
L_train, L_test = dataset.Direction.loc[train], dataset.Direction.loc[~train]
labels = np.array(["Down"] * test_dataset.shape[0])
labels[probs > 0.5] = "Up"
confusion_table(labels, test_dataset.Direction)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,77,97
Up,34,44


In [61]:
accuracy = np.mean(labels == L_test)
accuracy

0.4801587301587302

In [62]:
model = ModelSpec(["Lag1", "Lag2"]).fit(dataset)
X = model.transform(dataset)
X_train, X_test = X.loc[train], X.loc[~train]
glm_train = sm.GLM(Y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
labels = np.array(["Down"] * test_dataset.shape[0])
labels[probs > 0.5] = "Up"
accuracy = np.mean(labels == L_test)
print(f"{accuracy=:.2f}")
confusion_table(labels, L_test)

accuracy=0.56


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,35,35
Up,76,106


In [63]:
newdata = pd.DataFrame({"Lag1": [1.2, 1.5], "Lag2": [1.1, -0.8]})
newX = model.transform(newdata)
results.predict(newX)

0    0.479146
1    0.496094
dtype: float64

In [64]:
lda = LDA(store_covariance=True)

In [65]:
X_train, X_test = [M.drop(columns=["intercept"]) for M in [X_train, X_test]]
lda.fit(X_train, L_train)

In [66]:
lda.means_

array([[ 0.04279022,  0.03389409],
       [-0.03954635, -0.03132544]])

In [68]:
lda_pred = lda.predict(X_test)
confusion_table(lda_pred, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,35,35
Up,76,106


In [72]:
lda.classes_

array(['Down', 'Up'], dtype='<U4')

In [70]:
lda_prob = lda.predict_proba(X_test)
np.all(np.where(lda_prob[:, 1] >= 0.5, "Up", "Down") == lda_pred)

True

In [71]:
np.sum(lda_prob[:, 0] > 0.9)

0

In [73]:
qda = QDA(store_covariance=True)
qda.fit(X_train, L_train)

In [74]:
qda.means_, qda.priors_

(array([[ 0.04279022,  0.03389409],
        [-0.03954635, -0.03132544]]),
 array([0.49198397, 0.50801603]))

In [75]:
qda.covariance_[0]

array([[ 1.50662277, -0.03924806],
       [-0.03924806,  1.53559498]])

In [77]:
qda_pred = qda.predict(X_test)
accuracy = np.mean(qda_pred == L_test)
print(f"{accuracy=:.2f}")
confusion_table(qda_pred, L_test)

accuracy=0.60


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,30,20
Up,81,121


In [79]:
NB = GaussianNB()
NB.fit(X_train, L_train)

In [80]:
NB.classes_

array(['Down', 'Up'], dtype='<U4')

In [81]:
nb_labels = NB.predict(X_test)
accuracy = np.mean(nb_labels == L_test)
print(f"{accuracy=:.2f}")
confusion_table(nb_labels, L_test)

accuracy=0.60


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,29,20
Up,82,121


In [82]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train, L_train)
knn1_pred = knn1.predict(X_test)
accuracy = np.mean(knn1_pred == L_test)
print(f"{accuracy=:.2f}")
confusion_table(knn1_pred, L_test)

accuracy=0.50


Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,43,58
Up,68,83


In [83]:
Caravan_dataset = load_data("Caravan")
Caravan_dataset.Purchase.value_counts()

Purchase
No     5474
Yes     348
Name: count, dtype: int64

In [84]:
feature_df = Caravan_dataset.drop(columns=["Purchase"])

In [85]:
scaler = StandardScaler(with_mean=True, with_std=True, copy=True)

In [88]:
scaler.fit(feature_df)
X_std = scaler.transform(feature_df)
feature_std = pd.DataFrame(X_std, columns=feature_df.columns)
feature_std.std()

MOSTYPE     1.000086
MAANTHUI    1.000086
MGEMOMV     1.000086
MGEMLEEF    1.000086
MOSHOOFD    1.000086
              ...   
AZEILPL     1.000086
APLEZIER    1.000086
AFIETS      1.000086
AINBOED     1.000086
ABYSTAND    1.000086
Length: 85, dtype: float64

In [89]:
(X_train,
 X_test,
 y_train,
 y_test) = train_test_split(feature_std,
                            Caravan_dataset.Purchase,
                            test_size=1000,
                            random_state=0)

In [90]:
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1_pred = knn1.fit(X_train, y_train).predict(X_test)
np.mean(y_test != knn1_pred), np.mean(y_test != "No")

(0.111, 0.067)

In [92]:
for K in range(1,6):
    knn = KNeighborsClassifier(n_neighbors=K)
    knn_pred = knn.fit(X_train, y_train).predict(X_test)
    C = confusion_table(knn_pred, y_test)
    templ = ('K={0:d}: # predicted to rent: {1:>2},' +
             ' # who did rent {2:d}, accuracy {3:.1%}')
    pred = C.loc['Yes'].sum()
    did_rent = C.loc['Yes','Yes']
    print(templ.format(
        K,
        pred,
        did_rent,
        did_rent / pred))

K=1: # predicted to rent: 62, # who did rent 9, accuracy 14.5%
K=2: # predicted to rent:  6, # who did rent 1, accuracy 16.7%
K=3: # predicted to rent: 20, # who did rent 3, accuracy 15.0%
K=4: # predicted to rent:  4, # who did rent 0, accuracy 0.0%
K=5: # predicted to rent:  7, # who did rent 1, accuracy 14.3%


In [93]:
logit = LogisticRegression(C=1e10, solver='liblinear')
logit.fit(X_train, y_train)
logit_pred = logit.predict_proba(X_test)
logit_labels = np.where(logit_pred[:,1] > 5, 'Yes', 'No')
confusion_table(logit_labels, y_test)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,933,67
Yes,0,0


In [94]:
logit_labels = np.where(logit_pred[:,1]>0.25, 'Yes', 'No')
confusion_table(logit_labels, y_test)

Truth,No,Yes
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
No,913,58
Yes,20,9


In [96]:
Bike = load_data('Bikeshare')
Bike.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8645 entries, 0 to 8644
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   season      8645 non-null   int64   
 1   mnth        8645 non-null   category
 2   day         8645 non-null   int64   
 3   hr          8645 non-null   category
 4   holiday     8645 non-null   int64   
 5   weekday     8645 non-null   int64   
 6   workingday  8645 non-null   int64   
 7   weathersit  8645 non-null   category
 8   temp        8645 non-null   float64 
 9   atemp       8645 non-null   float64 
 10  hum         8645 non-null   float64 
 11  windspeed   8645 non-null   float64 
 12  casual      8645 non-null   int64   
 13  registered  8645 non-null   int64   
 14  bikers      8645 non-null   int64   
dtypes: category(3), float64(4), int64(8)
memory usage: 836.6 KB


In [98]:
X = ModelSpec(['mnth',
          'hr',
          'workingday',
          'temp',
          'weathersit']).fit_transform(Bike)
Y = Bike['bikers']
M_lm = sm.OLS(Y, X).fit()
M_lm.summary()

0,1,2,3
Dep. Variable:,bikers,R-squared:,0.675
Model:,OLS,Adj. R-squared:,0.673
Method:,Least Squares,F-statistic:,457.3
Date:,"Sat, 01 Nov 2025",Prob (F-statistic):,0.0
Time:,18:35:07,Log-Likelihood:,-49743.0
No. Observations:,8645,AIC:,99570.0
Df Residuals:,8605,BIC:,99850.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
intercept,-68.6317,5.307,-12.932,0.000,-79.035,-58.229
mnth[Feb],6.8452,4.287,1.597,0.110,-1.559,15.250
mnth[March],16.5514,4.301,3.848,0.000,8.120,24.983
mnth[April],41.4249,4.972,8.331,0.000,31.678,51.172
mnth[May],72.5571,5.641,12.862,0.000,61.499,83.615
mnth[June],67.8187,6.544,10.364,0.000,54.992,80.646
mnth[July],45.3245,7.081,6.401,0.000,31.444,59.205
mnth[Aug],53.2430,6.640,8.019,0.000,40.227,66.259
mnth[Sept],66.6783,5.925,11.254,0.000,55.064,78.293

0,1,2,3
Omnibus:,288.526,Durbin-Watson:,0.519
Prob(Omnibus):,0.0,Jarque-Bera (JB):,518.512
Skew:,0.273,Prob(JB):,2.55e-113
Kurtosis:,4.068,Cond. No.,131.0
