In [3]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots 
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
summarize)

In [4]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
(LinearDiscriminantAnalysis as LDA, QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LogisticRegression
 

In [5]:
# Load the Smarket data set via ISLP's load_data helper and display the frame.
Smarket = load_data('Smarket') 
Smarket

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,2001,0.381,-0.192,-2.624,-1.055,5.010,1.19130,0.959,Up
1,2001,0.959,0.381,-0.192,-2.624,-1.055,1.29650,1.032,Up
2,2001,1.032,0.959,0.381,-0.192,-2.624,1.41120,-0.623,Down
3,2001,-0.623,1.032,0.959,0.381,-0.192,1.27600,0.614,Up
4,2001,0.614,-0.623,1.032,0.959,0.381,1.20570,0.213,Up
...,...,...,...,...,...,...,...,...,...
1245,2005,0.422,0.252,-0.024,-0.584,-0.285,1.88850,0.043,Up
1246,2005,0.043,0.422,0.252,-0.024,-0.584,1.28581,-0.955,Down
1247,2005,-0.955,0.043,0.422,0.252,-0.024,1.54047,0.130,Up
1248,2005,0.130,-0.955,0.043,0.422,0.252,1.42236,-0.298,Down


In [6]:
# Show the column labels of the Smarket frame.
Smarket.columns

Index(['Year', 'Lag1', 'Lag2', 'Lag3', 'Lag4', 'Lag5', 'Volume', 'Today',
       'Direction'],
      dtype='object')

In [13]:
# Pairwise correlations of every column except Direction (the last column,
# which holds the Up/Down labels rather than numbers).
Smarket.drop(columns='Direction').corr()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today
Year,1.0,0.0297,0.030596,0.033195,0.035689,0.029788,0.539006,0.030095
Lag1,0.0297,1.0,-0.026294,-0.010803,-0.002986,-0.005675,0.04091,-0.026155
Lag2,0.030596,-0.026294,1.0,-0.025897,-0.010854,-0.003558,-0.043383,-0.01025
Lag3,0.033195,-0.010803,-0.025897,1.0,-0.024051,-0.018808,-0.041824,-0.002448
Lag4,0.035689,-0.002986,-0.010854,-0.024051,1.0,-0.027084,-0.048414,-0.0069
Lag5,0.029788,-0.005675,-0.003558,-0.018808,-0.027084,1.0,-0.022002,-0.03486
Volume,0.539006,0.04091,-0.043383,-0.041824,-0.048414,-0.022002,1.0,0.014592
Today,0.030095,-0.026155,-0.01025,-0.002448,-0.0069,-0.03486,0.014592,1.0


In [19]:
# Design matrix for the full-sample logistic regression: Lag1..Lag5 and Volume.
predictor_cols = ["Lag1", "Lag2", "Lag3", "Lag4", "Lag5", "Volume"]
X = Smarket[predictor_cols]

# Boolean response: True where Direction is 'Up', False otherwise.
# A boolean response is a convenient way to get the 1/0 coding that logistic regression needs.
y = Smarket['Direction'] == 'Up'

# The Binomial family makes this GLM a logistic regression.
# NOTE(review): X has no constant column and sm.GLM does not add one, so this
# model is fit without an intercept (the coefficient table has no intercept row).
# If an intercept is intended, fit on sm.add_constant(X) and build every later
# design matrix / prediction frame the same way.
glm = sm.GLM(y, X, family=sm.families.Binomial())
results = glm.fit()

# summarize() renders the coefficient table (coef, std err, z, p-value) for the fitted GLM.
summarize(results)

Unnamed: 0,coef,std err,z,P>|z|
Lag1,-0.0721,0.05,-1.438,0.15
Lag2,-0.0434,0.05,-0.868,0.386
Lag3,0.01,0.05,0.2,0.842
Lag4,0.0081,0.05,0.162,0.872
Lag5,0.0097,0.049,0.196,0.845
Volume,0.0549,0.037,1.473,0.141


In [20]:
# Fitted coefficients, indexed by predictor name.
results.params

Lag1     -0.072063
Lag2     -0.043420
Lag3      0.009969
Lag4      0.008073
Lag5      0.009685
Volume    0.054905
dtype: float64

In [21]:
# p-values of the fitted coefficients, indexed by predictor name.
results.pvalues

Lag1      0.150490
Lag2      0.385502
Lag3      0.841611
Lag4      0.871503
Lag5      0.844859
Volume    0.140742
dtype: float64

In [22]:
# Predicted probability that the response equals 1 (i.e. Direction == 'Up').
# Called without exog, predict() evaluates the model on the data it was fit on.
probs = results.predict()
# Peek at the first ten fitted probabilities.
probs[:10]

array([0.51502968, 0.48805685, 0.48458173, 0.52021946, 0.51767349,
       0.51086624, 0.49521645, 0.51173353, 0.52523007, 0.49547214])

In [23]:
# Apply the 0.5 threshold: 'Up' when the predicted probability exceeds 0.5,
# 'Down' otherwise. np.where takes its length from probs, so the number of
# observations no longer has to be hard-coded (previously 1250).
labels = np.where(probs > 0.5, 'Up', 'Down')

In [24]:
# Cross-tabulate the predicted labels against the true Direction values
# (rows: predicted, columns: truth).
confusion_table(labels, Smarket.Direction)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,100,93
Up,502,555


In [27]:
# Boolean mask: True for observations with Year before 2005 (the training rows).
train = Smarket['Year'] < 2005
# Select training rows with the mask and held-out rows with its negation (~).
Smarket_train, Smarket_test = Smarket.loc[train], Smarket.loc[~train]
# Size of the held-out set.
Smarket_test.shape

(252, 9)

In [None]:
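# Alternative (not used): a random 75/25 split with train_test_split.
# This notebook splits by Year instead (Year < 2005 for training, the rest for testing).
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)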

In [34]:
# Split predictors and response with the Year-based mask.
X_train = X.loc[train]
X_test = X.loc[~train]
y_train = y.loc[train]
y_test = y.loc[~train]

# Fit the logistic regression on the training rows only.
# Note: this rebinds `results`, replacing the full-sample fit.
glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()

# Predicted probabilities for the held-out rows.
probs = results.predict(exog=X_test)

In [37]:
# True Up/Down labels, split with the same Year-based mask as the predictors.
D = Smarket['Direction']
L_train = D.loc[train]
L_test = D.loc[~train]

In [38]:
# Threshold the held-out probabilities at 0.5. np.where takes its length from
# probs, so the test-set size no longer has to be hard-coded (previously 252).
labels = np.where(probs > 0.5, 'Up', 'Down')
# Compare predictions with the true held-out labels.
confusion_table(labels, L_test)
 

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,33,26
Up,78,115


In [39]:
# Drop the predictors judged not significant and refit using only Lag1 and Lag2.
# Note: this rebinds X (and results/probs/labels); re-running earlier cells
# afterwards will see the two-column X.
X = Smarket[["Lag1","Lag2"]]
X_train, X_test = X.loc[train], X.loc[~train]
glm_train = sm.GLM(y_train, X_train, family=sm.families.Binomial())
results = glm_train.fit()
probs = results.predict(exog=X_test)
# Threshold at 0.5. np.where takes its length from probs, so the test-set size
# no longer has to be hard-coded (previously 252).
labels = np.where(probs > 0.5, 'Up', 'Down')
confusion_table(labels, L_test)

Truth,Down,Up
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
Down,64,67
Up,47,74


In [41]:
# Predict the probability of 'Up' for two new observations.
# Column names and order must match the design matrix of the fitted model (Lag1, Lag2).
newdata = pd.DataFrame({
    'Lag1': [1.2, 1.5],
    'Lag2': [1.1, -0.8],
})
results.predict(newdata)

0    0.471132
1    0.488050
dtype: float64