In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.utils import resample

### (a) Fit a logistic regressionmodel that predicts Direction using Lag1 and Lag2.

In [3]:
weekly = sm.datasets.get_rdataset('Weekly', 'ISLR')
print(weekly.__doc__)

Weekly R Documentation

Weekly S&P Stock Market Data
----------------------------

Description
~~~~~~~~~~~

Weekly percentage returns for the S&P 500 stock index between 1990 and
2010.

Usage
~~~~~

::

   Weekly

Format
~~~~~~

A data frame with 1089 observations on the following 9 variables.

``Year``
   The year that the observation was recorded

``Lag1``
   Percentage return for previous week

``Lag2``
   Percentage return for 2 weeks previous

``Lag3``
   Percentage return for 3 weeks previous

``Lag4``
   Percentage return for 4 weeks previous

``Lag5``
   Percentage return for 5 weeks previous

``Volume``
   Volume of shares traded (average number of daily shares traded in
   billions)

``Today``
   Percentage return for this week

``Direction``
   A factor with levels ``Down`` and ``Up`` indicating whether the
   market had a positive or negative return on a given week

Source
~~~~~~

Raw values of the S&P 500 were obtained from Yahoo Finance and then
converted to percentages a

In [4]:
# data
dweekly = weekly.data
dweekly.head()

Unnamed: 0,Year,Lag1,Lag2,Lag3,Lag4,Lag5,Volume,Today,Direction
0,1990,0.816,1.572,-3.936,-0.229,-3.484,0.154976,-0.27,Down
1,1990,-0.27,0.816,1.572,-3.936,-0.229,0.148574,-2.576,Down
2,1990,-2.576,-0.27,0.816,1.572,-3.936,0.159837,3.514,Up
3,1990,3.514,-2.576,-0.27,0.816,1.572,0.16163,0.712,Up
4,1990,0.712,3.514,-2.576,-0.27,0.816,0.153728,1.178,Up


In [5]:
X = dweekly[['Lag1', 'Lag2']]
y = dweekly['Direction']

lr = LogisticRegression()
lr.fit(X,y)
pred = lr.predict(X)
print('Train accuracy is ',accuracy_score(pred,y))

Train accuracy is  0.5546372819100092


### (b) Fit a logistic regressionmodel that predicts Direction using Lag1 and Lag2 using all but the first observation.

In [6]:
X = dweekly[['Lag1', 'Lag2']][1:]
y = dweekly['Direction'][1:]

lr = LogisticRegression()
lr.fit(X,y)

LogisticRegression()

### (c) Use the model from (b) to predict the direction of the first observation. You can do this by predicting that the first observation will go up if P(Direction="Up"|Lag1, Lag2) > 0.5. Was this observation correctly classified?

In [7]:
pred = lr.predict(dweekly[['Lag1', 'Lag2']].loc[[0]])
pred

array(['Up'], dtype=object)

### (d) Write a for loop from i = 1 to i = n, where n is the number of observations in the data set, that performs each of the following steps:

In [8]:
error = []
for i in range(1,len(dweekly)):
    X = dweekly[['Lag1', 'Lag2']].drop(i)
    y = dweekly['Direction'].drop(i)
    
    lr = LogisticRegression()
    lr.fit(X,y)
    pred = lr.predict(dweekly[['Lag1', 'Lag2']].loc[[i]])
    
    if pred == dweekly['Direction'].loc[i] :
        error.append(1)
    else:
        error.append(0)
    

In [9]:
error

[0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,


### (e) Take the average of the n numbers obtained in (d)iv in order to obtain the LOOCV estimate for the test error. Comment on the results.

In [10]:
print('LOOCV estimated error',np.mean(error))


LOOCV estimated error 0.5505514705882353


In [13]:
 dweekly[['Lag1']]

Unnamed: 0,Lag1
0,0.816
1,-0.270
2,-2.576
3,3.514
4,0.712
...,...
1084,-0.861
1085,2.969
1086,1.281
1087,0.283


In [20]:
type(dweekly['Direction'][1:])

pandas.core.series.Series