# Comparison of classifiers

Import all the packages that we need

In [ ]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

from matplotlib import pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import scale

%matplotlib inline
plt.style.use('ggplot') # apply matplotlib's 'ggplot' style sheet (R/ggplot2-like look) to all figures

np.set_printoptions(precision=4) # print numpy floating-point values with 4 digits of precision

Read the data from Weekly.csv. The data set Weekly contains 1089 weekly returns of a stock index for 21 years, from the beginning of 1990 to the end of 2010. Lags refer to the percentage returns for each of the five previous periods.

In [ ]:
# Load the weekly returns; the path is relative to the notebook's working directory.
weekly_csv_path = './Data/Weekly.csv'
df = pd.read_csv(weekly_csv_path)
print('Weekly dataframe shape =', df.shape)
# Trailing expression: the notebook renders the first five rows as the cell output.
df.head()

We are interested in the relationship between each of the predictors (lags and volume) and the market direction.

a) Compute the matrix of correlations between the variables and comment on the output and on what to expect from regression models.

In [ ]:
# Pairwise Pearson correlations; numeric_only leaves out non-numeric columns
# such as the string-valued Direction.
corr_options = {'method': 'pearson', 'numeric_only': True}
correlations = df.corr(**corr_options)
print(correlations)

b) Plot the relationship between year and volume, between today's return and lag1, and between today's return and lag2 (scatter plots).

In [ ]:
# Three side-by-side scatter panels that share one open-circle marker style.
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(18,4))
open_blue = dict(facecolor = 'none', edgecolor = 'b')
todays_return = df.Today

# Panel 1: Volume against Year.
ax1.set_xlabel('Year')
ax1.set_ylabel('Volume')
ax1.scatter(df.Year.values, df.Volume, **open_blue)

# Panel 2: today's return against the first lag.
ax2.set_xlabel('Lag1')
ax2.set_ylabel('Today\'s return')
ax2.scatter(df.Lag1.values, todays_return, **open_blue)

# Panel 3: today's return against the second lag.
ax3.scatter(df.Lag2.values, todays_return, **open_blue)
ax3.set_xlabel('Lag2')
ax3.set_ylabel('Today\'s return')

c) Perform a logistic regression with the market direction as response and all the lags and volume as predictors. To do this you need to convert the direction into a binary value ('Up'=1 and 'Down'=0). Comment on the output.

In [ ]:
# Design matrix: columns 1..6 of the frame plus an intercept column.
# NOTE(review): presumably these are the five lags and Volume -- this relies on
# the column order of the CSV; confirm.
predictor_frame = df.iloc[:, 1:7]
X = sm.add_constant(predictor_frame)

# Binary response: 1 where Direction is 'Up', 0 for anything else.
y = (df.Direction == 'Up').to_numpy().astype(int)

# Fit the logistic regression on the full data set and show the coefficient table.
logit = sm.Logit(y, X)
results = logit.fit()
print(results.summary())

d) Predict the results for the full data set and compute the confusion matrix. Comment on the confusion matrix.

In [ ]:
# In-sample class predictions: threshold the fitted probability of 'Up' at 0.5.
y_predicted = results.predict(X)
y_predicted = np.array(y_predicted > 0.5, dtype=float)

# Cross-tabulate predicted labels (rows) against actual labels (columns).
# The bin range is pinned to [0, 1] on both axes. With the default data-driven
# range, a model that predicts only one class yields a degenerate range, and
# all-'Down' (0) predictions would be counted in the 'Up' row.
table = np.histogram2d(y_predicted, y, bins=2, range=[[0, 1], [0, 1]])[0]
print(table)

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

e) Split the data set into a training and a test set. The training data will be the data from the years 1990 through 2008 and the testing data will be from 2009 through 2010. Refit the logistic regression from task c) using 'Lag2' as the only predictor. Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# Chronological split on Year: fit on rows up to 2008, evaluate on rows after 2008.
in_train = df.Year <= 2008
in_test = df.Year > 2008

# Training-Set: intercept column plus Lag2 as the only predictor.
X_train = sm.add_constant(df[in_train].Lag2)
response_train = df[in_train].Direction
y_train = np.array([1 if el=='Up' else 0 for el in response_train])
# Test-Set: built the same way from the held-out years.
X_test = sm.add_constant(df[in_test].Lag2)
response_test = df[in_test].Direction
y_test = np.array([1 if el=='Up' else 0 for el in response_test])

logit = sm.Logit(y_train, X_train)
results = logit.fit()
print(results.summary())
print('\n')

# Out-of-sample class predictions: threshold the probability of 'Up' at 0.5.
y_predicted = results.predict(X_test)
y_predicted = np.array(y_predicted > 0.5, dtype=float)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: with the default data-driven range, a model that predicts
# only 'Down' (0) would have its predictions counted in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]
print(table)

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

f) Predict the market direction using the linear discriminant analysis, again only with 'lag2' as predictor and the same training and test set as in task e). Print the priors P('Up') and P('Down') and the class means $\mu_{Up}$ and $\mu_{Down}$ and the variances (remember the variance is assumed to be the same for all classes in LDA). Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# Create classifier
clf = LDA(solver = 'lsqr', store_covariance=True)

# scikit-learn expects 2-D feature arrays, so Lag2 is kept as a single column.
X_train = df[df.Year <= 2008].Lag2.values.reshape(-1, 1)
X_test = df[df.Year > 2008].Lag2.values.reshape(-1, 1)

# y_train / y_test are the 0/1 direction labels already present in the
# notebook namespace (0 = 'Down', 1 = 'Up').
clf.fit(X_train, y_train)
# Per-class attributes follow the order of clf.classes_, i.e. [0 ('Down'), 1 ('Up')].
print('Priors = ', clf.priors_)
print('Class means = ', clf.means_[0], clf.means_[1])
# LDA assumes one covariance shared by all classes.
print('Variance = ', clf.covariance_)
print('\n')

# predict() already returns class labels; cast to float only for the histogram.
y_predicted = clf.predict(X_test).astype(float)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: with the default data-driven range, a model that predicts
# only 'Down' (0) would have its predictions counted in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]
print(table)

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

g) Predict the market direction using the quadratic discriminant analysis, again only with 'lag2' as predictor and the same training and test set as in task e). Print the priors P('Up') and P('Down') and the class means $\mu_{Up}$ and $\mu_{Down}$ and the variances. Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# Quadratic discriminant analysis on the X_train / y_train arrays currently in
# the notebook namespace; store_covariance keeps the per-class covariances.
clf = QDA(store_covariance=True)

clf.fit(X_train, y_train)
# Per-class attributes follow the order of clf.classes_, i.e. [0 ('Down'), 1 ('Up')].
print('Priors = ', clf.priors_)
print('Class means = ', clf.means_[0], clf.means_[1])
# Unlike LDA, QDA estimates one covariance per class.
print('Variance = ', clf.covariance_)
print('\n')

# predict() already returns class labels; cast to float only for the histogram.
y_predicted = clf.predict(X_test).astype(float)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: when the classifier predicts a single class, the default
# data-driven range collapses and all-'Down' (0) predictions would be counted
# in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]
print(table)

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

h) Predict the market direction using the $k$-nearest neighbor analysis, again only with 'lag2' as predictor and the same training and test set as in task e). Try to experiment with different values for $k$. Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# k-nearest-neighbours classifier on the X_train / y_train arrays currently in
# the notebook namespace; change n_neighbors to experiment with other k.
clf = KNeighborsClassifier(n_neighbors=20)
clf.fit(X_train, y_train)

y_predicted = clf.predict(X_test)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: with the default data-driven range, a model that predicts
# only 'Down' (0) would have its predictions counted in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

i) Predict the market direction using a logistic regression again, but this time using 'lag1', 'lag2' and 'lag3' as predictors and the same training and test set as in task e). Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# Columns 1..3 of the frame are used as predictors.
# NOTE(review): presumably Lag1, Lag2 and Lag3 -- this relies on the column
# order of the CSV; confirm.
predictors = df.columns[1:4]

# Chronological split on Year: fit on rows up to 2008, evaluate on rows after 2008.
in_train = df.Year <= 2008
in_test = df.Year > 2008

X_train = sm.add_constant(df[in_train][predictors])
response_train = df[in_train].Direction
y_train = np.array([1 if el=='Up' else 0 for el in response_train])

X_test = sm.add_constant(df[in_test][predictors])
response_test = df[in_test].Direction
y_test = np.array([1 if el=='Up' else 0 for el in response_test])

logit = sm.Logit(y_train, X_train)
results = logit.fit()
print(results.summary())
print('\n')

# Out-of-sample class predictions: threshold the probability of 'Up' at 0.5.
y_predicted = results.predict(X_test)
y_predicted = np.array(y_predicted > 0.5, dtype=float)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: with the default data-driven range, a model that predicts
# only 'Down' (0) would have its predictions counted in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

j) Predict the market direction using a logistic regression again, but this time using 'lag1', 'lag2' and the interaction between 'lag1' and 'lag2' as predictors and the same training and test set as in task e). Predict the results for the test set and build the confusion matrix. Comment on the confusion matrix.

In [ ]:
# Interaction term stored as an extra column (this mutates df). The product of
# two columns of df is already a Series aligned on df.index.
df['Lag1*Lag2'] = df.Lag1 * df.Lag2

predictors = ['Lag1', 'Lag2', 'Lag1*Lag2']

# Chronological split on Year: fit on rows up to 2008, evaluate on rows after 2008.
in_train = df.Year <= 2008
in_test = df.Year > 2008

X_train = sm.add_constant(df[in_train][predictors])
response_train = df[in_train].Direction
y_train = np.array([1 if el=='Up' else 0 for el in response_train])

X_test = sm.add_constant(df[in_test][predictors])
response_test = df[in_test].Direction
y_test = np.array([1 if el=='Up' else 0 for el in response_test])

logit = sm.Logit(y_train, X_train)
results = logit.fit()
print(results.summary())
print('\n')

# Out-of-sample class predictions: threshold the probability of 'Up' at 0.5.
y_predicted = results.predict(X_test)
y_predicted = np.array(y_predicted > 0.5, dtype=float)

# Predicted labels (rows) against actual labels (columns). The bin range is
# pinned to [0, 1]: with the default data-driven range, a model that predicts
# only 'Down' (0) would have its predictions counted in the 'Up' row.
table = np.histogram2d(y_predicted, y_test, bins=2, range=[[0, 1], [0, 1]])[0]

print('Confusion matrix')
labels = ['Down', 'Up']
print(pd.DataFrame(table,
                   index=pd.Index(labels, name='Predicted'),
                   columns=pd.Index(labels, name='Actual')))
print('\n')
# Error rate = 1 - accuracy, where accuracy is the diagonal share of the table.
print('Error rate=', 1-(table[0,0]+table[1,1])/np.sum(table))

No improvements. Error rate still 42%. Confusion matrix: Lag1 still a bad predictor. Best results obtained from only using Lag2 as a predictor (see Task h))