In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib notebook

In [2]:
# Load the spam data
data = pd.read_csv('../data/spam.data',sep=' ',header=None)
train_test = pd.read_csv('../data/spam.traintest',header=None)

# Split in train/test
train_data = data[train_test.values==0]
test_data = data[train_test.values==1]

# Shuffle the data
train_data = train_data.sample(frac=1).reset_index(drop=True)
test_data = test_data.sample(frac=1).reset_index(drop=True)

In [3]:
X_train = train_data.iloc[:,0:56].values
y_train = train_data.iloc[:,57].values

X_test = test_data.iloc[:,0:56].values
y_test = test_data.iloc[:,57].values

In [4]:
# Standaraize data
from sklearn import preprocessing
X_train_scaled = preprocessing.scale(X_train)
X_test_scaled = preprocessing.scale(X_test)

In [5]:
# Using GridSearch
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm

# Applying PCA gives
p = X_train_scaled.shape[1]

fig,ax = plt.subplots()
# Now let's introduce PCA conponenents one -by-one
model = PLSRegression
scores = [cross_val_score(model(n_components=n), X_train_scaled, y_train, cv=5).mean()
        for n in np.arange(1,p+1)]
scores_std = [cross_val_score(model(n_components=n), X_train_scaled, y_train, cv=5).std()
        for n in np.arange(1,p+1)]

ax.errorbar(np.arange(1,p+1), scores,yerr=scores_std)

  from pandas.core import datetools


<IPython.core.display.Javascript object>

<ErrorbarContainer object of 3 artists>

### Model based on 1 standard deviation rule

In [6]:
ind_max = np.argmax(scores)
one_std_value = scores[ind_max]-scores_std[ind_max]
ind_final = np.argmax(scores>one_std_value)
n_components = np.arange(1,p+1)[ind_final]

In [10]:
fig,ax = plt.subplots()
ax.errorbar(np.arange(1,p+1), scores,yerr=scores_std)
ax.hlines(one_std_value,xmin=1,xmax=60)
ax.plot(ind_final,scores[ind_final],'ro')
ax.plot(ind_max,scores[ind_max],'go')

<IPython.core.display.Javascript object>

[<matplotlib.lines.Line2D at 0x7f57202f2b38>]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,r2_score

pipe = Pipeline([('regression', PLSRegression())])

pipe.fit(X_train_scaled,y_train)

y_pred = np.array(pipe.predict(X_test_scaled) > 0.5, dtype=int)

# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Mean squared error: 0.13
Variance score: 0.44


### Model based on highest cross validation score

In [None]:
ind_max = np.argmax(scores)
n_components = np.arange(1,p+1)[ind_max]

pipe = Pipeline([('regression', PLSRegression())])

pipe.fit(X_train_scaled,y_train)

y_pred = np.array(pipe.predict(X_test_scaled) > 0.5, dtype=int)

# The mean squared error
print("Mean squared error: %.2f"
      % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))