In [1]:
from sklearn import __version__ as sklearn_version
from distutils.version import LooseVersion
if LooseVersion(sklearn_version) < LooseVersion('0.18'):
    raise ValueError('Please use scikit-learn 0.18 or newer') 
from IPython.display import Image
%matplotlib inline

# Data preprocessing 

Import libraries and data, select a small sample

In [13]:
from sklearn import datasets
import numpy as np
import pandas as pd

bank_origin = pd.read_csv('bank.csv')
bank = bank_origin.sample(n = 400)

In [14]:
bank = bank.replace({'education':'unknown','job':'unknown'},np.nan)
bank = bank.dropna()
X = bank.iloc[:,[0,1,3,5]]
y = bank.y

Delete the sample with 'unknown' value
Choose features for the model

In [88]:
#bank = bank[~(bank.education=='unknown')][~(bank.job=='unknown')]

 
#h = bank[['age','job','education']]
#X.isnull().sum()
#y.isnull().sum()
#X['education'].unique()
#y.unique()
#X = X[~(X.education=='unknown')]

processing the ordinal and nominal data

In [15]:
edu_mapping = {
    'primary':1,
    'secondary':2,
    'tertiary':3
}
X['education']=X['education'].map(edu_mapping)
X = pd.get_dummies(X[['age','job','education','balance']],drop_first=True) # the first type of job is admin

from sklearn.preprocessing import LabelEncoder
no_le = LabelEncoder()
y = no_le.fit_transform(y.values)    # 0 means 'no', 1 means 'yes'


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Standardizing the features mannully

In [16]:
X['age']=(X['age']-X['age'].mean())/X['age'].std()
X['balance']=(X['balance']-X['balance'].mean())/X['balance'].std()
X['education']=(X['education']-X['education'].mean())/X['education'].std()

Splitting data into 70% training and 30% test data:

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y)

## Import plot functions

In [18]:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt


def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):

    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], 
                    y=X[y == cl, 1],
                    alpha=0.8, 
                    c=colors[idx],
                    marker=markers[idx], 
                    label=cl, 
                    edgecolor='black')

    # highlight test samples
    if test_idx:
        # plot all samples
        X_test, y_test = X[test_idx, :], y[test_idx]

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='',
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100, 
                    label='test set')

## perceptron

In [24]:
from sklearn.linear_model import Perceptron

ppn = Perceptron(n_iter=40, eta0=0.1, random_state=1)
ppn.fit(X_train, y_train)
print('accuracy of trainning = ' ,ppn.score(X_train,y_train))
print('accuracy of testing = ' ,ppn.score(X_test,y_test))

X_plot_train = X_train.loc[:,['education','balance']]
X_plot_test = X_test.loc[:,['education','balance']]
X_combined = np.vstack((X_plot_train, X_plot_test))
y_combined = np.hstack((y_train, y_test))

plot_decision_regions(X_combined, y_combined,classifier=ppn, test_idx=range(105, 150))
#plt.xlabel('petal length [standardized]')
#plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()
#plt.savefig('images/03_01.png', dpi=300)
plt.show()

('accuracy of trainning = ', 0.8656716417910447)
('accuracy of testing = ', 0.8620689655172413)




ValueError: X has 2 features per sample; expecting 13

ValueError: X has 2 features per sample; expecting 13

## logistic regression model
train the model and test its accuracy

In [99]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(C=1,random_state=1)   # C=1~2 would produce best results
lr.fit(X_train, y_train)
print('accuracy of trainning = ' ,lr.score(X_train,y_train))
print('accuracy of testing = ' ,lr.score(X_test,y_test))

accuracy of trainning =  0.887640449438
accuracy of testing =  0.895652173913


In [100]:
plot_decision_regions(X_combined, y_combined,
                      classifier=lr)
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_06.png', dpi=300)
plt.show()

IndexError: index 1 is out of bounds for axis 1 with size 1

# Support vector machines

Dealing with the nonlinearly separable case using slack variables

In [122]:
from sklearn.svm import SVC

svm = SVC(kernel='linear', C=0.0001, random_state=1)
svm.fit(X_train, y_train)

print('accuracy of trainning = ' ,svm.score(X_train,y_train))
print('accuracy of testing = ' ,svm.score(X_test,y_test))

plot_decision_regions(X_combined, 
                      y_combined,
                      classifier=svm, 
                      test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_11.png', dpi=300)
plt.show()

accuracy of trainning =  0.89138576779
accuracy of testing =  0.895652173913


ValueError: X.shape[1] = 2 should be equal to 13, the number of features at training time

# Solving non-linear problems using a kernel SVM

## Using the kernel trick to find separating hyperplanes in higher dimensional space

In [123]:
svm = SVC(kernel='rbf', random_state=1, gamma=20, C=100)
svm.fit(X_train, y_train)

print('accuracy of trainning = ' ,svm.score(X_train,y_train))
print('accuracy of testing = ' ,svm.score(X_test,y_test))

plot_decision_regions(X_combined, y_combined, 
                      classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_16.png', dpi=300)
plt.show()

accuracy of trainning =  1.0
accuracy of testing =  0.878260869565


ValueError: X.shape[1] = 2 should be equal to 13, the number of features at training time

<br>
<br>

## Building a decision tree

In [125]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='gini', 
                              max_depth=3, 
                              random_state=1)
tree.fit(X_train, y_train)

print('accuracy of trainning = ' ,tree.score(X_train,y_train))
print('accuracy of testing = ' ,tree.score(X_test,y_test))

X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined, 
                      classifier=tree, test_idx=range(105, 150))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_20.png', dpi=300)
plt.show()

accuracy of trainning =  0.910112359551
accuracy of testing =  0.904347826087


ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 2 

<br>
<br>

## Combining weak to strong learners via random forests

In [127]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(criterion='gini',
                                n_estimators=30, 
                                random_state=1,
                                n_jobs=2)
forest.fit(X_train, y_train)

print('accuracy of trainning = ' ,forest.score(X_train,y_train))
print('accuracy of testing = ' ,forest.score(X_test,y_test))

plot_decision_regions(X_combined, y_combined, 
                      classifier=forest, test_idx=range(105, 150))

plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_22.png', dpi=300)
plt.show()



accuracy of trainning =  0.996254681648
accuracy of testing =  0.886956521739


ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 2 

<br>
<br>

# K-nearest neighbors - a lazy learning algorithm

In [129]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=30, 
                           p=2, 
                           metric='minkowski')
knn.fit(X_train, y_train)

print('accuracy of trainning = ' ,knn.score(X_train,y_train))
print('accuracy of testing = ' ,knn.score(X_test,y_test))

plot_decision_regions(X_combined, y_combined, 
                      classifier=knn, test_idx=range(105, 150))

plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
#plt.savefig('images/03_24.png', dpi=300)
plt.show()


accuracy of trainning =  0.89138576779
accuracy of testing =  0.895652173913


ValueError: query data dimension must match training data dimension

<br>
<br>