<!--NAVIGATION-->

<a href="https://colab.research.google.com/github/bpesquet/machine-learning-katas/blob/master/classic-datasets/Iris.ipynb"><img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open in Google Colaboratory"></a>

# Boston Dataset


In [13]:
# Import needed packages
# You may add or remove packages should you need them
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_boston
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

# Seed NumPy's global random number generator so runs are repeatable
np.random.seed(0)

# Inline plotting / retina-resolution magics are left disabled (commented out) here
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'
# Set Seaborn aesthetic parameters to defaults
sns.set()

# Data

## Step 1: Loading the data

In [14]:
# Load the Boston housing dataset included with scikit-learn
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release — confirm the pinned version.
dataset = load_boston()

# Put the feature matrix in a pandas DataFrame, one named column per feature
df_boston = pd.DataFrame(dataset.data, columns=dataset.feature_names)
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [15]:
# Add the target values to the DataFrame as a 'target' column
# (only 'target' is added here; no separate class column is created)
df_boston['target'] = dataset.target
# Show 10 random samples
df_boston.sample(n=10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
329,0.06724,0.0,3.24,0.0,0.46,6.333,17.2,5.2146,4.0,430.0,16.9,375.21,7.34,22.6
371,9.2323,0.0,18.1,0.0,0.631,6.216,100.0,1.1691,24.0,666.0,20.2,366.15,9.53,50.0
219,0.11425,0.0,13.89,1.0,0.55,6.373,92.4,3.3633,5.0,276.0,16.4,393.74,10.5,23.0
403,24.8017,0.0,18.1,0.0,0.693,5.349,96.0,1.7028,24.0,666.0,20.2,396.9,19.77,8.3
78,0.05646,0.0,12.83,0.0,0.437,6.232,53.7,5.0141,5.0,398.0,18.7,386.4,12.34,21.2
15,0.62739,0.0,8.14,0.0,0.538,5.834,56.5,4.4986,4.0,307.0,21.0,395.62,8.47,19.9
487,4.83567,0.0,18.1,0.0,0.583,5.905,53.2,3.1523,24.0,666.0,20.2,388.22,11.45,20.6
340,0.06151,0.0,5.19,0.0,0.515,5.968,58.5,4.8122,5.0,224.0,20.2,396.9,9.29,18.7
310,2.63548,0.0,9.9,0.0,0.544,4.973,37.8,2.5194,4.0,304.0,18.4,350.45,12.64,16.1
102,0.22876,0.0,8.56,0.0,0.52,6.405,85.4,2.7147,5.0,384.0,20.9,70.8,10.63,18.6


### Question
Find the X and y values we're looking for. Note that in the Boston dataset `y` (**target**) is a continuous value, so this is a regression problem and we can simply pick **target**. One-hot encoding is only needed when we are looking at a categorical **class** label; in that case we would have to re-shape `y` using the **.get_dummies** function.


In [17]:
# Features: every column of the DataFrame except 'target'
# (bound to both names, data_filt and X, as before)
X = data_filt = df_boston.drop(columns=["target"])

# Labels: the 'target' column on its own
y = df_boston["target"]

# Preview the first few labels
y.head()

0    24.0
1    21.6
2    34.7
3    33.4
4    36.2
Name: target, dtype: float64

## Step 2: Normalize the data in X

In [18]:
# Standardise the feature columns: learn the scaling statistics from X and
# apply them to X in a single call, keeping the fitted scaler for later use.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled

array([[-0.41978194,  0.28482986, -1.2879095 , ..., -1.45900038,
         0.44105193, -1.0755623 ],
       [-0.41733926, -0.48772236, -0.59338101, ..., -0.30309415,
         0.44105193, -0.49243937],
       [-0.41734159, -0.48772236, -0.59338101, ..., -0.30309415,
         0.39642699, -1.2087274 ],
       ...,
       [-0.41344658, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.98304761],
       [-0.40776407, -0.48772236,  0.11573841, ...,  1.17646583,
         0.4032249 , -0.86530163],
       [-0.41500016, -0.48772236,  0.11573841, ...,  1.17646583,
         0.44105193, -0.66905833]])

### Question

Is it better to store it in a NumPy array or in an updated pandas DataFrame? NumPy is more efficient, while pandas is easier to inspect visually. **Pick whichever you prefer.**

In [19]:
# Wrap the scaled array in a DataFrame so it renders as a table
X = pd.DataFrame(data=X_scaled)
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.419782,0.28483,-1.287909,-0.272599,-0.144217,0.413672,-0.120013,0.140214,-0.982843,-0.666608,-1.459,0.441052,-1.075562
1,-0.417339,-0.487722,-0.593381,-0.272599,-0.740262,0.194274,0.367166,0.55716,-0.867883,-0.987329,-0.303094,0.441052,-0.492439
2,-0.417342,-0.487722,-0.593381,-0.272599,-0.740262,1.282714,-0.265812,0.55716,-0.867883,-0.987329,-0.303094,0.396427,-1.208727
3,-0.41675,-0.487722,-1.306878,-0.272599,-0.835284,1.016303,-0.809889,1.077737,-0.752922,-1.106115,0.113032,0.416163,-1.361517
4,-0.412482,-0.487722,-1.306878,-0.272599,-0.835284,1.228577,-0.51118,1.077737,-0.752922,-1.106115,0.113032,0.441052,-1.026501


## Step 3: Train / Test Split

Store training input data in a variable named `X_train` and training targets in a variable named `y_train` with an **80/20 train/test split**.

In [20]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    train_size=0.8,
    random_state=1,
)

In [22]:
# Report the shapes of the training split and print the training labels
print(f'X_train: {X_train.shape}. y_train: {y_train.shape}')
print(f'Labels: {y_train}')

# Sanity-check the 80/20 split: 404 training rows with 13 features each
assert X_train.shape == (404,13)
# The target is used as-is (not one-hot encoded), so y_train is 1-D with
# one value per training row
assert y_train.shape == (404,), f'unexpected y_train shape: {y_train.shape}'

X_train: (404, 13). y_train: (404,)
Labels: 42     25.3
58     23.3
385     7.2
78     21.2
424    11.7
160    27.0
185    29.6
101    26.5
268    43.5
173    23.6
428    11.0
306    33.4
258    36.0
192    36.4
339    19.0
18     20.2
40     34.9
267    50.0
328    19.3
454    14.9
41     26.6
361    19.9
289    24.8
498    21.2
293    23.9
502    20.6
172    23.1
80     28.0
46     20.0
318    23.1
       ... 
68     17.4
50     19.7
398     5.0
413    16.3
156    13.1
252    29.6
395    13.1
468    19.1
402    12.1
357    21.7
254    21.9
276    33.2
178    29.9
281    35.4
390    15.1
237    31.5
71     21.7
460    16.4
129    14.3
144    11.8
448    14.1
335    21.1
133    18.4
203    48.5
393    13.8
255    20.9
72     22.8
396    12.5
235    24.0
37     21.0
Name: target, Length: 404, dtype: float64


In [23]:
# Show the number of observations for the test and training dataframes
# Row counts of the two splits, read from the first dimension of each array
print('Number of observations in the training data:', X_train.shape[0])
print('Number of observations in the test data:', X_test.shape[0])

Number of observations in the training data: 404
Number of observations in the test data: 102


## Step 4: Training a model

Train a model on the data to obtain a training accuracy > 93%. Store the training history in a variable named `history`.

For example, we can use a **RandomForestRegressor** model to predict the Boston target values. Increasing `n_estimators` generally improves model accuracy, at the cost of longer training time. Other options include:
- K-Nearest Neighbors
- Support Vector Machines 
- Decision Tree Classifiers
- Naive Bayes
- Linear Discriminant Analysis
- Logistic Regression


In [25]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [41]:
# Build a random forest of 10,000 trees, then fit it on the training split.
# The trailing semicolon keeps the cell silent, as the original assignment was.
rfr = RandomForestRegressor(n_estimators=10000)
rfr.fit(X_train, y_train);

Now we can measure the model's performance (the R² score returned by `score`) on our **test set**

In [40]:
# Evaluate the fitted forest on the held-out split
rfr.score(X=X_test, y=y_test)

0.9138368587356178

And now we can predict the target value for a new input...

In [35]:
# One hypothetical observation, given as feature values in the original
# column order (treated here as raw, unscaled values — confirm intent).
new_sample = pd.DataFrame(
    [[20, 10, 5, 7, 8, 10, 11, 12, 6, 9, 3, 2, 1]],
    columns=dataset.feature_names,
)

# The forest was fitted on features standardised by `scaler`, so the new
# sample must go through the same fitted scaler before prediction; passing
# it in directly would put it on a different scale from the training data.
rfr.predict(scaler.transform(new_sample))

Can you build a table with each predicted value and its corresponding category as a string?

In [48]:
from sklearn.neighbors import KNeighborsRegressor


In [49]:
# Fit a 2-nearest-neighbours regressor on the training split, then display
# the fitted estimator as the cell output
neigh = KNeighborsRegressor(n_neighbors=2).fit(X_train, y_train)
neigh

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                    weights='uniform')

In [50]:
# Score the k-NN regressor on the held-out split
neigh.score(X=X_test, y=y_test)

0.8696060356574932

In [52]:
from sklearn import svm


In [53]:
# The target is a continuous value, so use the support-vector *regressor*.
# The classifier (svm.SVC) rejects such a target with
# "ValueError: Unknown label type: 'continuous'".
clf2 = svm.SVR(gamma='scale')
clf2.fit(X_train, y_train)