<!--NAVIGATION-->

<a href="https://colab.research.google.com/github/bpesquet/machine-learning-katas/blob/master/classic-datasets/Iris.ipynb"><img align="left" src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open in Colab" title="Open in Google Colaboratory"></a>

# Wine Dataset


In [1]:
# Import needed packages
# You may add or remove packages should you need them
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.datasets import load_wine
from sklearn import model_selection
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, LeaveOneOut, cross_val_score

# Set random seed
np.random.seed(0)

# Display plots inline and change plot resolution to retina
#%matplotlib inline
#%config InlineBackend.figure_format = 'retina'
# Set Seaborn aesthetic parameters to defaults
sns.set()

# Data

## Step 1: Loading the data

In [2]:
# Load the Iris dataset included with scikit-learn
dataset = load_wine()

# Put data in a pandas DataFrame
df_wine = pd.DataFrame(dataset.data, columns=dataset.feature_names)

In [3]:
# Add target and class to DataFrame
df_wine['target'] = dataset.target
df_wine['class'] = dataset.target_names[dataset.target]
# Show 10 random samples
df_wine.sample(n=10)

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target,class
54,13.74,1.67,2.25,16.4,118.0,2.6,2.9,0.21,1.62,5.85,0.92,3.2,1060.0,0,class_0
151,12.79,2.67,2.48,22.0,112.0,1.48,1.36,0.24,1.26,10.8,0.48,1.47,480.0,2,class_2
63,12.37,1.13,2.16,19.0,87.0,3.5,3.1,0.19,1.87,4.45,1.22,2.87,420.0,1,class_1
55,13.56,1.73,2.46,20.5,116.0,2.96,2.78,0.2,2.45,6.25,0.98,3.03,1120.0,0,class_0
123,13.05,5.8,2.13,21.5,86.0,2.62,2.65,0.3,2.01,2.6,0.73,3.1,380.0,1,class_1
121,11.56,2.05,3.23,28.5,119.0,3.18,5.08,0.47,1.87,6.0,0.93,3.69,465.0,1,class_1
7,14.06,2.15,2.61,17.6,121.0,2.6,2.51,0.31,1.25,5.05,1.06,3.58,1295.0,0,class_0
160,12.36,3.83,2.38,21.0,88.0,2.3,0.92,0.5,1.04,7.65,0.56,1.58,520.0,2,class_2
106,12.25,1.73,2.12,19.0,80.0,1.65,2.03,0.37,1.63,3.4,1.0,3.17,510.0,1,class_1
90,12.08,1.83,2.32,18.5,81.0,1.6,1.5,0.52,1.64,2.4,1.08,2.27,480.0,1,class_1


### Question
Find the X and y values we're looking for. Notice that y is categorical and thus, we could **one-hot encode it** if we are looking at **class** or we can just pick **target**. In order to one hot encode we have  to re-shape `y` it using the **.get_dummies** function. 


In [4]:
# YOUR CODE HERE
data_filt = df_wine.drop(["target", "class"], axis=1)
X = data_filt
y = df_wine["target"]

y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int32

## Step 2: Normalize the data in X

In [5]:
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[ 1.51861254, -0.5622498 ,  0.23205254, ...,  0.36217728,
         1.84791957,  1.01300893],
       [ 0.24628963, -0.49941338, -0.82799632, ...,  0.40605066,
         1.1134493 ,  0.96524152],
       [ 0.19687903,  0.02123125,  1.10933436, ...,  0.31830389,
         0.78858745,  1.39514818],
       ...,
       [ 0.33275817,  1.74474449, -0.38935541, ..., -1.61212515,
        -1.48544548,  0.28057537],
       [ 0.20923168,  0.22769377,  0.01273209, ..., -1.56825176,
        -1.40069891,  0.29649784],
       [ 1.39508604,  1.58316512,  1.36520822, ..., -1.52437837,
        -1.42894777, -0.59516041]])

### Question

Is it better to store it on numpy or in an updated pandas dataframe? Numpy is more efficient while pandas is more visual. **pick whatever your prefer**

In [6]:
X = pd.DataFrame(X_scaled)
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,1.518613,-0.56225,0.232053,-1.169593,1.913905,0.808997,1.034819,-0.659563,1.224884,0.251717,0.362177,1.84792,1.013009
1,0.24629,-0.499413,-0.827996,-2.490847,0.018145,0.568648,0.733629,-0.820719,-0.544721,-0.293321,0.406051,1.113449,0.965242
2,0.196879,0.021231,1.109334,-0.268738,0.088358,0.808997,1.215533,-0.498407,2.135968,0.26902,0.318304,0.788587,1.395148
3,1.69155,-0.346811,0.487926,-0.809251,0.930918,2.491446,1.466525,-0.981875,1.032155,1.186068,-0.427544,1.184071,2.334574
4,0.2957,0.227694,1.840403,0.451946,1.281985,0.808997,0.663351,0.226796,0.401404,-0.319276,0.362177,0.449601,-0.037874


## Step 3: Train / Test Split

Store training input data in a variable named `x_train` and training targets in a variable named `y_train` with an **80/20 train/test split**.

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, train_size=0.8, random_state=1)

In [8]:
print(f'X_train: {X_train.shape}. y_train: {y_train.shape}')
print(f'Labels: {y_train}')
assert X_train.shape == (142,13)
# only if we hot encode
# assert y_train.shape == (120,3)

X_train: (142, 13). y_train: (142,)
Labels: 91     1
81     1
114    1
48     0
54     0
59     1
165    2
39     0
56     0
44     0
78     1
33     0
18     0
58     0
127    1
172    2
148    2
12     0
90     1
95     1
28     0
124    1
135    2
66     1
123    1
17     0
164    2
89     1
170    2
13     0
      ..
1      0
57     0
22     0
61     1
63     1
7      0
174    2
141    2
86     1
96     1
68     1
50     0
142    2
157    2
156    2
139    2
146    2
101    1
20     0
25     0
134    2
71     1
129    1
144    2
79     1
133    2
137    2
72     1
140    2
37     0
Name: target, Length: 142, dtype: int32


In [9]:
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(X_train))
print('Number of observations in the test data:',len(X_test))

Number of observations in the training data: 142
Number of observations in the test data: 36


## Step 4: Training a model

Train a model on the data to obtain a training accuracy > 93%. Store the training history in a variable named `history`.

For example, we can use **RandomForestClassifier** model to classify Irises. N_estimators increases model accuracy. Other options include:
- K-Nearest Neighbors
- Support Vector Machines
- Decision Tree Classifiers
- Naive Bayes
- Linear Discriminant Analysis
- Logistic Regression


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error

In [11]:
clf = RandomForestClassifier(n_estimators = 10000).fit(X_train, y_train)

Now we can measure the accuracy with our **test set**

In [12]:
clf.score(X_test, y_test)

0.9722222222222222

And now we can predict what a new input would look like...

In [13]:
clf.predict([[20,10,5,7,8,10,11,12,6,9,3,2,1]])

array([0])

can you build a table with each predicted value and its corresponding category in string?

In [14]:
prediction = clf.predict(X_test)

cat_pred = []

for e in prediction:
    if e == 0:
        cat_pred.append("class 0")
    elif e == 1:
        cat_pred.append("class 1")
    else:
        cat_pred.append("class 2")
        
for j in zip(prediction, cat_pred):
    print(j)

(2, 'class 2')
(1, 'class 1')
(0, 'class 0')
(1, 'class 1')
(0, 'class 0')
(2, 'class 2')
(1, 'class 1')
(0, 'class 0')
(2, 'class 2')
(1, 'class 1')
(0, 'class 0')
(0, 'class 0')
(1, 'class 1')
(0, 'class 0')
(1, 'class 1')
(1, 'class 1')
(2, 'class 2')
(0, 'class 0')
(1, 'class 1')
(0, 'class 0')
(0, 'class 0')
(1, 'class 1')
(2, 'class 2')
(0, 'class 0')
(0, 'class 0')
(2, 'class 2')
(0, 'class 0')
(0, 'class 0')
(0, 'class 0')
(2, 'class 2')
(1, 'class 1')
(2, 'class 2')
(2, 'class 2')
(0, 'class 0')
(1, 'class 1')
(1, 'class 1')
