# Import libraries & CSV

In [None]:
# source: Aurelien Geron, Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow : Concepts, Tools
# and Techniques to Build Intelligent Systems
import pandas as pd
import numpy as np
%matplotlib inline 
import matplotlib.pyplot as plt

#### Variable Naming Convention/Abbreviation
*ADS* : Assignment Data Set \
*Cat* : Categorical \
*Num* : Numerical 

# Read CSV

In [None]:
# Load the assignment data set (ADS) from a CSV file; the path is relative to
# the notebook's working directory.
ADS = pd.read_csv("dataset.csv")

In [95]:
# Preview the last three rows of the loaded frame.
ADS.tail(3)

Unnamed: 0,Gender,Age,Height(cm),Weight(kg),Family_history,Alcohol,Junk_food,Vege_day,Meals_day,Snack,Smoking,Water_intake(L),Transportation,Exercise,TV,Income,Discipline,Cardiovascular_risk(y)
2097,Male,19,176.0,79.0,yes,medium,yes,2,3,Frequently,no,3.0,bus,1,often,4347,no,medium
2098,Male,22,170.0,95.6,yes,none,yes,2,3,Sometimes,no,2.0,bus,0,rare,1376,no,high
2099,Female,37,166.7,81.0,yes,none,yes,2,3,Sometimes,no,1.65,car,0,moderate,9051,no,medium


# Overview of Dataset

In [None]:
# Print a per-column summary of the frame (non-null counts and dtypes).
ADS.info()

In [None]:
# List the dtype of every column; `object` columns are the categorical ones.
ADS.dtypes

#### Check for null values in Dataset
Finding:
There are no missing values

In [None]:
# For each column, report whether it contains at least one missing value
# (True = the column has nulls).
ADS.isna().any()

# Check the qualitative data for each object data type
This helps validate the total number of columns in the training dataset after these values are encoded.

In [None]:
# Object-dtype columns, in the order they are numbered in the printed report.
# The last entry is the column that is later split off as the model output.
object_columns = [
    'Gender', 'Family_history', 'Alcohol', 'Junk_food', 'Snack',
    'Smoking', 'Transportation', 'TV', 'Discipline', 'Cardiovascular_risk(y)',
]

print("Number of unique entry for each attribute:")
# One line per column: its 1-based position in the list and its number of
# distinct values. A single loop replaces ten copy-pasted print statements
# while producing exactly the same output.
for position, column in enumerate(object_columns, start=1):
    print(f'Col {position}. ', ADS[column].nunique())


### Separate input from output

In [144]:
# Model inputs: every column of the data set except the target label.
ADS_input = ADS.drop(columns=['Cardiovascular_risk(y)'])
print("Input Shape :", ADS_input.shape)
print("\nInput Data types:\n", ADS_input.dtypes)

Input Shape : (2100, 17)

Input Data types:
 Gender              object
Age                  int64
Height(cm)         float64
Weight(kg)         float64
Family_history      object
Alcohol             object
Junk_food           object
Vege_day             int64
Meals_day            int64
Snack               object
Smoking             object
Water_intake(L)    float64
Transportation      object
Exercise             int64
TV                  object
Income               int64
Discipline          object
dtype: object


In [147]:
# Model output: the target label column on its own, as a Series.
ADS_output = ADS.loc[:, 'Cardiovascular_risk(y)']
print("Output Shape :", ADS_output.shape)
print("\nOutput datatype:\n", ADS_output.dtype)

Output Shape : (2100,)

Output datatype:
 object


# Preprocessing
### Training and Testing


In [143]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# identical from run to run.
in_train, in_test, out_train, out_test = train_test_split(
    ADS_input,
    ADS_output,
    test_size=0.2,
    random_state=30,
)

# Report the shape of each of the four pieces produced by the split.
for label, part in (
    ("in_train", in_train),
    ("in_test", in_test),
    ("out_train", out_train),
    ("out_test", out_test),
):
    print(f"{label} :", part.shape)

in_train : (1680, 17)
in_test : (420, 17)
out_train : (1680,)
out_test : (420,)


### Separating Categorical and Numerical Columns

In [None]:
# Names of the categorical (object-dtype) input columns, listed once so the
# numerical and categorical selections below cannot drift apart.
categorical_columns = [
    'Gender', 'Family_history', 'Alcohol', 'Junk_food', 'Snack',
    'Smoking', 'Transportation', 'TV', 'Discipline',
]

# Numerical part: the training inputs with the categorical columns removed.
in_train_Num = in_train.drop(columns=categorical_columns)
# Categorical part: only the categorical columns, in the order listed above.
in_train_Cat = in_train[categorical_columns]

### Visualize Numerical Data

In [None]:
# Draw one histogram per numerical training column.
# pandas creates the figure and the grid of subplots itself. Handing a single
# pre-made Axes to DataFrame.hist for several columns makes pandas clear that
# figure and emit a UserWarning, so the size is given through `figsize` instead.
in_train_Num.hist(
    bins=20,
    figsize=(10, 10),
    grid=False,
    edgecolor='black',
    facecolor='xkcd:salmon',
)
plt.show()

### Standardize data

For numerical


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn each numerical column's mean and standard deviation from the training
# split, then standardize that same split with the learned statistics.
scaler = StandardScaler()
scaler.fit(in_train_Num)
in_train_Num_scaled = scaler.transform(in_train_Num)

For categorical features,\
*Label Binarizer* and *Ordinal Encoder* do not apply, so *One-Hot Encoder* is used

In [135]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode every categorical training column. The encoder is fitted on
# the training split, and fit_transform returns a sparse matrix.
encoder = OneHotEncoder()
in_train_Cat_sparse = encoder.fit_transform(in_train_Cat)

# Label the dummy columns with consecutive integers starting at 10. The number
# of labels is taken from the encoded matrix rather than hard-coded, so the
# cell keeps working if the split produces a different number of categories
# (a fixed range(10, 36) raises ValueError unless there are exactly 26).
first_label = 10
x = list(range(first_label, first_label + in_train_Cat_sparse.shape[1]))

# Densify so the result can be inspected as an ordinary DataFrame.
in_train_Cat_Encoded = pd.DataFrame(in_train_Cat_sparse.toarray(), columns=x)

### Check encoded categorical values

In [None]:
# Display the one-hot encoded training frame for a visual check.
in_train_Cat_Encoded

Combining categorical and numerical data

In [140]:
# Place the scaled numerical block and the one-hot encoded block side by side,
# i.e. join them along the column axis.
in_train_complete = np.concatenate(
    (in_train_Num_scaled, in_train_Cat_Encoded),
    axis=1,
)
print("Shape of final set (rows, columns) : ", in_train_complete.shape)

Shape of final set (rows, columns) :  (1680, 34)


# Voila, Done Preprocessing 
#### Footnote (When training your model) #by Bester
Splitting the columns of `in_train` produces:
1. `in_train_Num`
2. `in_train_Cat`

(Check for any mistake)

The final training set will be `in_train_complete`.


# References

1. [Category data](https://www.datacamp.com/tutorial/categorical-data)
2. [Counting unique values in a column in pandas dataframe like in Qlik?](https://stackoverflow.com/questions/45759966/counting-unique-values-in-a-column-in-pandas-dataframe-like-in-qlik)
3. [Dataframe Drop Column in Pandas – How to Remove Columns from Dataframes](https://www.freecodecamp.org/news/dataframe-drop-column-in-pandas-how-to-remove-columns-from-dataframes/)
4. [How to Change the Figure Size of a Pandas Histogram](https://www.statology.org/pandas-histogram-size/)
5. [Specifying Colors](https://matplotlib.org/3.1.1/tutorials/colors/colors.html)
6. [How to perform one hot encoding on multiple categorical columns](https://datascience.stackexchange.com/questions/71804/how-to-perform-one-hot-encoding-on-multiple-categorical-columns)