In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from matplotlib import rcParams
import warnings


In [10]:
# Suppress every warning for the rest of the session; note that this also
# hides deprecation notices raised by the imported libraries.
warnings.filterwarnings("ignore")

In [11]:
# Default matplotlib figure size in inches, given as (width, height).
rcParams["figure.figsize"] = 10, 6
# Seed NumPy's global random number generator so random draws repeat across runs.
np.random.seed(42)

In [16]:
def info_about_columns(dataframe, data_science_descriptions=None):
    '''
        Build a per-column summary of a dataframe.

        The result has one row per column of ``dataframe`` and these columns:
        'DataType', '# of Categories' (number of unique values),
        'categories/sample ratio' (unique values as a rounded percentage of
        the number of rows), 'Data Science Type', 'missing values' (NaN count)
        and 'missing values %'.

        input:
            dataframe                  The dataframe whose columns are summarised
            data_science_descriptions  Optional Series indexed by column name that
                                       holds the data science type of each column.
                                       When None, every column is labelled "NA".

        returns: A dataframe indexed by the column names of ``dataframe``
    '''

    # Identity check on purpose: `== None` on a Series compares element-wise,
    # and using that result in an `if` raises "truth value is ambiguous".
    if data_science_descriptions is None:
        data_science_descriptions = pd.Series(
            "NA", index=dataframe.columns, dtype=object)

    # Computed once and reused for both the absolute and the percentage columns.
    row_count = len(dataframe)
    unique_counts = dataframe.nunique()
    missing_counts = dataframe.isna().sum()

    dataframe_info_about_columns = pd.concat([
          dataframe.dtypes,
          unique_counts,
          (unique_counts * 100 / row_count).round(),
          data_science_descriptions,
          missing_counts,
          missing_counts * 100 / row_count], axis=1)

    dataframe_info_about_columns.columns = [
                                     'DataType',
                                     '# of Categories',
                                     'categories/sample ratio',
                                     'Data Science Type',
                                     'missing values',
                                     'missing values %']

    return dataframe_info_about_columns

In [17]:
from IPython.display import display, HTML

def show_examples_of_data(dataframe, data_information, catgeogry_cutoff):
    '''
       purpose: Show example values for each column so the data scientist can
                get an understanding of the data.

       input:
          dataframe          The data frame that contains the dataset
          data_information   Per-column summary indexed by column name; it must
                             contain a '# of Categories' column
          catgeogry_cutoff   Largest number of unique values for which all the
                             unique values are listed; columns with more unique
                             values are summarised as "<min> to <max>"

       returns: A data frame with the columns "Field" (column name) and
                "Value" (array of unique values or the "<min> to <max>" text)
    '''

    # Collect one record per column and build the frame once, instead of
    # growing it row by row.
    records = []
    for column, category_count in data_information['# of Categories'].items():
        if category_count <= catgeogry_cutoff:
            value = dataframe[column].unique()
        else:
            value = str(dataframe[column].min()) + " to " + str(dataframe[column].max())
        records.append({"Field": column, "Value": value})

    return pd.DataFrame(records, columns=["Field", "Value"])


## Load Dataset

> * Load the pima_indians_diabetes dataset
> * Check the first five samples. Are there any categorical features?
> * Is this a classification or regression problem?
Check the columns of the dataset

In [32]:
# Read the raw dataset from the CSV file (path is relative to the working
# directory) and preview its first five rows.
original_df = pd.read_csv("pima_indians_diabetes.csv")
original_df.head(5)

Unnamed: 0,time_pregnant_no,plasma_concentration,diastolic_blood_pressure,triceps_skinfold_thickness,serum_insulin,bmi,diabetes_pedigree,age,class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [21]:
# Build the per-column summary (dtype, unique-value counts, NaN counts).
# None is passed because no data science type descriptions are available, so
# the 'Data Science Type' column is filled with a placeholder.
info = info_about_columns(original_df, None)
info

Unnamed: 0,DataType,# of Categories,categories/sample ratio,Data Science Type,missing values,missing values %
time_pregnant_no,int64,17,2.0,,0,0.0
plasma_concentration,int64,136,18.0,,0,0.0
diastolic_blood_pressure,int64,47,6.0,,0,0.0
triceps_skinfold_thickness,int64,51,7.0,,0,0.0
serum_insulin,int64,186,24.0,,0,0.0
bmi,float64,248,32.0,,0,0.0
diabetes_pedigree,float64,517,67.0,,0,0.0
age,int64,52,7.0,,0,0.0
class,int64,2,0.0,,0,0.0


In [26]:
# Render example values per column as an HTML table. With a cutoff of 47,
# columns with at most 47 unique values are listed in full and the remaining
# columns are summarised as "min to max".
display(HTML(show_examples_of_data(original_df, info, 47).to_html()))

Unnamed: 0,Field,Value
0,time_pregnant_no,"[6, 1, 8, 0, 5, 3, 10, 2, 4, 7, 9, 11, 13, 15, 17, 12, 14]"
1,plasma_concentration,0 to 199
2,diastolic_blood_pressure,"[72, 66, 64, 40, 74, 50, 0, 70, 96, 92, 80, 60, 84, 30, 88, 90, 94, 76, 82, 75, 58, 78, 68, 110, 56, 62, 85, 86, 48, 44, 65, 108, 55, 122, 54, 52, 98, 104, 95, 46, 102, 100, 61, 24, 38, 106, 114]"
3,triceps_skinfold_thickness,0 to 99
4,serum_insulin,0 to 846
5,bmi,0.0 to 67.1
6,diabetes_pedigree,0.078 to 2.42
7,age,21 to 81
8,class,"[1, 0]"


#### Check the first five samples. Are there any categorical features?
The only categorical feature is the dependent variable: class

In [36]:
# Count the zero-valued entries in each column. The column summary reports no
# NaN values, so zeros are inspected here as possible stand-ins for missing
# measurements.
# NOTE(review): zeros are presumably legitimate for time_pregnant_no and class
# - confirm before treating them as missing.
original_df.eq(0).sum(axis=0)

time_pregnant_no              111
plasma_concentration            5
diastolic_blood_pressure       35
triceps_skinfold_thickness    227
serum_insulin                 374
bmi                            11
diabetes_pedigree               0
age                             0
class                         500
dtype: int64

2. Data Preprocessing and Model
> * Assign features to X and target variable to y
> * Create a pipeline for preprocessing which includes imputing and standardscaling
> * Create a pipeline for preprocessor and the random forest classifier. Set n_estimators=100
> * Train the model and make your predictions on the test dataset
> * List the feature importance and the corresponding features
> * Create a bar plot for the feature importances
> * Drop the least important feature from the data and repeat the steps above

In [37]:
# Preprocessing pipeline for numerical columns: median imputation followed by
# standardisation.
# NOTE(review): SimpleImputer treats NaN as missing by default, and the column
# summary reports no NaN in this dataset, so the imputer appears to leave the
# data unchanged - confirm whether zeros should be converted to NaN first.
# NOTE(review): the step name 'sclaler' looks like a typo for 'scaler'; it is
# left as-is because other cells may refer to the step by this name.
numerical_transformer = Pipeline(steps=[ 
    ('imputer', SimpleImputer(strategy='median')), 
    ('sclaler', StandardScaler()) ])
numerical_transformer