## Lab data-cleaning-challenge

#### Imports

In [15]:
import numpy as np
import pandas as pd
import warnings
import time
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

#### Functions

In [16]:
# Categorical data analyser
def cat_var(df, cols):
    '''
    Return: a Pandas dataframe object with the following columns:
        - "categorical_variable" => every categorical variable include as an input parameter (string).
        - "number_of_possible_values" => the amount of unique values that can take a given categorical variable (integer).
        - "values" => a list with the posible unique values for every categorical variable (list).

    Input parameters:
        - df -> Pandas dataframe object: a dataframe with categorical variables.
        - cols -> list object: a list with the name (string) of every categorical variable to analyse.
    '''
    cat_list = []
    for col in cols:
        cat = df[col].unique()
        cat_num = len(cat)
        cat_dict = {"categorical_variable":col,
                    "number_of_possible_values":cat_num,
                    "values":cat}
        cat_list.append(cat_dict)
    df = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values", ascending=False)
    return df.reset_index(drop=True)

### Study the iris dataset

In [17]:
# Extract Dataset
iris_df = pd.read_csv('iris-data.csv')
col_iris = list(iris_df.columns)
print(iris_df.shape)
iris_df.head()

(150, 5)


Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [18]:
# Dataset info
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  150 non-null    float64
 1   sepal_width_cm   150 non-null    float64
 2   petal_length_cm  150 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [19]:
iris_df.describe()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm
count,150.0,150.0,150.0,145.0
mean,5.644627,3.054667,3.758667,1.236552
std,1.312781,0.433123,1.76442,0.755058
min,0.055,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.4
50%,5.7,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [20]:
# Features analysis
cat_iris = cat_var(iris_df, col_iris)
cat_iris

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,petal_length_cm,43,"[1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1.0, 1.9, ..."
1,sepal_length_cm,38,"[5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.4, 4.8, 5.7, ..."
2,sepal_width_cm,23,"[3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, ..."
3,petal_width_cm,23,"[0.2, 0.4, 0.3, nan, 0.1, 0.5, 0.6, 1.4, 1.5, ..."
4,class,5,"[Iris-setosa, Iris-setossa, Iris-versicolor, v..."


In [21]:
# Count the number of item per class type
dup_counts = iris_df.pivot_table(index=['class'], aggfunc='size')
dup_counts

class
Iris-setosa        49
Iris-setossa        1
Iris-versicolor    45
Iris-virginica     50
versicolor          5
dtype: int64

### Cleaning

As can be seen, the dataset has 150 records and 5 characteristics, 4 of them are numerical and the last one is categorical. Some errors have been observed that need to be corrected: 
- It includes 5 NaN values in the 'petal_width_cm' column. Since there are only 5 values out of 150, I think the best way to correct these values will be to remove them from the dataset. 
- Class names appear to be incorrect in some cases: 'Iris-setossa' (there is one more 's') and 'versicolor' where 'Iris-' is not included as in the others.

#### NaN correction

In [22]:
iris_clean_df = iris_df.dropna()
iris_clean_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  145 non-null    float64
 1   sepal_width_cm   145 non-null    float64
 2   petal_length_cm  145 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            145 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.8+ KB


#### Class correction

In [23]:
def class_correction(value):
    if value == 'Iris-setossa':
        return 'Iris-setosa'
    elif value == 'versicolor':
        return 'Iris-versicolor'
    return value

In [11]:
warnings.filterwarnings("ignore")
iris_clean_df['class'] = iris_clean_df['class'].apply(class_correction)
iris_clean_df

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [25]:
# Count the number of item per class type
dup_counts = iris_clean_df.pivot_table(index=['class'], aggfunc='size')
dup_counts

class
Iris-setosa        44
Iris-setossa        1
Iris-versicolor    45
Iris-virginica     50
versicolor          5
dtype: int64

### Encoding Categorical Data

There is only 3 caterogical data in the class column

In [26]:
warnings.filterwarnings("ignore")
iris_enc_df = iris_clean_df   # Create an empty dataframe

enc_label = LabelEncoder()
iris_enc_df['class'] = enc_label.fit_transform(iris_enc_df['class'])
iris_enc_df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


### Scaling

In [27]:
# Obtain the columns name
columns_name = iris_enc_df.columns

# Scaling
transformer_standard = StandardScaler().fit(iris_enc_df)
iris_standard = transformer_standard.transform(iris_enc_df)

# Prepare the dataframe
iris_standard_df = pd.DataFrame(iris_standard)
iris_standard_df.columns = columns_name
iris_standard_df.head()

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,-0.431408,1.046588,-1.402789,-1.377569,-1.395622
1,-0.582699,-0.106567,-1.402789,-1.377569,-1.395622
2,-0.73399,0.354695,-1.460362,-1.377569,-1.395622
3,-0.809635,0.124064,-1.345216,-1.377569,-1.395622
4,-0.507054,1.277219,-1.402789,-1.377569,-1.395622


### Store the dataframe

In [53]:
iris_standard_df.to_csv('iris-data_clean.csv')