# Day 2

## Import necessary packages

In [37]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder

## Work with the dataset

In [38]:
# Load the small example spreadsheet into a DataFrame.
data = pd.read_excel(io='resources/test_data.xlsx')
# Preview the top of the frame; head() shows at most n rows (5 is also the default).
data.head(n=5)

Unnamed: 0,f1,f2,f3,f4
0,1,2,3.0,4.0
1,5,6,,7.0
2,0,6,9.0,


In [39]:
# Per-column count of missing (NaN) entries; isna() is the same check as isnull().
data.isna().sum()

f1    0
f2    0
f3    1
f4    1
dtype: int64

## Handle the missing (NaN) values

In [40]:
# Return a copy without any row that contains at least one missing value.
# The result is only displayed; `data` itself is left unchanged.
data.dropna(axis=0, how='any')

Unnamed: 0,f1,f2,f3,f4
0,1,2,3.0,4.0


In [41]:
# Return a copy without any column that contains at least one missing value.
# The result is only displayed; `data` itself is left unchanged.
data.dropna(axis='columns')

Unnamed: 0,f1,f2
0,1,2
1,5,6
2,0,6


## Fill in the missing values using `SimpleImputer`

In [42]:
# Mean imputation: every NaN is replaced by the mean of the observed values in its column.
imr = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit() learns the per-column means and returns the imputer itself, so transform() can be
# chained directly; the transformed result is printed below as a plain array.
imputed_data = imr.fit(data).transform(data)
# Show the frame before and after imputation side by side.
print(f"Original dataset:\n {data}\n\nImputed dataset:\n {imputed_data}")

Original dataset:
    f1  f2   f3   f4
0   1   2  3.0  4.0
1   5   6  NaN  7.0
2   0   6  9.0  NaN

Imputed dataset:
 [[1.  2.  3.  4. ]
 [5.  6.  6.  7. ]
 [0.  6.  9.  5.5]]


## Work with the categorical variables in the .csv file

In [43]:
# The iris CSV has no header row; header=None stops pandas from consuming the first
# record as column names and assigns integer column labels instead.
data = pd.read_csv(filepath_or_buffer='resources/iris.csv', header=None)
data.head(n=5)

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [44]:
# Replace the default integer column labels with descriptive names
# (four measurements followed by the species label).
data.columns = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]
data.head(n=10)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [45]:
## Replace the class labels in the dataset with integers using an explicit label-to-integer mapping

In [46]:
# List the distinct values present in the class column (np.unique returns them sorted).
np.unique(data['class'].to_numpy())

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [47]:
# Explicit lookup from each iris species label to its integer code.
mapping = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}

# Series.map(dict) turns every value that is not a key of the dict into NaN. Because the
# result overwrites the same column, re-running this cell once the labels are already
# integers would wipe the column out. Only apply the mapping while the column still
# holds the raw (non-numeric) labels, which keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['class']):
    data['class'] = data['class'].map(mapping)
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [50]:
# LabelEncoder assigns an integer code to each distinct value it is fitted on.
le = LabelEncoder()
# Learn the distinct values of the class column first ...
le.fit(data['class'])
# ... then store the encoded values in a new column named 'data'.
# NOTE(review): `class` was already converted to integers by the mapping cell above, so the
# encoder is fitted on integer codes rather than the original species strings - confirm
# this is intended.
data['data'] = le.transform(data['class'])
data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class,data
0,5.1,3.5,1.4,0.2,0,0
1,4.9,3.0,1.4,0.2,0,0
2,4.7,3.2,1.3,0.2,0,0
3,4.6,3.1,1.5,0.2,0,0
4,5.0,3.6,1.4,0.2,0,0
