In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
# Render matplotlib figures inline, directly below the cell that creates them
%matplotlib inline

In [3]:
import random 
# pprint pretty-prints nested data structures, which keeps the tree easy to read when it is displayed
from pprint import pprint

In [4]:
# Load and Prepare Data
# Expected format of the data:
# - The last column of the data frame must contain the label,
#   and that column must be named "label".
# - The data frame must not contain any missing values.


# The Pandas module is used for working with tabular data. It allows us to work with data in table form, such as in CSV or SQL database formats. We can also create tables of our own, and edit or add columns or rows to tables. Pandas provides us with some powerful objects like DataFrames and Series which are very useful for working with and analyzing data.

# The Numpy module is mainly used for working with numerical data. It provides us with a powerful object known as an Array. With Arrays, we can perform mathematical operations on multiple values in the Arrays at the same time, and also perform operations between different Arrays, similar to matrix operations.

# Last, but not least, the Matplotlib module is used for data visualization. It provides functionality for us to draw charts and graphs, so that we can better understand and present the data visually.

# pandas provides high-level data manipulation tools built on top of NumPy. NumPy by itself is a fairly low-level tool, and working with it is much like working in MATLAB. pandas, on the other hand, provides rich time series functionality, data alignment, NA-friendly statistics, groupby, merge and join methods, and many other conveniences.


In [5]:
# Load the Iris data set, drop the 'Id' column (it provides no useful
# information) and rename the 'Species' column to 'label'.
df = (
    pd.read_csv("Iris.csv")
    .drop(columns="Id")
    .rename(columns={"Species": "label"})
)
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
# Confirm that the data frame has no missing values.
# df.info() reports 150 entries in total, and every column shows
# 150 non-null values, so nothing is missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SepalLengthCm  150 non-null    float64
 1   SepalWidthCm   150 non-null    float64
 2   PetalLengthCm  150 non-null    float64
 3   PetalWidthCm   150 non-null    float64
 4   label          150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [13]:
# Train Test Split
def train_test_split(df, test_size):
    """Randomly split ``df`` into a training frame and a test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to split. Its index does not have to be unique.
    test_size : int or float
        An ``int`` is taken as the number of rows to place in the test set.
        A ``float`` is taken as a proportion of ``len(df)`` and is rounded to
        a whole number of rows (e.g. 0.10 of 150 rows becomes 15).

    Returns
    -------
    tuple of (train_df, test_df)
        ``test_df`` holds the sampled rows in sampling order; ``train_df``
        holds every remaining row in its original order.

    Raises
    ------
    ValueError
        If the resolved test size is negative or larger than ``len(df)``.
    """
    n_rows = len(df)

    # Keep the function flexible: a float is a proportion of the rows,
    # anything else is already a row count.
    if isinstance(test_size, float):
        test_size = round(test_size * n_rows)

    if not 0 <= test_size <= n_rows:
        raise ValueError(
            f"test_size resolves to {test_size} rows, "
            f"but it must be between 0 and {n_rows}"
        )

    # Sample row *positions* instead of index labels. With duplicated index
    # labels, label-based selection would pull every row sharing a sampled
    # label into the test set and drop all of them from the training set.
    test_positions = random.sample(range(n_rows), k=test_size)
    held_out = set(test_positions)
    train_positions = [pos for pos in range(n_rows) if pos not in held_out]

    test_df = df.iloc[test_positions]
    train_df = df.iloc[train_positions]
    return train_df, test_df

In [19]:
# Seed Python's random module so the random.sample call inside
# train_test_split selects the same rows on every run.
random.seed(0)
# An integer test_size is the number of rows held out for testing.
train_df, test_df = train_test_split(df, test_size = 20)

In [22]:
# Preview the first rows of the held-out test set.
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
98,5.1,2.5,3.0,1.1,Iris-versicolor
107,7.3,2.9,6.3,1.8,Iris-virginica
10,5.4,3.7,1.5,0.2,Iris-setosa
66,5.6,3.0,4.5,1.5,Iris-versicolor
130,7.4,2.8,6.1,1.9,Iris-virginica


In [40]:
# Work with a NumPy array from here on, as it is faster than a pandas DataFrame.
# The .values attribute returns the DataFrame's contents as a 2D NumPy array.
data = train_df.values 
# Show the first five rows (positions 0 through 4).
data[0:5]

array([[5.1, 3.5, 1.4, 0.2, 'Iris-setosa'],
       [4.9, 3.0, 1.4, 0.2, 'Iris-setosa'],
       [4.7, 3.2, 1.3, 0.2, 'Iris-setosa'],
       [4.6, 3.1, 1.5, 0.2, 'Iris-setosa'],
       [5.0, 3.6, 1.4, 0.2, 'Iris-setosa']], dtype=object)

In [45]:
### Data pure - Check if a partition contains one class (pure) or if 
# it contains multiple classes (not pure)
def check_purity(data):
    """Report whether every row of ``data`` carries the same class label.

    ``data`` is a 2D array whose last column holds the labels. Returns True
    when exactly one distinct label is present, and False otherwise.
    """
    # The labels sit in the last column; collapse them to the distinct values.
    distinct_labels = np.unique(data[:, -1])
    return len(distinct_labels) == 1


In [47]:
# The full training set holds more than one class, so it is not pure.
check_purity(train_df.values)

False

In [49]:
# Returns true as only one class with this condition
check_purity(train_df[train_df.PetalWidthCm < 0.8].values)

True

In [56]:
### Classify 

def classify_data(data):
    """Return the class label that occurs most often in ``data``.

    ``data`` is a 2D array whose last column holds the labels.
    """
    # With return_counts=True, np.unique yields the distinct labels together
    # with the number of times each one occurs.
    labels, label_counts = np.unique(data[:, -1], return_counts=True)

    # argmax gives the position of the largest count (the first one on a tie),
    # which is then used to look up the matching label.
    return labels[np.argmax(label_counts)]


In [66]:
# Most common class among the training rows with 0.8 < PetalWidthCm < 2.0.
classify_data(train_df[(train_df.PetalWidthCm > 0.8) & (train_df.PetalWidthCm < 2.0)].values)

'Iris-versicolor'