# Advanced Certification in AIML
## A Program by IIIT-H and TalentSprint
## Not for grading

## Dataset 

#### Description
The Iris dataset consists of 150 data instances. There are 3 classes (Iris Versicolor, Iris Setosa and Iris Virginica), each with 50 instances.


For each flower we have the following attributes:

- sepal length in cm
- sepal width in cm
- petal length in cm
- petal width in cm

To simplify the experiment, we replace the class names with numeric labels:

    "0": setosa
    "1": versicolor
    "2": virginica

In [None]:
!wget https://cdn.talentsprint.com/aiml/Experiment_related_data/Iris.csv

### Import required packages

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import resample
from sklearn import metrics

### Load the data

In [None]:
# Read the Iris CSV, discard the "Id" column, and preview the first rows
iris = pd.read_csv("Iris.csv").drop(columns="Id")
iris.head()


In [None]:
# List the distinct values found in the "species" column
iris["species"].unique()

In [None]:
# Replace each species name with its integer class label.
# An unknown species name raises KeyError instead of being silently skipped.
converter = {
    "Iris-setosa": 0,
    "Iris-versicolor": 1,
    "Iris-virginica": 2,
}
iris["species"] = iris["species"].map(lambda name: converter[name])

In [None]:
# Hold out 20% of the rows as test data; the fixed seed keeps the split repeatable
train_data, test_data = train_test_split(iris, random_state=42, test_size=0.2)
train_data.shape[0], test_data.shape[0]

### Sampling with replacement

In [None]:
# Function to create subsets by sampling with replacement.
# nTimes = No. of Subsets; howmany = No. of samples in a subset

def select_samples(nTimes, howmany, train_data, random_state=None):
    """Draw ``nTimes`` subsets of ``howmany`` rows each, with replacement.

    For every subset a one-line summary is printed with its size, the number
    of unique rows and the number of repeated rows. Uniqueness is decided by
    row *values* (``np.unique(..., axis=0)``), so two different source rows
    that hold identical values are counted as one unique sample.

    Parameters
    ----------
    nTimes : int
        Number of subsets to create.
    howmany : int
        Number of rows drawn for each subset.
    train_data : DataFrame or array-like
        Data to sample rows from.
    random_state : int, numpy.random.RandomState or None, optional
        Seed or generator for reproducible draws. ``None`` (the default)
        keeps the original behaviour of using NumPy's global random state.

    Returns
    -------
    list
        The ``nTimes`` resampled subsets, in the order they were drawn.
    """
    # Build one generator up front and share it across all draws: passing the
    # same integer seed to every resample() call would yield identical subsets.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    subsets = []
    for i in range(nTimes):
        subset_i = resample(train_data, n_samples=howmany, replace=True, random_state=rng)
        subsets.append(subset_i)

        # Count the unique rows once and derive the repeats from that count
        unique_samples = len(np.unique(subset_i, axis=0))
        repeated_samples = len(subset_i) - unique_samples

        print("D%d has %d samples in which %d are unique samples and %d are repeated samples" %(i,len(subset_i), unique_samples, repeated_samples))
    return subsets

In [None]:
# Draw 5 subsets from the training data, each containing 120 samples
subsets = select_samples(nTimes=5, howmany=120, train_data=train_data)

In [None]:
# Initialize the Decision tree
# max_depth=2 caps the depth of the tree at 2. This one estimator object is
# passed to DT_subset for every subset, where it is fitted again each time.
decision_tree = DecisionTreeClassifier(max_depth=2)

In [None]:
# Classify each subset using Decision tree
def DT_subset(train_data, test_data, model):
    """Fit ``model`` on ``train_data`` and measure its accuracy on ``test_data``.

    Both inputs are split positionally: every column except the last is used
    as a feature and the last column is used as the label.

    Parameters
    ----------
    train_data : DataFrame
        Rows used to fit the model.
    test_data : DataFrame
        Rows used to evaluate the fitted model.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place; no copy is made.

    Returns
    -------
    tuple
        ``(model, score)`` where ``model`` is the same object that was passed
        in, now fitted on ``train_data``, and ``score`` is the accuracy on
        ``test_data``.
    """
    # Extract features and labels of the train_data and test_data
    X_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]
    X_test, y_test = test_data.iloc[:, :-1], test_data.iloc[:, -1]

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # accuracy_score expects (y_true, y_pred), in that order
    score = metrics.accuracy_score(y_test, y_pred)

    return model, score

In [None]:
# Fit the tree on every subset in turn and report its accuracy on the test data
for subset_no, subset in enumerate(subsets):
    model, score = DT_subset(subset, test_data, decision_tree)
    print("Accuracy for {} subset: {}".format(subset_no, score))

In [None]:
print("Actuals and Predictions of 30 test samples")

# The last column holds the labels; every other column is a feature.
# Both are the same for every subset, so they are extracted once here.
test_labels = test_data.iloc[:, -1].astype(int)
test_features = test_data.iloc[:, :-1]

# Create a dictionary for storing the labels
labels_30 = {"actual_values": test_labels}

# Get the prediction labels of the test samples for all subsets.
# Iterating over `subsets` directly (numbered from 1) keeps this cell correct
# if the number of subsets created earlier changes.
for i, subset in enumerate(subsets, start=1):
    # DT_subset refits decision_tree on this subset, so the predictions below
    # come from the model trained on subset i.
    model, score = DT_subset(subset, test_data, decision_tree)
    print("Subset_",i, "Accuracy is", score)
    y_pred30 = model.predict(test_features)
    pred_30 = y_pred30.astype(int)
    labels_30["subset"+ str(i)] = pred_30

# Create a dataframe of the test samples with actuals and predictions of all subsets
df_test = pd.DataFrame(labels_30)
df_test