# Importing the Libraries

##### In order to perform data preprocessing using Python, we need to import some predefined Python libraries, each of which performs a specific job. There are three core libraries that we will use for data preprocessing: NumPy, Pandas and Matplotlib. Seaborn and TensorFlow are imported as well, for the plots and the neural network used later.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import tensorflow as tf

# Importing the Dataset

##### Now, we need to import the dataset which we have collected for our classification project. To do so, we use the read_csv() function of the pandas library, which reads a CSV file into a DataFrame on which various operations can then be performed.

In [None]:
# Read the Iris CSV with header=None (no row is treated as a header) and
# attach descriptive column names afterwards.
column_names = ["Sepal length", "Sepal width", "Petal length", "Petal width", "Species"]
dataset = pd.read_csv("iris.data.csv", header=None)
dataset.columns = column_names

# Feature matrix: the first four columns; target vector: the last column.
feature_frame = dataset.iloc[:, :4]
target_series = dataset.iloc[:, -1]
X, y = feature_frame.values, target_series.values

In [None]:
# Show the dataset dimensions as a (rows, columns) tuple.
dataset_shape = dataset.shape
print(dataset_shape)

In [None]:
# Preview the leading records of the dataset.
leading_rows = dataset.head()
print(leading_rows)

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called directly; wrapping it in print() would emit a stray "None".
dataset.info()

In [None]:
# Descriptive statistics of the numeric columns.
summary_statistics = dataset.describe()
print(summary_statistics)

In [None]:
# Number of distinct values found in every column.
distinct_counts = dataset.nunique()
print(distinct_counts)

In [None]:
# List the distinct class labels present in the target column.
species_column = dataset["Species"]
print(species_column.unique())

# Data Preprocessing

## Finding Missing Data

##### The next step of data preprocessing is to handle missing data in the dataset. However, in this case we find that there are no missing values in the dataset.

In [None]:
# Count the missing entries per column (sum over the boolean NA mask).
na_mask = dataset.isna()
print(na_mask.sum())

## Encoding Categorical Data - Dependent Variable

##### Here, the categorical variable is "Species". Since Machine Learning models are primarily based on mathematical equations, we need to encode the categorical variable and the technique used here is Label Encoding - Converting categorical columns into numerical ones.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Learn the species -> integer mapping first, then apply it to the target vector.
le = LabelEncoder().fit(y)
y = le.transform(y)
print(y)

## Splitting the dataset into the Training set and Test set

##### Splitting the dataset is the next step in data preprocessing in machine learning. Every dataset must be split into two separate sets – training set (we already know the output) and test set (model predicts the output).

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=23)
X_train, X_test, y_train, y_test = split_parts

## Feature Scaling

##### Feature scaling is the final step of data preprocessing, where we bring the independent variables of the dataset onto a common scale. Putting the variables in the same range and on the same scale ensures that no single variable dominates the others. Here, we use the Standardization method for our dataset.

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply the same
# learned statistics to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## PCA

#### Principal Component Analysis is done to reduce the number of dimensions in the training dataset and de-noise the data, thereby employing Feature Extraction.

In [None]:
from sklearn.decomposition import PCA

# Fit the exploratory PCA on the standardized training features. Fitting on the
# raw, unscaled full matrix `X` would let the feature with the largest numeric
# range dominate the components and would also include the test samples, so the
# scree plot would not describe the data the final 2-component PCA is fitted on.
pca = PCA(n_components=4)
temp = pca.fit_transform(X_train)

In [None]:
# Scree plot: explained-variance ratio of each principal component,
# with the components numbered from 1.
PC_values = np.arange(1, pca.n_components_ + 1)
fig, ax = plt.subplots()
ax.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
ax.set_title('Scree Plot')
ax.set_xlabel('Principal Component')
ax.set_ylabel('Variance Explained')
plt.show()

In [None]:
# Fraction of the total variance captured by each fitted component.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

In [None]:
# Keep only two principal components: learn the projection from the
# training features and reuse it unchanged for the test features.
n_kept_components = 2
pca = PCA(n_components=n_kept_components)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier (library defaults) on the
# PCA-projected training data.
classifier = LogisticRegression()
classifier.fit(X=X_train_pca, y=y_train)

In [None]:
# Predict the encoded species label for every PCA-projected test sample.
y_pred = classifier.predict(X=X_test_pca)
print(y_pred)

## Performance Evaluation

##### Accuracy alone can be very misleading because it does not take class imbalance into account, so we also look at other multiclass classification metrics:

1. Cohen’s Kappa score
2. Matthews correlation coefficient
3. Classification report

##### All of the metrics used are associated with confusion matrices in one way or the other.

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, matthews_corrcoef, classification_report

# Confusion matrix of true labels versus predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Scalar agreement metrics, each rounded to three decimals.
scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("Cohen’s Kappa score:", cohen_kappa_score),
    ("Matthew’s correlation coefficient:", matthews_corrcoef),
)
for metric_label, metric_fn in scalar_metrics:
    print(metric_label, round(metric_fn(y_test, y_pred), 3))

# Per-class report, labelled with the species names.
species_names = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
print("Classification report:")
print(classification_report(y_test, y_pred, target_names=species_names))

In [None]:
# Draw the confusion matrix as an annotated heat map on a 10x10 figure.
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, cmap='summer', ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

# Decision regions of the classifier over the PC1/PC2 plane, with the
# training samples drawn on top.
X_set, y_set = X_train_pca, y_train
class_colors = ListedColormap(('red', 'green', 'blue'))

# Dense grid (step 0.01) covering the data range padded by 1 on every side.
pc1_axis = np.arange(start=X_set[:, 0].min() - 1, stop=X_set[:, 0].max() + 1, step=0.01)
pc2_axis = np.arange(start=X_set[:, 1].min() - 1, stop=X_set[:, 1].max() + 1, step=0.01)
X1, X2 = np.meshgrid(pc1_axis, pc2_axis)

# Classify every grid node and fold the predictions back into the grid shape.
grid_points = np.array([X1.ravel(), X2.ravel()]).T
grid_labels = classifier.predict(grid_points).reshape(X1.shape)
plt.contourf(X1, X2, grid_labels, alpha=0.75, cmap=class_colors)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# Overlay the samples, one colour per class.
for color_idx, class_id in enumerate(np.unique(y_set)):
    in_class = y_set == class_id
    plt.scatter(X_set[in_class, 0], X_set[in_class, 1],
                color=class_colors(color_idx), label=class_id)

plt.title('Logistic Regression (Training set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

In [None]:
from matplotlib.colors import ListedColormap

# Same decision-region plot as for the training data, now with the
# held-out test samples drawn on top.
X_set, y_set = X_test_pca, y_test
region_cmap = ListedColormap(('red', 'green', 'blue'))

# Grid limits: the observed range of each component widened by 1 on both ends.
pc1_lo, pc1_hi = X_set[:, 0].min() - 1, X_set[:, 0].max() + 1
pc2_lo, pc2_hi = X_set[:, 1].min() - 1, X_set[:, 1].max() + 1
X1, X2 = np.meshgrid(np.arange(start=pc1_lo, stop=pc1_hi, step=0.01),
                     np.arange(start=pc2_lo, stop=pc2_hi, step=0.01))

# Predict a class for each grid node and colour the plane accordingly.
flat_grid = np.array([X1.ravel(), X2.ravel()]).T
region_labels = classifier.predict(flat_grid).reshape(X1.shape)
plt.contourf(X1, X2, region_labels, alpha=0.75, cmap=region_cmap)
plt.xlim(X1.min(), X1.max())
plt.ylim(X2.min(), X2.max())

# Scatter the samples class by class so each gets its own legend entry.
for position, label_value in enumerate(np.unique(y_set)):
    members = y_set == label_value
    plt.scatter(X_set[members, 0], X_set[members, 1],
                color=region_cmap(position), label=label_value)

plt.title('Logistic Regression (Test set)')
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.legend()
plt.show()

# ANN - Artificial Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

# `keras.utils.np_utils` is no longer importable in newer Keras releases,
# where `to_categorical` lives directly in `keras.utils`. Fall back to binding
# that module under the same name so `np_utils.to_categorical(...)` keeps working.
try:
    from keras.utils import np_utils
except ImportError:
    import keras.utils as np_utils

##### Changing the label to One Hot Vector using One Hot Encoding - Represent categorical variables as numerical values

In [None]:
# One-hot encode both label vectors into three class columns.
y_train, y_test = (
    np_utils.to_categorical(labels, num_classes=3) for labels in (y_train, y_test)
)

In [None]:
# Inspect the encoded training labels.
encoded_train_labels = y_train
print(encoded_train_labels)

In [None]:
# Fully connected classifier: 4 inputs -> 1000 -> 500 -> 300 (ReLU),
# dropout of 0.2, then a 3-unit softmax output layer.
model = Sequential([
    Dense(units=1000, input_dim=4, activation='relu'),
    Dense(units=500, activation='relu'),
    Dense(units=300, activation='relu'),
    Dropout(0.2),
    Dense(units=3, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot targets; accuracy is tracked as a metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture summary of the compiled model.
model.summary()

In [None]:
# Train for 10 epochs in mini-batches of 20; the test split is passed as
# validation data so its loss/accuracy are reported after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=20,
    epochs=10,
    verbose=1,
    validation_data=(X_test, y_test),
)

In [None]:
# Turn the network outputs and the one-hot test labels back into class
# indices by taking the arg-max along the class axis.
class_scores = model.predict(X_test)
y_pred = class_scores.argmax(axis=1)
y_test = y_test.argmax(axis=1)

In [None]:
# Show the predicted class index of every test sample.
predicted_classes = y_pred
print(predicted_classes)

## Performance Evaluation

In [None]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    cohen_kappa_score,
    matthews_corrcoef,
    classification_report,
)

# Confusion matrix of true labels versus the network's predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Each scalar metric is computed, rounded to three decimals, and printed in turn.
accuracy = round(accuracy_score(y_test, y_pred), 3)
print("Accuracy:", accuracy)
kappa = round(cohen_kappa_score(y_test, y_pred), 3)
print("Cohen’s Kappa score:", kappa)
mcc = round(matthews_corrcoef(y_test, y_pred), 3)
print("Matthew’s correlation coefficient:", mcc)

# Per-class report, labelled with the species names.
print("Classification report:")
report_text = classification_report(
    y_test,
    y_pred,
    target_names=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'],
)
print(report_text)

In [None]:
# Annotated heat map of the network's confusion matrix on a 10x10 figure.
heatmap_data = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 10))
heatmap_axes = sns.heatmap(heatmap_data, annot=True, cmap='summer')
heatmap_axes.set_xlabel('Predicted Labels')
heatmap_axes.set_ylabel('True Labels')
plt.show()