# Assignment - Classification

In [22]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

In [23]:
pip install xlrd

Note: you may need to restart the kernel to use updated packages.


In [26]:
# Load the raw measurements from the Excel workbook next to the notebook.
iris = pd.read_excel(io="iris2.xls")

In [27]:
# Bare expression: the notebook renders the DataFrame as this cell's output.
iris

Unnamed: 0,SL,SW,PL,PW,Classification
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [28]:
# Summarise column dtypes and non-null counts; columns with fewer non-null
# entries than rows contain missing values.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              143 non-null    float64
 1   SW              144 non-null    float64
 2   PL              144 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


## Data Preprocessing

In [29]:
# Replace missing measurements with the mean of their column.
# NOTE(review): the imputer is fitted on the full frame, before the
# train/test split performed later in the notebook, so test rows contribute
# to the fill values (mild data leakage). Consider fitting it on the
# training split only.
columns_with_missing_values = ['SL', 'SW', 'PL', 'PW']

imputer = SimpleImputer(strategy='mean')
imputer.fit(iris[columns_with_missing_values])

iris[columns_with_missing_values] = imputer.transform(iris[columns_with_missing_values])

In [30]:
# Re-check the non-null counts after imputation.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   SL              150 non-null    float64
 1   SW              150 non-null    float64
 2   PL              150 non-null    float64
 3   PW              150 non-null    float64
 4   Classification  150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [31]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier


In [33]:
# Separate the target labels from the feature columns.
y = iris['Classification']
X = iris.drop(columns=['Classification'])

## Data Splitting

In [34]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Feature Scaling

In [36]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so the test data does not influence
# the scaler.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Classification Models

## Logistic Regression

In [37]:
# Logistic regression with default hyperparameters: train on the scaled
# training split, then score the predictions for the held-out split.
logistic_model = LogisticRegression().fit(X_train, y_train)
logistic_pred = logistic_model.predict(X_test)
logistic_accuracy = accuracy_score(y_true=y_test, y_pred=logistic_pred)
logistic_confusion = confusion_matrix(y_true=y_test, y_pred=logistic_pred)

In [38]:
# Report the test-set accuracy and confusion matrix for logistic regression.
print("Logistic Regression:")
print("Accuracy:", logistic_accuracy)
print("Confusion Matrix:\n", logistic_confusion)

Logistic Regression:
Accuracy: 0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


## Support Vector Machine

In [39]:
# Support vector classifier with default hyperparameters: train on the
# scaled training split, then score the predictions for the held-out split.
svm_model = SVC().fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_true=y_test, y_pred=svm_pred)
svm_confusion = confusion_matrix(y_true=y_test, y_pred=svm_pred)

In [40]:
# Report the test-set accuracy and confusion matrix for the SVM.
print("Support Vector Machine (SVM):")
print("Accuracy:", svm_accuracy)
print("Confusion Matrix:\n", svm_confusion)

Support Vector Machine (SVM):
Accuracy: 0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


## Decision Tree

In [41]:
# Single decision tree: train on the training split, then score the
# predictions for the held-out split.
# random_state is pinned (same seed as the train/test split) because the
# tree permutes features at each split and breaks ties between equally good
# splits at random; without a seed the fitted tree, and therefore the
# reported metrics, can differ from run to run.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)
decision_tree_pred = decision_tree_model.predict(X_test)
decision_tree_accuracy = accuracy_score(y_test, decision_tree_pred)
decision_tree_confusion = confusion_matrix(y_test, decision_tree_pred)

In [42]:
# Report the test-set accuracy and confusion matrix for the decision tree.
print("Decision Tree:")
print("Accuracy:", decision_tree_accuracy)
print("Confusion Matrix:\n", decision_tree_confusion)

Decision Tree:
Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## Random Forest

In [43]:
# Random forest: train on the training split, then score the predictions
# for the held-out split.
# random_state is pinned (same seed as the train/test split) because the
# forest draws bootstrap samples and random feature subsets; without a seed
# the fitted ensemble, and therefore the reported metrics, can differ from
# run to run.
random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train, y_train)
random_forest_pred = random_forest_model.predict(X_test)
random_forest_accuracy = accuracy_score(y_test, random_forest_pred)
random_forest_confusion = confusion_matrix(y_test, random_forest_pred)

In [44]:
# Report the test-set accuracy and confusion matrix for the random forest.
print("Random Forest:")
print("Accuracy:", random_forest_accuracy)
print("Confusion Matrix:\n", random_forest_confusion)

Random Forest:
Accuracy: 1.0
Confusion Matrix:
 [[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


## K-Nearest Neighbors (K-NN)

In [45]:
# K-nearest-neighbours classifier with default hyperparameters: train on
# the scaled training split, then score the predictions for the held-out
# split.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
knn_pred = knn_model.predict(X_test)
knn_accuracy = accuracy_score(y_true=y_test, y_pred=knn_pred)
knn_confusion = confusion_matrix(y_true=y_test, y_pred=knn_pred)

In [46]:
# Report the test-set accuracy and confusion matrix for K-NN.
print("K-Nearest Neighbors (K-NN):")
print("Accuracy:", knn_accuracy)
print("Confusion Matrix:\n", knn_confusion)

K-Nearest Neighbors (K-NN):
Accuracy: 0.9666666666666667
Confusion Matrix:
 [[10  0  0]
 [ 0  8  1]
 [ 0  0 11]]


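Both the Decision Tree and Random Forest models achieved a perfect accuracy of 1.0 on the test data, while Logistic Regression, SVM, and K-NN each reached about 0.967 (one misclassified sample). Decision Tree and Random Forest therefore give the best accuracy on this split, although with only 30 test samples the gap amounts to a single prediction.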

Since both models achieve perfect accuracy, the Random Forest model is generally preferred because of its ensemble nature: it is less prone to overfitting than a single decision tree and can handle more complex datasets effectively.