<h1 style="text-align:center">Codify Demonstration</h1>

## Iris Dataset 

<br>

<h3>
Classes Info<br>
    <ul>
    <li>Iris-setosa = 0 </li>
<li>Iris-versicolor = 1 </li>
<li>Iris-virginica = 2 </li>
    </ul>
</h3>

### load csv file

In [10]:
# load csv file
import pandas as pd
df = pd.read_csv("./iris.csv", encoding="utf-8")

### head of dataset

In [11]:
# head of dataset
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0


### display null values in percent

In [13]:
# display null values in percent
from IPython.display import display

def null_columns_percentage(df) -> pd.DataFrame:
    '''
        Prints Null Information of dataframe,i.e. only the number of rows having null values and their null percentage
    '''
    print("\nNull Information of Dataframe: \n")
    null_df = pd.DataFrame(df.isnull().sum()).reset_index()
    null_df.columns = ["column_name", "null_rows"]
    null_df["null_percentage"] = null_df["null_rows"]*100 / df.shape[0]
    null_df = null_df[null_df["null_percentage"] != 0].sort_values(
        "null_percentage", ascending=False).reset_index(drop=True)
    print(
        f"\nThere are total {null_df.shape[0]} columns having null values out of {df.shape[1]} columns in dataframe\n")
    display(null_df)
    return null_df

null_columns_percentage(df)


Null Information of Dataframe: 


There are total 4 columns having null values out of 6 columns in dataframe



Unnamed: 0,column_name,null_rows,null_percentage
0,SepalLengthCm,2,1.333333
1,SepalWidthCm,2,1.333333
2,PetalWidthCm,2,1.333333
3,PetalLengthCm,1,0.666667


Unnamed: 0,column_name,null_rows,null_percentage
0,SepalLengthCm,2,1.333333
1,SepalWidthCm,2,1.333333
2,PetalWidthCm,2,1.333333
3,PetalLengthCm,1,0.666667


### replace null values with their mean values

In [14]:
# replace null values with their mean values
df.fillna(df.mean(), inplace=True)
print('Missing: %d' % df.isnull().sum().sum())

Missing: 0


### dataset statistics

In [15]:
# dataset statistics


def columns_type(df) -> tuple:
    '''
        return number of categorical and numerical columns
    '''
    numerics = ['int16', 'int32', 'int64',
                'float16', 'float32', 'float64']
    num_cols = df.select_dtypes(include=numerics).columns.tolist()
    cat_columns = len(df.select_dtypes(
        include="O").columns.tolist())
    num_columns = len(num_cols)
    return (cat_columns, num_columns)


def null_columns_percentage(df) -> pd.DataFrame:
    '''
    Prints Null Information of dataframe,i.e. only the number of rows having null values and their null percentage
    '''
    print("\nNull Information of Dataframe: \n")
    null_df = pd.DataFrame(df.isnull().sum()).reset_index()
    null_df.columns = ["column_name", "null_rows"]
    null_df["null_percentage"] = null_df["null_rows"] * \
        100 / df.shape[0]
    null_df = null_df[null_df["null_percentage"] != 0].sort_values(
        "null_percentage", ascending=False).reset_index(drop=True)
    print(
        f"\nThere are total {null_df.shape[0]} columns having null values out of {df.shape[1]} columns in dataframe\n")
    display(null_df)
    return null_df


def statistics(df) -> None:
    '''
    Gives number of categorical and numerical columns, description and info regarding the dataframe
    '''
    attribute_types = columns_type(df)
    print(
        f"\nThere are total {attribute_types[0]} categorical and {attribute_types[1]} numerical columns\n")

    print("Description of Data:\n")
    display(df.describe())

    print("Information regarding data: \n")
    display(df.info())

    if df.isnull().sum().sum() > 0:
        null_columns_percentage(df)
    else:
        print(
            "\nCongrats!!, The Dataframe has NO NULL VALUES\n")
        
statistics(df)


There are total 0 categorical and 6 numerical columns

Description of Data:



Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
count,150.0,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.854054,3.056081,3.775168,1.202027,1.0
std,43.445368,0.818279,0.429332,1.752808,0.760825,0.819232
min,1.0,4.4,2.0,1.0,0.1,0.0
25%,38.25,5.1,2.8,1.6,0.3,0.0
50%,75.5,5.8,3.0,4.35,1.3,1.0
75%,112.75,6.4,3.3,5.1,1.8,2.0
max,150.0,7.9,4.4,6.9,2.5,2.0


Information regarding data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    int64  
dtypes: float64(4), int64(2)
memory usage: 7.2 KB


None


Congrats!!, The Dataframe has NO NULL VALUES



In [16]:
X = df.drop("Species", axis=1)
y = df["Species"]

print("Shape of X: ", X.shape)
print("Shape of y: ", y.shape)

Shape of X:  (150, 5)
Shape of y:  (150,)


### Split the dataset into train and test set

In [17]:
# Split the dataset into train and test set

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=41)

print("Shape of X train = ", X_train.shape)
print("Shape of y train = ", y_train.shape)
print("Shape of X test = ", X_test.shape)
print("Shape of y test = ", y_test.shape)


Shape of X train =  (120, 5)
Shape of y train =  (120,)
Shape of X test =  (30, 5)
Shape of y test =  (30,)


### fit all classification models

In [18]:
# fit all classification models

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

model_dict = {1: "Support Vector Machine", 2: "Decision Tree",
              3: "Random Forest", 4: "KNN", 5: "Naive Bayes"}


def model(X_train, y_train, X_test, y_test):

    # Logistic Regression
    #     logistic_classifier = LogisticRegression()
    #     logistic_classifier.fit(X_train,y_train)
    #     logistic_classifier_score = logistic_classifier.score(X_test, y_test)
    #     print("Logistic Regression Accuracy: ", logistic_classifier_score * 100)

    # Support Vector Classification
    sv_classifier = SVC(kernel="rbf")
    sv_classifier.fit(X_train, y_train)
    sv_classifier_score = sv_classifier.score(X_test, y_test)
    print("Support Vector Machine Accuracy: ", sv_classifier_score * 100)

    # Decision Tree
    dt_classifier = DecisionTreeClassifier(criterion="gini")
    dt_classifier.fit(X_train, y_train)
    dt_classifier_score = dt_classifier.score(X_test, y_test)
    print("Decision Tree Accuracy: ", dt_classifier_score * 100)

    # Random Forest
    rf_classifier = RandomForestClassifier(n_estimators=100, criterion="gini")
    rf_classifier.fit(X_train, y_train)
    rf_classifier_score = rf_classifier.score(X_test, y_test)
    print("Random Forest Accuracy: ", rf_classifier_score * 100)

    # KNN
    knn_classifier = KNeighborsClassifier(n_neighbors=5)
    knn_classifier.fit(X_train, y_train)
    knn_classifier_score = knn_classifier.score(X_test, y_test)
    print("KNN Accuracy: ", knn_classifier_score * 100)

    # Naive Bayes
    naive_classifier = GaussianNB()
    naive_classifier.fit(X_train, y_train)
    naive_classifier_score = naive_classifier.score(X_test, y_test)
    print("Naive Bayes Accuracy: ", naive_classifier_score * 100)

    return sv_classifier, dt_classifier, rf_classifier, knn_classifier, naive_classifier


models = model(X_train, y_train, X_test, y_test)


Support Vector Machine Accuracy:  96.66666666666667
Decision Tree Accuracy:  96.66666666666667
Random Forest Accuracy:  100.0
KNN Accuracy:  100.0
Naive Bayes Accuracy:  96.66666666666667


### evaluate classification models

In [19]:
# evaluate classification 

from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

i = 0
for key, value in model_dict.items():
    print(f"{value} Model ")
    print(f"\nClassification Report: \n{classification_report(y_test, models[i].predict(X_test))}")
    print(f"\n{value} model accuracy: {round(accuracy_score(y_test, models[i].predict(X_test)), 2)}")
    print(f'\n{"-"*55}\n')
    i+=1

Support Vector Machine Model 

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.92      1.00      0.96        11
           2       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30


Support Vector Machine model accuracy: 0.97

-------------------------------------------------------

Decision Tree Model 

Classification Report: 
              precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.92      1.00      0.96        11
           2       1.00      1.00      1.00        10

    accuracy                           0.97        30
   macro avg       0.97      0.96      0.97        30
weighted avg       0.97      0.97      0.97        30


Decision Tree model accuracy: 0.9

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

i = 0
for key, value in model_dict.items():
print(f"{value} Model ")
print(f"
Classification Report:
{classification_report(y_test, models[i].predict(X_test))}")
print(f"
{value} model accuracy: {round(accuracy_score(y_test, models[i].predict(X_test)), 2)}")
print(f'
{"-"*55}
')
i+=1

IndentationError: expected an indented block (Temp/ipykernel_15396/4115833372.py, line 6)