In [2]:
import pandas as pd
import matplotlib as plt
import numpy as np
import scipy as sp
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


In [11]:
# --- Dataset locations (relative to the notebook's working directory) ---
irisFilePath = r'./DATASETS/iris/iris.data'
parkinsonsFilePath = r'./DATASETS/parkinsons_telemonitoring/parkinsons_updrs.data'
mushroomFilepath = r'./DATASETS/MushroomDataset/secondary_data.csv'
phiPhhishingFilePath = r'./DATASETS/phiusiil_phishing_url_dataset/PhiUSIIL_Phishing_URL_Dataset.csv'

# --- Column names, applied positionally by read_csv(names=...) ---
# Four measurements followed by the label column.
irisClasses = [
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
    'class',
]

# NOTE(review): names are assigned by position, not matched against the file;
# verify this order and count against the actual layout of the data file.
parkinsonsClasses = [
    "subject#",
    "ID",
    "age",
    "test_time",
    "Jitter(%)",
    "Jitter(Abs)",
    "Jitter:RAP",
    "Jitter:PPQ5",
    "Jitter:DDP",
    "Shimmer",
    "Shimmer(dB)",
    "Shimmer:APQ3",
    "Shimmer:APQ5",
    "Shimmer:APQ11",
    "Shimmer:DDA",
    "NHR",
    "HNR",
    "RPDE",
    "DFA",
    "PPE",
    "motor_UPDRS",
    "total_UPDRS",
    "sex",
]

# NOTE(review): 20 feature names and no label name. If each row carries one
# more field than there are names, pandas uses the leading field as the index
# rather than as a column - confirm that is the intended place for the label.
mushroomClasses = [
    "cap-diameter",
    "cap-shape",
    "cap-surface",
    "cap-color",
    "does-bruise-bleed",
    "gill-attachment",
    "gill-spacing",
    "gill-color",
    "stem-height",
    "stem-width",
    "stem-root",
    "stem-surface",
    "stem-color",
    "veil-type",
    "veil-color",
    "has-ring",
    "ring-type",
    "spore-print-color",
    "habitat",
    "season",
]

# --- Load the four datasets ---
# Malformed rows are dropped silently (on_bad_lines='skip').
# NOTE(review): names= is passed without header=, so any header line present
# in a file would be read as a data row - TODO confirm these files have none.
irisData = pd.read_csv(
    irisFilePath,
    on_bad_lines='skip',
    names=irisClasses,
)
parkinsonsData = pd.read_csv(
    parkinsonsFilePath,
    on_bad_lines='skip',
    names=parkinsonsClasses,
)
# The mushroom file is semicolon-delimited.
mushroomData = pd.read_csv(
    mushroomFilepath,
    sep=';',
    on_bad_lines='skip',
    names=mushroomClasses,
)
# The phishing file supplies its own column names from its first line.
phiPhhishingData = pd.read_csv(phiPhhishingFilePath, on_bad_lines='skip')


In [12]:
# Preview the first five rows of the mushroom frame to sanity-check the load.
mushroomData.head(n=5)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
p,15.26,x,g,o,f,e,,w,16.95,17.09,s,y,w,u,w,t,g,,d,w
p,16.6,x,g,o,f,e,,w,17.99,18.19,s,y,w,u,w,t,g,,d,u
p,14.07,x,g,o,f,e,,w,17.8,17.74,s,y,w,u,w,t,g,,d,w
p,14.17,f,h,e,f,e,,w,15.77,15.98,s,y,w,u,w,t,p,,d,w
p,14.64,x,h,o,f,e,,w,16.53,17.2,s,y,w,u,w,t,p,,d,w


In [8]:
# Show the column labels of the phishing-URL frame.
phiPhhishingData.columns

Index(['FILENAME', 'URL', 'URLLength', 'Domain', 'DomainLength', 'IsDomainIP',
       'TLD', 'URLSimilarityIndex', 'CharContinuationRate',
       'TLDLegitimateProb', 'URLCharProb', 'TLDLength', 'NoOfSubDomain',
       'HasObfuscation', 'NoOfObfuscatedChar', 'ObfuscationRatio',
       'NoOfLettersInURL', 'LetterRatioInURL', 'NoOfDegitsInURL',
       'DegitRatioInURL', 'NoOfEqualsInURL', 'NoOfQMarkInURL',
       'NoOfAmpersandInURL', 'NoOfOtherSpecialCharsInURL',
       'SpacialCharRatioInURL', 'IsHTTPS', 'LineOfCode', 'LargestLineLength',
       'HasTitle', 'Title', 'DomainTitleMatchScore', 'URLTitleMatchScore',
       'HasFavicon', 'Robots', 'IsResponsive', 'NoOfURLRedirect',
       'NoOfSelfRedirect', 'HasDescription', 'NoOfPopup', 'NoOfiFrame',
       'HasExternalFormSubmit', 'HasSocialNet', 'HasSubmitButton',
       'HasHiddenFields', 'HasPasswordField', 'Bank', 'Pay', 'Crypto',
       'HasCopyrightInfo', 'NoOfImage', 'NoOfCSS', 'NoOfJS', 'NoOfSelfRef',
       'NoOfEmptyRef', 'NoOf

# Section One: IRIS Data

In [None]:
# List each iris column label once, in the frame's own column order.
# Iterating a DataFrame yields its column labels; dict.fromkeys de-duplicates
# them while preserving that order. A plain set() would print the labels in an
# order that can change between kernel restarts (string hashing is randomised),
# which makes the cell output non-reproducible.
print(f'''Unique Cols of Iris Data: {list(dict.fromkeys(irisData))}''')

In [None]:
# Every column except the last is a feature; the last column is the label.
X, Y = irisData.iloc[:, :-1].values, irisData.iloc[:, -1].values

# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=36827814,
    test_size=0.4,
)

In [None]:
# Decision tree limited to depth 10, splitting only nodes with >= 4 samples.
irisClassifier = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=4,
)

# trackTimeMemory is defined outside this section; it returns a
# (seconds, megabytes) pair according to the messages printed below.
# NOTE(review): presumably it also fits the classifier on the training split,
# since the next cell calls predict() - confirm against its definition.
timeTaken, memoryUsed = trackTimeMemory(
    irisClassifier, X_train, Y_train, X_test, Y_test
)

print(f"Training Time: {timeTaken:.4f} seconds")
print(f"Memory Usage: {memoryUsed:.4f} MB")

In [None]:
# Score the fitted tree on the held-out split.
Y_pred = irisClassifier.predict(X_test)
Accuracy = accuracy_score(Y_test, Y_pred)
classReport = classification_report(Y_test, Y_pred, output_dict=True)

# Print the report: per-class entries get a "Class:" heading, the summary
# entries get their capitalised name. Dict entries are listed metric by
# metric; a bare number (the overall accuracy) is printed on its own line.
for label, metrics in classReport.items():
    if label in ['accuracy', 'macro avg', 'weighted avg']:
        print(f"{label.capitalize()}:")
        if not isinstance(metrics, dict):
            print(f"  {metrics:.4f}")
            continue
    else:
        print(f"Class: {label}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

In [None]:
# Collect the run's hyper-parameters and measurements into one record.
# The two hyper-parameter values mirror those passed to irisClassifier.
data = dict(
    min_samples_split=4,
    max_depth=10,
    time_taken=timeTaken,
    memory_used=memoryUsed,
    accuracy=Accuracy,
    class_report=classReport,
)

# saveToCSV is defined outside this section; it receives the target file name
# and the record built above.
saveToCSV('irisClassifier.csv', data)

In [None]:
# Same tree settings as the previous run, but with a fixed random_state.
irisSK = DecisionTreeClassifier(
    random_state=36827814,
    max_depth=10,
    min_samples_split=4,
)

# trackTimeMemory is defined outside this section; it returns a
# (seconds, megabytes) pair according to the messages printed below.
# NOTE(review): presumably it also fits the classifier on the training split,
# since predict() is called right after - confirm against its definition.
timeTaken, memoryUsed = trackTimeMemory(
    irisSK, X_train, Y_train, X_test, Y_test
)

print(f"Training Time: {timeTaken:.4f} seconds")
print(f"Memory Usage: {memoryUsed:.4f} MB")

# Score the fitted tree on the held-out split.
Y_pred = irisSK.predict(X_test)
Accuracy = accuracy_score(Y_test, Y_pred)
classReport = classification_report(Y_test, Y_pred, output_dict=True)

# Print the report: per-class entries get a "Class:" heading, the summary
# entries get their capitalised name. Dict entries are listed metric by
# metric; a bare number (the overall accuracy) is printed on its own line.
for label, metrics in classReport.items():
    if label in ['accuracy', 'macro avg', 'weighted avg']:
        print(f"{label.capitalize()}:")
        if not isinstance(metrics, dict):
            print(f"  {metrics:.4f}")
            continue
    else:
        print(f"Class: {label}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

In [None]:
# Collect the run's hyper-parameters and measurements into one record.
# The two hyper-parameter values mirror those passed to irisSK.
data = dict(
    min_samples_split=4,
    max_depth=10,
    time_taken=timeTaken,
    memory_used=memoryUsed,
    accuracy=Accuracy,
    class_report=classReport,
)

# saveToCSV is defined outside this section; it receives the target file name
# and the record built above.
saveToCSV('irisSKLearn.csv', data)