In [21]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn import metrics
import os
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [22]:
def random_forest(X_train, y_train, X_valid, y_valid, X_test, y_test, show_plot=True):
    """Fit a random forest and report its log loss on three data splits.

    A ``RandomForestClassifier`` (100 trees, ``random_state=42``) is fitted on
    the training split, then log loss is computed and printed for the
    training, validation and test splits, in that order.

    Parameters
    ----------
    X_train, y_train
        Features and labels used to fit the classifier.
    X_valid, y_valid
        Validation features and labels (evaluation only).
    X_test, y_test
        Test features and labels (evaluation only).
    show_plot : bool, default True
        Currently unused; accepted so existing callers keep working.

    Returns
    -------
    tuple
        ``(train_log_loss, valid_log_loss, test_log_loss, fitted_classifier)``.
    """
    sig_clf = RandomForestClassifier(n_estimators=100, random_state=42)
    sig_clf.fit(X=X_train, y=y_train)

    def _report_log_loss(split_name, features, targets):
        # predict_proba columns follow sig_clf.classes_; passing them as
        # ``labels`` keeps log_loss aligned with those columns even when a
        # split happens not to contain every class seen during training.
        probabilities = sig_clf.predict_proba(features)
        loss = log_loss(targets, probabilities, labels=sig_clf.classes_)
        print(split_name + " Log Loss:", loss)
        return loss

    train_log_loss = _report_log_loss("Training", X_train, y_train)
    valid_log_loss = _report_log_loss("Validation", X_valid, y_valid)
    test_log_loss = _report_log_loss("Testing", X_test, y_test)

    return train_log_loss, valid_log_loss, test_log_loss, sig_clf

In [23]:
def model_score(X_test, y_test, model):
    """Score ``model`` on a held-out split using its own ``score`` method.

    Parameters
    ----------
    X_test, y_test
        Features and labels forwarded unchanged to ``model.score``.
    model
        Any object exposing ``score(X, y)`` (for scikit-learn classifiers
        this is the mean accuracy).

    Returns
    -------
    Whatever ``model.score(X_test, y_test)`` returns.
    """
    held_out_score = model.score(X_test, y_test)
    return held_out_score

In [24]:
import time

def pred_time(X_test, model_name):
    """Measure how long a single ``model_name.predict(X_test)`` call takes.

    Parameters
    ----------
    X_test
        Input passed unchanged to ``predict``.
    model_name
        Fitted model object exposing a ``predict`` method.

    Returns
    -------
    float
        Elapsed time in seconds; the predictions themselves are discarded.
    """
    # perf_counter is a monotonic, high-resolution clock intended for timing
    # short durations; time.time() is a wall clock that can be coarse and can
    # jump when the system clock is adjusted.
    start_pred = time.perf_counter()
    model_name.predict(X_test)
    return time.perf_counter() - start_pred

In [26]:
# NOTE: absolute, machine-specific locations; adjust for your environment.
data_path = "C:/Users/DilshodbekMX/PycharmProjects/Data"
# os.listdir returns entries in arbitrary order and also lists sub-directories
# and non-CSV files, so keep only CSVs and sort for a reproducible row order.
data_name = sorted(
    entry for entry in os.listdir(data_path) if entry.lower().endswith(".csv")
)
label_df = pd.read_csv("C:/Users/DilshodbekMX/PycharmProjects/MLFoundation/Normalization/MinMaxScaling.csv")
label_column = label_df["Label"]

# One result record (dict) per evaluated dataset.
data_list = []
for data in data_name:
    df = pd.read_csv(os.path.join(data_path, data))
    # File name without its extension identifies the feature-selection method.
    selection_name = os.path.splitext(data)[0]
    print(selection_name)

    if "Label" in df.columns:
        X = df.drop("Label", axis=1)  # Features
        y = df["Label"]
    else:
        # Fall back to the labels of the reference file.
        # NOTE(review): assumes its rows are in the same order as df - confirm.
        X = df
        y = label_column

    # First split keeps 70% for train+validation and 30% for test; the second
    # split takes 0.5/0.7 of that 70%, giving 50% train / 20% validation overall.
    X, X_test, y, y_test = train_test_split(
        X, y, stratify=y, train_size=0.7, random_state=15
    )
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, stratify=y, train_size=0.5 / 0.7, random_state=15
    )

    train_loss, valid_loss, test_loss, model = random_forest(
        X_train, y_train, X_valid, y_valid, X_test, y_test
    )
    score = model_score(X_test, y_test, model)
    speed = pred_time(X_test, model)

    data_dict = {
        "Selection": selection_name,
        "Train Loss": train_loss,
        "Validation Loss": valid_loss,
        "Test Loss": test_loss,
        "Accuracy": score,
        "Speed": speed,
    }
    print(data_dict)
    data_list.append(data_dict)
    

BackwardSelection
Training Log Loss: 0.0002503201071647483
Validation Log Loss: 0.0006786294985726149
Testing Log Loss: 0.0006675403430009606
{'Selection': 'BackwardSelection', 'Train Loss': 0.0002503201071647483, 'Validation Loss': 0.0006786294985726149, 'Test Loss': 0.0006675403430009606, 'Accuracy': 1.0, 'Speed': 0.2711052894592285}
ChiSquaredTest
Training Log Loss: 0.00030585055036420504
Validation Log Loss: 0.000810875467811253
Testing Log Loss: 0.000750188739186443
{'Selection': 'ChiSquaredTest', 'Train Loss': 0.00030585055036420504, 'Validation Loss': 0.000810875467811253, 'Test Loss': 0.000750188739186443, 'Accuracy': 1.0, 'Speed': 0.28826284408569336}
FishersScore
Training Log Loss: 0.0002455824753859319
Validation Log Loss: 0.000661188295946884
Testing Log Loss: 0.0006215407035759834
{'Selection': 'FishersScore', 'Train Loss': 0.0002455824753859319, 'Validation Loss': 0.000661188295946884, 'Test Loss': 0.0006215407035759834, 'Accuracy': 1.0, 'Speed': 0.28333449363708496}
Forw

In [28]:
# Turn the collected per-dataset result dicts into one summary table and show it.
MinMaxBasedSelection = pd.DataFrame(data=data_list)
MinMaxBasedSelection

Unnamed: 0,Selection,Train Loss,Validation Loss,Test Loss,Accuracy,Speed
0,BackwardSelection,0.00025,0.000679,0.000668,1.0,0.271105
1,ChiSquaredTest,0.000306,0.000811,0.00075,1.0,0.288263
2,FishersScore,0.000246,0.000661,0.000622,1.0,0.283334
3,ForwardSelection,0.000229,0.000631,0.000623,1.0,0.281347
4,InformationGain,0.000271,0.000621,0.00063,1.0,0.264895
5,LASSOSelection,0.000185,0.000461,0.00047,1.0,0.265055
6,MeanAbsoluteDifference,0.000231,0.000676,0.000638,1.0,0.303638
7,MissingValueRatio,0.000267,0.000744,0.000727,1.0,0.291878
8,PearsonCorrelation,0.000248,0.000672,0.000666,1.0,0.327628
9,RandomForestImportance,0.000238,0.00067,0.000629,1.0,0.301013


In [29]:
# Order the summary by prediction time ("Speed"), fastest first
# (ascending is the default sort order).
MinMaxBasedSelection = MinMaxBasedSelection.sort_values('Speed')
MinMaxBasedSelection

Unnamed: 0,Selection,Train Loss,Validation Loss,Test Loss,Accuracy,Speed
4,InformationGain,0.000271,0.000621,0.00063,1.0,0.264895
5,LASSOSelection,0.000185,0.000461,0.00047,1.0,0.265055
0,BackwardSelection,0.00025,0.000679,0.000668,1.0,0.271105
3,ForwardSelection,0.000229,0.000631,0.000623,1.0,0.281347
2,FishersScore,0.000246,0.000661,0.000622,1.0,0.283334
10,RecursiveFeatureElimination,0.000231,0.000662,0.00064,1.0,0.287474
1,ChiSquaredTest,0.000306,0.000811,0.00075,1.0,0.288263
7,MissingValueRatio,0.000267,0.000744,0.000727,1.0,0.291878
13,VarianceThreshold,0.000126,0.000341,0.000322,1.0,0.29203
9,RandomForestImportance,0.000238,0.00067,0.000629,1.0,0.301013
