In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#scikitlearn importing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

In [5]:
#Importing the models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

#Metric
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_recall_fscore_support, classification_report

#Pretty Table
from prettytable import PrettyTable

In [3]:
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)

dev_df = std_df.loc[:24487,:]
eval_df = std_df.loc[24488:,:]
dev_df.shape, eval_df.shape

((24488, 39), (12245, 39))

In [4]:
dev_df['labels'] = dev_df['CO'].apply(lambda x : 0 if x < 4.5 else 1)
dev_df[['labels', 'CO']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df['labels'] = dev_df['CO'].apply(lambda x : 0 if x < 4.5 else 1)


Unnamed: 0_level_0,labels,CO
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1.0764
1,1,10.196
2,0,0.78407
3,0,1.6153
4,0,1.4071


In [9]:
#Defining the models 
degree = 3
rs = 42 
clf_models = [
    RandomForestClassifier(random_state = rs),
    AdaBoostClassifier(random_state = rs), 
    GaussianNB(),
    BernoulliNB(),
    MLPClassifier(random_state = rs,max_iter = 10000),
    SVC(),
    make_pipeline(
        PolynomialFeatures(degree),
        RandomForestClassifier()
    )
]

names = [
    'rf',
    'ada',
    'gaussian',
    'bernoulli',
    'mlp',
    'svc',
    f'poly{degree}+rf',
]

In [10]:
t = PrettyTable()

result = {}



for model, name in zip(clf_models, names):
    print(model)
    X = dev_df.drop(columns = ['CO','labels'])
    y = dev_df['labels']
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = rs, test_size = 0.25, stratify = y)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    result[name] = (model, classification_report(y_test, y_pred))

RandomForestClassifier(random_state=42)
AdaBoostClassifier(random_state=42)
GaussianNB()
BernoulliNB()
MLPClassifier(max_iter=10000, random_state=42)
SVC()
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('randomforestclassifier', RandomForestClassifier())])


In [13]:
for k,(model, report) in result.items():
    print(k)
    print(report)
    print()

rf
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      5413
           1       0.86      0.78      0.82       709

    accuracy                           0.96      6122
   macro avg       0.92      0.88      0.90      6122
weighted avg       0.96      0.96      0.96      6122


ada
              precision    recall  f1-score   support

           0       0.96      0.97      0.97      5413
           1       0.78      0.71      0.75       709

    accuracy                           0.94      6122
   macro avg       0.87      0.84      0.86      6122
weighted avg       0.94      0.94      0.94      6122


gaussian
              precision    recall  f1-score   support

           0       0.99      0.88      0.93      5413
           1       0.49      0.90      0.64       709

    accuracy                           0.88      6122
   macro avg       0.74      0.89      0.78      6122
weighted avg       0.93      0.88      0.89      6122




In [33]:
rf = regression_models[0]
y_hat = rf.predict(X_test)
precision_recall_fscore_support(y_test, y_hat)

(array([0.97274903, 0.86685962]),
 array([0.98288372, 0.80187416]),
 array([0.97779012, 0.83310153]),
 array([5375,  747]))