In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#scikitlearn importing
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

%matplotlib inline

plt.rcParams["figure.figsize"] = (10, 6)

In [32]:
#Importing the models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

#Metric
from sklearn.metrics import f1_score as f1
from sklearn.metrics import precision_recall_fscore_support

#Pretty Table
from prettytable import PrettyTable

In [18]:
std_df = pd.read_csv('processed_dataset/std_dataset.csv', index_col = 0)

dev_df = std_df.loc[:24487,:]
eval_df = std_df.loc[24488:,:]
dev_df.shape, eval_df.shape

((24488, 39), (12245, 39))

In [19]:
dev_df['labels'] = dev_df['CO'].apply(lambda x : 0 if x < 4.5 else 1)
dev_df[['labels', 'CO']].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df['labels'] = dev_df['CO'].apply(lambda x : 0 if x < 4.5 else 1)


Unnamed: 0_level_0,labels,CO
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0,1.0764
1,1,10.196
2,0,0.78407
3,0,1.6153
4,0,1.4071


In [28]:
#Defining the models 
degree = 3
rs = 42 
clf_models = [
    RandomForestClassifier(random_state = rs),
    AdaBoostClassifier(random_state = rs), 
    GaussianNB(),
    BernoulliNB(),
    MLPClassifier(random_state = rs,max_iter = 10000),
    SVC(),
    make_pipeline(
        PolynomialFeatures(degree),
        RandomForestClassifier()
    ),
    make_pipeline(
        PolynomialFeatures(degree),
        MLPClassifier()
    ),
    make_pipeline(
        PolynomialFeatures(degree),
        BernoulliNB()
    )
]

names = [
    'rf',
    'ada',
    'gaussian',
    'bernoulli',
    'mlp',
    'svc',
    f'poly{degree}+rf',
    f'poly{degree}+mlp',
    f'poly{degree}+bn',
]

In [29]:
t = PrettyTable()



t.field_names = ['model', 'MSE']

for model, name in zip(clf_models, names):
    print(model)
    X = dev_df.drop(columns = ['CO','labels'])
    y = dev_df['labels']
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = rs, test_size = 0.25)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    t.add_row([name, f1(y_test, y_pred)])

RandomForestClassifier(random_state=42)
AdaBoostClassifier(random_state=42)
GaussianNB()
BernoulliNB()
MLPClassifier(max_iter=10000, random_state=42)
SVC()
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('randomforestclassifier', RandomForestClassifier())])
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('mlpclassifier', MLPClassifier())])
Pipeline(steps=[('polynomialfeatures', PolynomialFeatures(degree=3)),
                ('bernoullinb', BernoulliNB())])


In [30]:
print(t)

+-----------+--------------------+
|   model   |        MSE         |
+-----------+--------------------+
|     rf    | 0.8331015299026425 |
|    ada    | 0.7815771109560362 |
|  gaussian | 0.6399253731343283 |
| bernoulli | 0.5194305502116199 |
|    mlp    | 0.7793103448275862 |
|    svc    | 0.7874015748031498 |
|  poly3+rf | 0.827538247566064  |
| poly3+mlp | 0.7788331071913162 |
|  poly3+bn | 0.5159817351598173 |
+-----------+--------------------+


In [33]:
rf = regression_models[0]
y_hat = rf.predict(X_test)
precision_recall_fscore_support(y_test, y_hat)

(array([0.97274903, 0.86685962]),
 array([0.98288372, 0.80187416]),
 array([0.97779012, 0.83310153]),
 array([5375,  747]))