In [1]:
%load_ext autoreload
%autoreload 2

In [42]:
import pandas as pd
# import lightgbm as lgb
# import xgboost as xgb
# from catboost import CatBoostClassifier, Pool
from metrics import  accuracy
from experiments import profit_scoring
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.metrics import balanced_accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, SelectFromModel, RFE, VarianceThreshold, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import CountVectorizer
from experiments import run_grid_search_cv, apply_transform_to_res

# from catboost import CatBoostClassifier, Pool
import matplotlib.pyplot as plt

from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning

# Install "ignore" filters for the warning categories that would otherwise
# flood the output of the grid-search cells below.
for _ignored_category in (ConvergenceWarning, RuntimeWarning, UserWarning):
    simplefilter("ignore", category=_ignored_category)

# Custom Grid Search

A custom grid search is needed because of the custom scorer.

In [28]:
cv = 5

# Candidate classifiers. Every entry names an estimator class under "model"
# and lists the parameter dicts to try under "params" (presumably passed to
# the constructor by run_grid_search_cv -- confirm in experiments.py).
model_grid_params = [
    {"model": RandomForestClassifier, "params": [{}]},
    {"model": DecisionTreeClassifier, "params": [{}]},
    {"model": LogisticRegression, "params": [{}]},
    {"model": LinearDiscriminantAnalysis, "params": [{}]},
    {"model": QuadraticDiscriminantAnalysis, "params": [{}]},
    # A linear kernel ({"kernel": "linear", "C": 0.025}) was tried and dropped.
    {"model": SVC, "params": [{"kernel": "rbf", "probability": True}]},
]

# Candidate feature selectors, in the same {"selector", "params"} layout.
feature_selector_grid_params = [
    {
        "selector": SelectKBest,
        # Every k is combined with both univariate scoring functions.
        "params": [
            {"k": n_features, "score_func": scorer}
            for n_features in [3, 5, 10, 20, 30]
            for scorer in [f_classif, mutual_info_classif]
        ],
    },
    {
        "selector": SelectFromModel,
        # The two estimator instances are created once and shared by all of
        # their max_features variants.
        # An L1 variant, LogisticRegression(penalty="l1", solver="saga"), was
        # considered but is not part of the grid.
        "params": [
            {"estimator": base_estimator, "max_features": max_features}
            for base_estimator in [LogisticRegression(), RandomForestClassifier()]
            for max_features in [None, 3, 5, 10, 20, 30]
        ],
    },
    # Excluded selectors:
    # - VarianceThreshold (thresholds 0.0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1):
    #   ineffective.
    # - RFE (LogisticRegression / RandomForestClassifier estimators,
    #   n_features_to_select in None, 3, 5): too computationally complex.
]


# Load Data

In [29]:
# Seed constant; it is not referenced in the cells visible here.
RANDOM_STATE = 42

x_train_path = "./../../data/x_train.txt"
y_train_path = "./../../data/y_train.txt"

x_test_path = "./../../data/x_test.txt"


# The data files are whitespace-delimited with no header row.
# `sep=r"\s+"` is the documented equivalent of `delim_whitespace=True`, which
# is deprecated since pandas 2.2 and emits a FutureWarning there.
X_train = pd.read_csv(x_train_path, sep=r"\s+", header=None)
y_train = pd.read_csv(y_train_path, sep=r"\s+", header=None)
X_test = pd.read_csv(x_test_path, sep=r"\s+", header=None)

In [30]:
# Preview the first five rows of the training features.
X_train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,-2.619773,-2.619533,-1.19935,-1.083335,-1.00091,-0.366967,-2.164037,-1.210001,-0.658311,-1.489539,...,10.849925,10.343346,10.717519,7.709295,5.894554,12.416573,6.765269,16.243907,7.209524,8.082021
1,-1.415579,-1.782544,-2.88027,-1.958863,1.159968,0.27303,-1.628728,-0.175813,-0.916857,-0.570166,...,11.489417,5.195818,3.494627,5.529154,10.517576,15.697333,11.324938,12.18767,12.283861,5.032285
2,-2.745092,-1.382945,-1.626015,-1.28256,-0.663146,0.052349,-2.403322,-0.765073,-0.394354,-0.806624,...,13.934934,9.267515,4.705604,6.642557,14.658934,8.130767,7.194487,11.939354,11.65362,5.942778
3,0.618998,0.455364,-0.115081,0.64904,-0.862207,2.308504,0.526114,-1.094852,1.088656,-0.48121,...,12.021328,3.852231,11.059702,7.527268,7.25312,9.791136,6.089743,10.752796,5.778888,10.366363
4,-0.070694,-0.550509,-0.565556,-0.693065,-0.573089,-0.395862,0.00317,-0.981609,-0.505775,-0.75843,...,7.537788,11.229665,11.318915,6.622256,12.557882,5.52036,5.397359,13.152269,10.684779,9.816471


In [31]:
# Preview the first five training labels.
y_train.head(n=5)

Unnamed: 0,0
0,0
1,0
2,1
3,1
4,1


In [32]:
# Dimensions of the label frame as (n_rows, n_columns).
y_train.shape

(5000, 1)

In [33]:
# Count the rows labelled 1 to check the class balance.
y_train.eq(1).sum()

0    2496
dtype: int64

In [34]:
# Dimensions of the test feature frame as (n_rows, n_columns).
X_test.shape

(5000, 500)

In [35]:
# The full data set is used as loaded (no subsampling). Instead of re-binding
# each frame to itself, verify that the frames are mutually consistent before
# the expensive grid-search cells run.
assert len(X_train) == len(y_train), "X_train and y_train must have the same number of rows"
assert X_train.shape[1] == X_test.shape[1], "X_train and X_test must have the same number of features"

## Baseline

In [None]:
# Baseline grid: SelectKBest keeping either every feature or the top 10,
# crossed with a DummyClassifier and a default RandomForestClassifier.
baseline_selector_grid_search = [
    {
        "selector": SelectKBest,
        "params": [{"k": "all"}, {"k": 10}],
    },
]
baseline_models_grid_search = [
    {"model": DummyClassifier, "params": [{}]},
    {"model": RandomForestClassifier, "params": [{}]},
]

# Score every selector/model combination with the custom profit scorer.
baseline_results_dict = run_grid_search_cv(
    baseline_selector_grid_search, baseline_models_grid_search, X_train, y_train, profit_scoring
)

# Reduce the raw results with np.mean, then transpose the resulting frame
# before styling it.
baseline_results = apply_transform_to_res(baseline_results_dict, np.mean)
baseline_results_df = pd.DataFrame(baseline_results).transpose()
baseline_results_df.style.background_gradient(axis=None, cmap="Greens", low=0.0, high=1.0)

Baselines are a critical part of every project. Because the scoring metric is non-standard, several baseline variants were prepared, both with and without a feature-selection step.

We see that DummyClassifier performs poorly both when all features are kept and when they are reduced to 10.

On the other hand, RandomForest reaches 60% accuracy when using all features and drops to ~51% when using just 10.

The scoring metric needs to be adjusted to incorporate the monetary counterpart.

## Experiments (no feature scaling)

In [37]:
# Full experiment on the raw (unscaled) features: every feature selector in
# the grid is combined with every model and scored with profit_scoring.
results_dict = run_grid_search_cv(
    feature_selector_grid_params,
    model_grid_params,
    X_train,
    y_train,
    profit_scoring,
)

Running on: RandomForestClassifier() and SelectKBest(k=3)
Running on: DecisionTreeClassifier() and SelectKBest(k=3)
Running on: LogisticRegression() and SelectKBest(k=3)
Running on: LinearDiscriminantAnalysis() and SelectKBest(k=3)
Running on: QuadraticDiscriminantAnalysis() and SelectKBest(k=3)
Running on: SVC(probability=True) and SelectKBest(k=3)
Running on: RandomForestClassifier() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)
Running on: DecisionTreeClassifier() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)
Running on: LogisticRegression() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)
Running on: LinearDiscriminantAnalysis() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)
Running on: QuadraticDiscriminantAnalysis() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)
Running on: SVC(probability=True) and SelectKBest(k=3, score_

In [38]:
# Reduce the raw results with np.mean and lay them out with one row per
# feature selector and one column per model.
avg_results_dict = apply_transform_to_res(results_dict, np.mean)
avg_df = pd.DataFrame(avg_results_dict).transpose()

# Short column labels; their order has to match the order of model_grid_params.
avg_df.columns = ["RF", "DT", "LogReg", "LDA", "QDA", "SVC-rbf"]
avg_df.style.background_gradient(axis=None, cmap="Greens", low=0.0, high=1.0)

Unnamed: 0,RF,DT,LogReg,LDA,QDA,SVC-rbf
SelectKBest(k=3),878.0,860.0,900.0,902.0,902.0,896.0
"SelectKBest(k=3, score_func=)",948.0,880.0,928.0,928.0,1102.0,822.0
SelectKBest(k=5),806.0,804.0,812.0,810.0,828.0,846.0
"SelectKBest(k=5, score_func=)",950.0,786.0,828.0,824.0,1038.0,762.0
SelectKBest(),670.0,572.0,630.0,626.0,680.0,652.0
SelectKBest(score_func=),856.0,610.0,650.0,650.0,978.0,808.0
SelectKBest(k=20),242.0,198.0,202.0,200.0,258.0,250.0
"SelectKBest(k=20, score_func=)",496.0,212.0,268.0,266.0,564.0,370.0
SelectKBest(k=30),-168.0,-230.0,-178.0,-180.0,-162.0,-150.0
"SelectKBest(k=30, score_func=)",102.0,-174.0,-116.0,-114.0,142.0,-90.0


In [39]:
# Reduce the raw results with np.std to see how stable each selector/model
# combination is; same layout as the table of means.
std_results_dict = apply_transform_to_res(results_dict, np.std)
std_df = pd.DataFrame(std_results_dict).transpose()

# Short column labels; their order has to match the order of model_grid_params.
std_df.columns = ["RF", "DT", "LogReg", "LDA", "QDA", "SVC-rbf"]
std_df.style.background_gradient(axis=None, cmap="Reds")

Unnamed: 0,RF,DT,LogReg,LDA,QDA,SVC-rbf
SelectKBest(k=3),76.524506,48.989795,58.651513,53.065997,45.343136,34.985711
"SelectKBest(k=3, score_func=)",84.947042,32.863353,96.829747,96.829747,121.720992,483.255626
SelectKBest(k=5),35.552778,78.383672,27.12932,28.284271,34.292856,31.368774
"SelectKBest(k=5, score_func=)",73.212021,62.801274,82.316463,84.047606,124.963995,490.322343
SelectKBest(),50.990195,38.678159,56.568542,56.780278,44.271887,58.103356
SelectKBest(score_func=),89.576783,81.731267,41.952354,43.817805,99.075729,127.49902
SelectKBest(k=20),70.540768,43.081318,43.081318,43.358967,84.947042,30.983867
"SelectKBest(k=20, score_func=)",71.721684,60.464866,30.594117,32.0,99.719607,70.142712
SelectKBest(k=30),111.96428,66.932802,29.257478,35.213634,91.08238,28.284271
"SelectKBest(k=30, score_func=)",45.343136,72.828566,32.0,31.368774,74.404301,106.395489


## Experiments (StandardScaling)

In [None]:
# Same experiment grid as the unscaled run, but with a StandardScaler handed
# to the grid search via its `scaler` argument.
results_with_scaling_dict = run_grid_search_cv(
    feature_selector_grid_params,
    model_grid_params,
    X_train,
    y_train,
    profit_scoring,
    scaler=StandardScaler(),
)

Running on: RandomForestClassifier() and SelectKBest(k=3)
Running on: DecisionTreeClassifier() and SelectKBest(k=3)
Running on: LogisticRegression() and SelectKBest(k=3)
Running on: LinearDiscriminantAnalysis() and SelectKBest(k=3)
Running on: QuadraticDiscriminantAnalysis() and SelectKBest(k=3)
Running on: SVC(probability=True) and SelectKBest(k=3)
Running on: RandomForestClassifier() and SelectKBest(k=3, score_func=<function mutual_info_classif at 0x12f03c1f0>)


In [None]:
# Reduce the scaled-run results with np.mean; one row per feature selector
# and one column per model.
avg_results_with_scaling_dict = apply_transform_to_res(results_with_scaling_dict, np.mean)
avg_with_scaling_df = pd.DataFrame(avg_results_with_scaling_dict).transpose()

# Short column labels; their order has to match the order of model_grid_params.
avg_with_scaling_df.columns = ["RF", "DT", "LogReg", "LDA", "QDA", "SVC-rbf"]
avg_with_scaling_df.style.background_gradient(axis=None, cmap="Greens", low=0.0, high=1.0)

## Results Interpretation

### Best models

In [72]:
# Best mean score reached by each model across all feature selectors.
avg_df.max(axis=0)

RF         0.6884
DT         0.5932
LogReg     0.5196
LDA        0.5186
QDA        0.7404
SVC-rbf    0.7170
dtype: float64

In [73]:
# Show the feature-selector row(s) where QDA reaches its best mean score.
best_qda_score = avg_df["QDA"].max()
avg_df.loc[avg_df["QDA"] == best_qda_score]

Unnamed: 0,RF,DT,LogReg,LDA,QDA,SVC-rbf
"SelectFromModel(estimator=RandomForestClassifier(), max_features=10)",0.6884,0.5932,0.508,0.5054,0.7404,0.717


### Predictions generation

In [43]:
# todo: code that generates the predictions