In [1]:
import re
import time
import datetime

import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from models import *

import os
import sys
import warnings
# Silence every warning category for the rest of this kernel session.
# NOTE(review): this also hides warnings emitted by the libraries used during
# preprocessing and model fitting below, so real problems may go unnoticed.
warnings.filterwarnings("ignore")
# sys.warnoptions lists the -W options passed on the command line; only
# install the blanket filter when the user did not ask for specific ones.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
    os.environ["PYTHONWARNINGS"] = "ignore" # Also affects subprocesses
 
from dataset import engine, RFM
from models import CLFSwitcher

In [2]:
# Instantiate RFM with the `engine` imported from the project's `dataset` module.
# `rfm` is reused by later cells for the dataset and its column lists.
rfm = RFM(engine)
# Trailing expression of the cell: the notebook renders whatever
# get_dataset() returns (the table shown in this cell's output).
rfm.get_dataset()
# rfm.get_cat_cols()
# rfm.get_num_cols()

Unnamed: 0,age,gender,state,per_capita_income,yearly_income,total_debt,fico_score,num_credit_cards,day,month,year,segment
0,76,Male,NC,17850.0,21867.0,21103.0,759.0,2,14,8,2003,Loyal Customers
1,32,Male,OH,12101.0,24668.0,22338.0,756.0,1,25,5,2018,About to Sleep
2,43,Female,SC,24314.0,49577.0,142314.0,694.0,3,5,9,2003,Champions
3,50,Male,IL,22578.0,46039.0,79738.0,672.0,4,22,6,2013,Need Attention
4,52,Male,MI,18487.0,37686.0,41173.0,739.0,4,30,3,2017,Champions
...,...,...,...,...,...,...,...,...,...,...,...,...
1562,29,Male,NC,16642.0,33933.0,5329.0,698.0,6,23,10,2013,Hibernating
1563,37,Male,MI,32423.0,66111.0,98102.0,771.0,5,22,12,2009,Hibernating
1564,37,Male,CA,21402.0,43638.0,104052.0,627.0,1,18,8,2011,Promising
1565,31,Male,TX,15737.0,32086.0,53595.0,747.0,4,26,8,2009,Hibernating


In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, OneHotEncoder, MinMaxScaler


# Preprocessing: min-max scale the numeric columns and ordinal-encode the
# categorical ones. The step order (numeric first, categorical second) fixes
# the column order of the transformed output.
# NOTE(review): the transformer is fitted on the full dataset here, before the
# cross-validated search further down ever splits it - confirm that is intended.
preprocessing_steps = [
	("num_preprocess", MinMaxScaler(), rfm.get_num_cols()),
	("cat_preprocess", OrdinalEncoder(), rfm.get_cat_cols()),
]
ct = ColumnTransformer(preprocessing_steps)

# `df` holds the transformed output of the column transformer.
df = ct.fit_transform(rfm.get_dataset())

def ct_transform(ct, df):
	"""Label transformed data with the dataset's column names.

	Args:
		ct: Accepted for symmetry with ``ct_inverse``; not used here.
		df: Data accepted by ``pd.DataFrame`` whose columns are ordered
			numeric-first, categorical-second.

	Returns:
		A DataFrame whose columns are the numeric column names followed by
		the categorical column names, both read from the module-level ``rfm``.
	"""
	categorical_names = [*rfm.get_cat_cols()]
	numeric_names = [*rfm.get_num_cols()]
	return pd.DataFrame(df, columns=[*numeric_names, *categorical_names])

def ct_inverse(ct, df):
	"""Map transformed data back to its original scale and labels.

	Args:
		ct: Fitted column transformer exposing ``named_transformers_`` with
			the ``'num_preprocess'`` and ``'cat_preprocess'`` steps.
		df: Data accepted by ``pd.DataFrame`` whose columns are ordered
			numeric-first, categorical-second.

	Returns:
		A DataFrame with the numeric and categorical column groups each
		passed through the matching step's ``inverse_transform``.
	"""
	categorical_names = list(rfm.get_cat_cols())
	numeric_names = list(rfm.get_num_cols())

	restored = pd.DataFrame(df, columns=numeric_names + categorical_names)
	fitted_steps = ct.named_transformers_
	column_groups = (
		('num_preprocess', numeric_names),
		('cat_preprocess', categorical_names),
	)
	# Numeric columns are restored first, then categorical ones.
	for step_name, names in column_groups:
		restored[names] = fitted_steps[step_name].inverse_transform(restored[names])
	return restored
# ct_inverse(ct, df)
transformed = ct_transform(ct, df)
# transformed

# Split the encoded frame into features (everything except the target column)
# and the target itself.
# NOTE(review): `y` is kept as a single-column DataFrame because later cells
# consume it as-is; a 1-D Series (`transformed[target_col]`) is the more
# conventional target shape - confirm before changing.
target_col = 'segment'
X = transformed.drop(columns=[target_col])
y = transformed[[target_col]]

# SMOTE target sizes: every class is oversampled up to the size of the largest
# one. The counts are computed once (the original recomputed value_counts()
# for each dict entry) and without `numeric_only`, which Series reductions in
# pandas 1.x reject.
segment_counts = y[target_col].value_counts()
majority_size = int(segment_counts.max())
classes = {label: majority_size for label in segment_counts.index}

In [11]:
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KDTree
# from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier

from sklearn.model_selection import GridSearchCV

from imblearn.over_sampling  import SMOTE
from imblearn.pipeline import Pipeline


# One seed shared by every stochastic component (SMOTE resampling and the
# classifiers) so that re-running the cell reproduces the same search results.
# Previously SMOTE was left unseeded while the classifiers were seeded.
SEED = 123

# `Pipeline` is imblearn's (imported in this cell), used so that the SMOTE
# sampler can be a pipeline step. `classes` (built in an earlier cell) maps
# each label to the sample count SMOTE should reach for it.
pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy=classes, random_state=SEED)),
    ('clf', CLFSwitcher()),
])

# Kept as an array so more seeds can be added to the grid later.
randomness = np.arange(SEED, SEED + 1)

# Each dict is one independent sub-grid; `clf__estimator` swaps the model that
# CLFSwitcher wraps. Disabled sub-grids are kept for reference.
parameters = [
    # {
    #     'clf__estimator__random_state': randomness,
    #     'clf__estimator': [SGDClassifier()], # SVM if hinge loss / logreg if log loss
    #     'clf__estimator__penalty': ('l2', 'elasticnet', 'l1'),
    #     'clf__estimator__max_iter': [int(i) for i in [1e+03]],
    #     'clf__estimator__tol': [1e-05],
    #     # 'clf__estimator__alpha': [1e-00, 1e-02, 1e-04],
    #     'clf__estimator__loss': ['hinge', 'log_loss','perceptron', 'squared_hinge', 'modified_huber'],
    # },
    {
        'clf__estimator__random_state': randomness,
        'clf__estimator': [XGBRFClassifier(), XGBClassifier()],
        'clf__estimator__max_depth': [5],
        # NOTE(review): 1e-05 is a very small learning rate for 1000 boosting
        # rounds - confirm it is intended rather than a placeholder.
        'clf__estimator__learning_rate': [1e-05],
        'clf__estimator__n_estimators': [int(i) for i in [1e+03]],
    },
    # {
    #     'clf__estimator__random_state': randomness,
    #     'clf__estimator': [MultinomialNB()],
    #     'clf__estimator__alpha': (1e-2, 1e-3, 1e-1),
    # },
    # {
    #     # 'clf__estimator__random_state': randomness,
    #     'clf__estimator': [OneVsRestClassifier(SGDClassifier())],
    #     'clf__estimator__estimator__penalty': ('l2', 'elasticnet', 'l1'),
    #     'clf__estimator__estimator__max_iter': [int(i) for i in [1e+03]],
    #     'clf__estimator__estimator__tol': [1e-05],
    #     # 'clf__estimator__alpha': [1e-00, 1e-02, 1e-04],
    #     'clf__estimator__estimator__loss': ['hinge', 'log_loss','perceptron', 'squared_hinge', 'modified_huber'],
    # }
]


# 5-fold cross-validated search over the active sub-grids, run on 12 workers.
grid_search = GridSearchCV(
    pipeline,
    parameters,
    cv=5,
    n_jobs=12,
    return_train_score=True,
    verbose=0,
)
grid_search.fit(X, y)
grid_search

In [13]:
# Cross-validated score of the best parameter combination found by the search.
print(grid_search.best_score_)

# The last pipeline step is the fitted 'clf' wrapper.
best_estimator = grid_search.best_estimator_.steps[-1][1]

# NOTE(review): X, y are the same data the search was fitted on, so this
# report is presumably an in-sample evaluation - confirm against class_report.
report = best_estimator.class_report(X, y)
print(report)

importances = best_estimator.feature_importance()
print(importances)

0.2291731954986671
              precision    recall  f1-score   support

         0.0       0.26      0.73      0.38        83
         1.0       0.60      0.38      0.46       204
         2.0       0.27      0.63      0.38        57
         3.0       0.50      0.43      0.46       171
         4.0       0.62      0.54      0.57       366
         5.0       0.53      0.42      0.47       308
         6.0       0.45      0.38      0.41        79
         7.0       0.71      0.90      0.79        51
         8.0       0.56      0.37      0.44       208
         9.0       0.38      0.93      0.54        40

    accuracy                           0.49      1567
   macro avg       0.49      0.57      0.49      1567
weighted avg       0.54      0.49      0.49      1567

             features  significance
8                year      0.335067
0                 age      0.109720
7               month      0.104768
5    num_credit_cards      0.089921
10              state      0.065688
9     

In [None]:
# Scratch check: the class name reported for a wrapped estimator is the
# wrapper's name, not the inner estimator's.
type(OneVsRestClassifier(SGDClassifier())).__name__

'OneVsRestClassifier'