In [2]:
# General
import pandas as pd
import numpy as np

# Decision tree, imputers
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Decision tree plotting
from sklearn import tree # for text representation
# from matplotlib import pyplot as plt
import matplotlib.pyplot as plt

# RandomForest model
from sklearn.ensemble import RandomForestClassifier
import seaborn as sns

# XGBoost model
import xgboost as xgb
from xgboost import XGBClassifier

# Custom functions import
import seg_functions as seg


#### Data Reading 

In [3]:
# Load the two SAS extracts ("st" and "mw" files) into raw DataFrames.
# The files are read in the same order as the targets on the left-hand side.
data_read_st, data_read_mw = (
    pd.read_sas(sas_path)
    for sas_path in ("data_all_perf_st.sas7bdat", "data_all_perf_mw.sas7bdat")
)

#### Data Preparation

In [4]:
# Remove columns that are not needed (the original product variables).
product_cols = ["C_100100", "C_100110", "C_100120", "C_100140", "C_100160"]
data_mw = data_read_mw.drop(columns=product_cols)
data_st = data_read_st.drop(columns=product_cols)

In [5]:
# Drop the CBSMessageId and Exclusion columns from both datasets.
# Note: this rebinds data_st / data_mw, so re-running the cell without
# re-running the previous one raises KeyError (columns already gone).
id_and_exclusion_cols = ["CBSMessageId", "Exclusion"]
data_st = data_st.drop(columns=id_and_exclusion_cols)
data_mw = data_mw.drop(columns=id_and_exclusion_cols)

##### Bytes to strings

In [6]:
# Bytes -> strings conversion: columns to decode and the element-wise decoder.
columns_to_convert = [
    "C_600570",
    "C_660570",
    "C_500300",
    "C_500600",
    "C_501200",
    "C_502400",
    "performance",
]


def convert_bytes_to_str(x):
    """Return ``x`` decoded as UTF-8 if it is a ``bytes`` object, else ``x`` unchanged.

    Non-bytes values (str, numbers, NaN, None, ...) are passed through as-is,
    so the function is safe to apply element-wise to mixed columns.
    """
    if not isinstance(x, bytes):
        return x
    return x.decode('utf-8')
# Decode the selected columns element-wise, writing the result back into
# each dataset (data_st first, then data_mw).
for frame in (data_st, data_mw):
    frame[columns_to_convert] = frame[columns_to_convert].map(convert_bytes_to_str)

##### Divide data - full dataset and segmentation dataset

In [7]:
# Keep independent snapshots of the complete datasets before filtering.
data_mw_full = data_mw.copy()
data_st_full = data_st.copy()

# The segmentation datasets keep only rows whose performance is "B" or "G".
# .copy() makes each filtered frame an independent object: later cells assign
# columns into data_mw / data_st (e.g. the NaN filling), and assigning into a
# filtered selection without a copy triggers pandas' SettingWithCopyWarning.
data_mw = data_mw.loc[data_mw["performance"].isin(["B", "G"])].copy()
data_st = data_st.loc[data_st["performance"].isin(["B", "G"])].copy()

##### Fill NaNs

In [10]:
# Performance variables (categorical).
# Filling their missing values with "O" is currently disabled:
cat_perf_cols = ["C_500300", "C_501200", "C_500600", "C_502400"]
#for variable in cat_perf_cols:
#    data_st[variable] = data_st[variable].fillna('O')
#    data_mw[variable] = data_mw[variable].fillna('O')

# Variable C_600570 - missing to '00000' (currently disabled):
#data_st["C_600570"] = data_st["C_600570"].fillna('00000')
#data_mw["C_600570"] = data_mw["C_600570"].fillna('00000')

# Numeric variables: NaN -> -999.
# The list of numeric columns is derived from data_mw and then applied to
# both datasets.
numeric_columns = data_mw.select_dtypes(include=['int64', 'float64']).columns
for frame in (data_mw, data_st):
    frame[numeric_columns] = frame[numeric_columns].fillna(-999)

#### Encoding

In [12]:
# Work on an independent (deep) copy of the dataset; from here on the main
# dataset is 'df'.
df = data_mw.copy(deep=True)

In [13]:
# Encode the categorical (object-typed) variables, leaving out the
# performance variables listed in cat_perf_cols.
# Each fitted encoder is stored in label_encoders under its column name.
label_encoders = {}
object_cols = [
    name
    for name in df.columns
    if df[name].dtype == 'object' and name not in cat_perf_cols
]
for name in object_cols:
    encoder = LabelEncoder()
    df[name] = encoder.fit_transform(df[name])
    label_encoders[name] = encoder

In [14]:
# Encode the performance variables with a fixed numeric code.
# Values that are not keys of `mapping` (including missing values) become NaN.
mapping = {'B': 3, 'I': 2, 'G': 1, 'O': 0}
encoded_perf = {col: df[col].map(mapping) for col in cat_perf_cols}
for col, codes in encoded_perf.items():
    df[col] = codes

#### Train, test split

In [15]:
# Hold out 30 % of the rows as the test set; the fixed random_state makes the
# split repeatable.
split_options = {"test_size": 0.3, "random_state": 42}
train, test = train_test_split(df, **split_options)

# Backups of both splits.
train_backup, test_backup = train.copy(), test.copy()

#### Preparation of data for models

In [44]:
# Default training set: the whole training split (same object as `train`,
# not a copy). The segment-determination cells below rebind data_to_train,
# so the value seen by the models depends on which of these cells ran last.
data_to_train = train

##### Segment 1 determination

In [26]:
# Performance columns that are dropped before training.
# "C_500600" is intentionally not in this list (it was commented out).
performance_cols = [
    "C_111640", "C_111660", "C_111620", "C_111600", "C_111500",
    "C_500300", "C_501200", "C_502400",
    "C_751200", "C_750600", "C_750100", "C_752400", "C_750300",
    "C_106920",
]

# Dropping the performance columns; all rows of the training split are kept.
data_to_train = train.drop(columns=performance_cols)

##### Segment 2 determination

In [30]:
# Dropping segment 1 and the relevant variables: keep only rows where the
# segment-1 determination variable equals 0 or 1, then remove the performance
# columns and the determination variable itself.
determination_var_s1 = "C_500600"
s1_values = train[determination_var_s1]
condition_s1 = s1_values.eq(0) | s1_values.eq(1)
data_to_train = (
    train.loc[condition_s1]
    .copy()
    .drop(columns=[*performance_cols, determination_var_s1])
)

##### Segment 3 and 4 determination

In [None]:
# Row filters: the segment-1 variable must be 0 or 1 AND the segment-2
# variable must be <= 1 (i.e. segments 1 and 2 are thrown out).
# Relies on determination_var_s1 and performance_cols from the earlier cells.
determination_var_s2 = "C_102800"
s1_values = train[determination_var_s1]
condition_s1 = s1_values.eq(0) | s1_values.eq(1)
condition_s2 = train[determination_var_s2].le(1)
data_to_train = train.loc[condition_s1 & condition_s2].copy()

# performance_cols + the variables relevant for segments 1 and 2 + anything
# else that should be dropped.
cols_to_drop = [
    *performance_cols,
    determination_var_s1,
    determination_var_s2,
    'C_906370',
]
data_to_train = data_to_train.drop(columns=cols_to_drop)

### Models

##### Decision Tree

In [40]:
# Target / feature split; the result depends on which data_to_train is in use.
y = data_to_train['performance']
X = data_to_train.drop(columns='performance')

# Options:
# testing only 1 variable
#var_to_test = "C_114100"
#X = pd.DataFrame(X[var_to_test])

In [41]:
# Small entropy-based tree: at most 4 leaves, depth <= 3, and every leaf must
# hold at least 5,000 training rows. The fixed random_state keeps the fit
# repeatable.
tree_params = {
    "criterion": 'entropy',
    "max_depth": 3,
    "max_leaf_nodes": 4,
    "min_samples_leaf": 5_000,
    "random_state": 42,
}
clf = DecisionTreeClassifier(**tree_params)
clf.fit(X, y)

Visualisation

In [None]:
# Text view of the fitted decision tree, produced by the project-local
# seg_functions helper (NOTE(review): exact output format is defined there).
seg.tree_viz_text(clf) #text

In [None]:
# Plot of the fitted decision tree via the project-local seg_functions helper.
# NOTE(review): class_names presumably has to follow the encoded order of
# 'performance', and w / h look like the figure width and height — confirm
# both in seg_functions.
seg.tree_viz_plot(
    clf,
    class_names=["B", "G"],
    w=30,
    h=20,
)

##### Random Forest

In [45]:
# Target / feature split for the random forest (same split as for the tree).
y = data_to_train['performance']
X = data_to_train.drop(columns='performance')

In [46]:
# Forest of 500 shallow entropy-based trees (depth <= 3, at least 5,000 rows
# per leaf), with a fixed random_state.
rf_params = {
    "criterion": 'entropy',
    "max_depth": 3,
    "min_samples_leaf": 5_000,
    "n_estimators": 500,
    "random_state": 42,
}
rf = RandomForestClassifier(**rf_params)
rf.fit(X, y)

# One row per feature, in the column order of X (not yet sorted).
feature_importance = pd.DataFrame(
    {'feature': X.columns, 'importance': rf.feature_importances_}
)

Output

In [None]:
# Show the random-forest importances, most important feature first, with a
# fresh 0..n-1 index.
ranked_importance = feature_importance.sort_values(by='importance', ascending=False)
ranked_importance.reset_index(drop=True)

##### XGBoost

In [45]:
# Target / feature split for XGBoost (same split as for the other models).
y = data_to_train['performance']
X = data_to_train.drop(columns='performance')

In [None]:
# Gradient-boosted binary classifier: 500 trees of depth <= 3, evaluated with
# log loss.
# The seed is passed as `random_state`, the documented parameter of the
# scikit-learn wrapper (and the same name the decision tree and random forest
# use). The former `seed=` is not a declared parameter and only reached the
# booster through **kwargs, which XGBoost documents as unsupported by
# scikit-learn.
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    max_depth=3,
    n_estimators=500,
    random_state=42,
)
model.fit(X, y)

In [47]:
# Importances reported by the fitted XGBoost model, aligned with X's columns.
feature_importances = model.feature_importances_

unsorted_importance = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
)
# Most important feature first; the original row labels are kept.
importance_df = unsorted_importance.sort_values(by='Importance', ascending=False)


Output

In [None]:
# Display the XGBoost feature importances (already sorted in descending order).
importance_df