# Blueprint Design
## Package Import and Helper Function

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score
from sklearn.feature_selection import SelectPercentile, SelectKBest, f_regression, f_classif
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, Normalizer
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import roc_curve, auc, roc_auc_score, f1_score, plot_roc_curve, confusion_matrix, \
ConfusionMatrixDisplay, brier_score_loss, accuracy_score

from datetime import date
import scorecardpy as sc

# Plot configuration used by every figure created later in the notebook
%matplotlib inline  
plt.rcParams["figure.figsize"] = (12, 6)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Walk the Kaggle input directory and print the full path of every file found
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# NOTE(review): this limit of 500 columns is overridden a few lines below,
# where display.max_columns is set to None
pd.set_option('display.max_columns', 500)

# NOTE(review): late import; consider moving it next to the other imports at the top of the cell
import shap

# Final setting for the number of displayed columns (None replaces the 500 set above)
pd.options.display.max_columns = None

### Define Parameters

In [None]:
# truncation
# These four flag names match the keyword parameters of preprocess_df;
# presumably they are passed to it by the caller - TODO confirm.
truncate_item_price = True
truncate_delivery_days = False
truncate_age = False
cut_age = True

# imputation
# NOTE(review): neither setting below is read anywhere in the visible part of
# the notebook - confirm where they are consumed.
numeric_imputer_strategy =  'median' # 'median', 'most_frequent', 'constant', 'mean'
numeric_standard_scaler_mean = True

###  Define Methods and Wrap them

In [None]:
def preprocess_df(df, known_data:bool, truncate_delivery_days:bool, truncate_item_price:bool, truncate_age:bool, cut_age:bool):
    """Run the complete feature-engineering sequence on an order-item frame.

    Every step is delegated to a helper that takes the frame and returns the
    frame to continue with. The note above each call records how the helper
    is expected to work (taken from the original author's notes).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw order-item data.
    known_data : bool
        Forwarded to ``transform_columns``; True when the frame contains the
        ``return`` target column.
    truncate_delivery_days : bool
        Apply ``remove_delivery_days_outliers`` when True.
    truncate_item_price : bool
        Apply ``truncate_item_price_outliers`` when True.
    truncate_age : bool
        Apply ``truncate_age_outliers`` when True.
    cut_age : bool
        Apply ``cut_age_outliers`` when True.

    Returns
    -------
    pandas.DataFrame
        The frame returned by the last helper.

    Side effects
    ------------
    Prints the name of every optional step that is executed and a final
    ``df.info()`` summary.
    """
    # change object variables to datatype category
    # change numeric variables from float64 to float32 (reduce memory consumption)
    # change feature return to boolean (2 categories)
    # change dates to the datetime datatype 'datetime64[ns]'
    df = transform_columns(df, known_data)

    # via (df['delivery_date'] - df['order_date']).dt.days
    df = add_delivery_days(df)

    if truncate_delivery_days:
        print('truncate_delivery_days')
        # via outlier_truncation(df['delivery_days'])
        # # Define upper/lower bound
        # # upper = x.quantile(0.75) + factor*IQR
        # # lower = x.quantile(0.25) - factor*IQR
        df = remove_delivery_days_outliers(df)

    # via df['delivery_date'].apply(lambda x: False if pd.isnull(x) else True)
    df = add_delivery_date_missing(df)

    # year<2016 is all 1994, which is suspicious
    # via df['delivery_date'].apply(lambda x: True if x.year < 2016 else False)
    df = add_delivery_date_1994_marker(df)

    # via df.loc[df['delivery_date'].dt.year < 2016,['delivery_days']] = np.nan
    df = set_delivery_date_1994_to_nan(df)

    # via df['brand_id'].apply(lambda x: (df['brand_id'] == x).sum())
    df = add_brand_id_count(df)

    # via df['item_id'].apply(lambda x: (df['item_id'] == x).sum())
    df = add_item_id_count(df)

    # set it all to lowercase and correct some spelling error
    # then via df['item_color'].apply(lambda x: (df['item_color'] == x).sum())
    df = add_item_color_count(df)

    # a practical summary for retailing size:
    # sizes_dict = {
    #     '84': 'xxs', '104': 's', '110': 's', '116': 's', '122': 'm', '128': 'm',
    #     '134': 'l', '140': 'l', '148': 'xl', '152': 'xl', '164': 'xxl', '170': 'xxl',
    #     '176': 'xxxl', '18': 'xs', '19': 's', '20': 's', '21': 'm', '22': 'm', '23': 'l',
    #     '24':  'xl', '25': 'xs', '26': 's', '27': 's', '28': 'm', '29': 'm',  '30': 'l',
    #     '31': 'l', '32': 'xl', '33': 'xxl', '34': 'xxs', '35': 'xs', '36': 'xs', '36+': 's',
    #     '37': 's', '37+': 's', '38': 's', '38+': 's', '39': 'm', '39+': 'm', '40': 'm',
    #     '40+': 'm', '41': 'm', '41+' : 'm', '42': 'l', '42+': 'l', '43': 'l', '43+': 'l',
    #     '44': 'l', '44+' : 'xl', '45' : 'xl', '45+': 'xl', '46': 'xl', '46+' : 'xl',
    #     '47' : 'xl', '48': 'xl', '49': 'xl', '50': 'xxl', '52': 'xxl', '54': 'xxl',
    #     '56': 'xxl', '58': 'xxl', 0: 'xxs', '1': 'xxs', '2': 'xxs', '2+': 'xxs', '3' : 'xxs',
    #     '3+': 'xs', '4':  'xs', '4+': 'xs', '5': 'xs', '5+':'xs', '6':'s', '6+':'s',
    #     '7':'s', '7+':'m', '8':'m', '8+':'m', '9': 'l', '9+': 'l', '10': 'l', '10+': 'xl',
    #     '11': 'xl', '11+': 'xl', '12': 'xl', '12+': 'xxl', '13': 'xxl', '14': 'xxl',
    #     36: 'xxs', 38: 'xs', 40: 's', 42: 'm', 44: 'l', 46: 'xl', 48: 'xxl',
    #     '3132': 'xxs', '3332': 'xs', '3432': 'xs', '3632': 's', '3832': 'm', '3634': 'l',
    #     '3834': 'xl', '4032': 'xl', '4034': 'xxl', '4232': 'xxxl', '80': 'xs', '85': 's',
    #     '90': 'm', '95': 'l', '100': 'xl', '105': 'xxl'
    # }
    df = convert_item_sizes(df)

    if truncate_item_price:
        # Label now matches the flag name, like the other optional steps
        print('truncate_item_price')
        # via outlier_truncation(df['item_price'])
        df = truncate_item_price_outliers(df)

    # via df['user_dob'].apply(calculate_age)
    # today = date.today()
    # return today.year - born.year - ((today.month, today.day) < (born.month, born.day))
    df = add_age(df)

    if truncate_age:
        print('truncate_age')
        # via outlier_truncation(df['age'])
        df = truncate_age_outliers(df)
    if cut_age:
        print('cut_age')
        # via df.loc[df['age'] > 95,'age'] = np.nan
        df = cut_age_outliers(df)

    # via df['user_dob'].apply(lambda x: False if pd.isnull(x) else True)
    df = add_dob_missing(df)

    # via df['been_member_for'] = (df['order_date']-df['user_reg_date'] ).dt.days
    df = add_been_member_for(df)

    # labels = ['fresh_member', 'new_member', 'member', 'old_member']
    # cut_bins = [-5, 150, 300, 450, 1000]
    # via df['member'] = pd.cut(df['been_member_for'], bins=cut_bins, labels=labels)
    df = add_member_category(df)

    # DataFrame.info() prints the summary itself and returns None, so wrapping
    # it in print() only appended a stray "None" line.
    df.info()
    return df

## EDA and Data Preparation
### Load Data and Take a First Glance

In [None]:
%%time
# Load the labelled ("known") data set and use the order_item_id column as index.
# NOTE(review): '.../' looks like a placeholder - point it at the real data directory.
df_known = pd.read_csv('.../BADS_WS2021_known.csv', index_col='order_item_id') 
# NOTE(review): this expression is not the last statement of the cell, so the
# preview is not rendered; wrap it in display(...) or move it to its own cell.
df_known.head()
# Query some properties of the data
print('Dimensionality of the data is {}'.format(df_known.shape))  # .shape returns a tuple
print('The data set has {} cases.'.format(df_known.shape[0]))     # we can also index the elements of that tuple
print('The total number of elements is {}.'.format(df_known.size))
# Column names, non-null counts and dtypes
df_known.info()

### Conversion After Comparison of Final Evaluation

In [None]:
def transform_columns(df_known, known_data:bool):
    """Cast the raw order-item columns to their working dtypes.

    The conversion is applied to the frame that is passed in (its columns are
    reassigned), and the same frame object is returned for convenience. Pass
    a copy if the raw frame must stay untouched.

    Parameters
    ----------
    df_known : pandas.DataFrame
        Frame with the columns ``item_size``, ``item_color``, ``user_title``,
        ``user_state``, ``item_price``, ``brand_id``, ``user_id``, ``item_id``,
        ``order_date``, ``delivery_date``, ``user_dob``, ``user_reg_date`` and,
        when ``known_data`` is True, ``return``.
    known_data : bool
        True when the frame contains the ``return`` target, which is then cast
        to bool.

    Returns
    -------
    pandas.DataFrame
        ``df_known`` with converted dtypes.
    """
    # object variables -> category
    for column in ('item_size', 'item_color', 'user_title', 'user_state'):
        df_known[column] = df_known[column].astype('category')

    # Prices: round to two decimals and store as float32 to reduce memory.
    # Rounding is vectorised instead of formatting every value as a string
    # and parsing it back; it is done in float64 so the final float32 value
    # is the closest representation of the rounded price.
    df_known['item_price'] = (
        df_known['item_price'].astype(np.float64).round(2).astype(np.float32)
    )

    # identifiers -> int32 to reduce memory
    for column in ('brand_id', 'user_id', 'item_id'):
        df_known[column] = df_known[column].astype(np.int32)

    if known_data:
        # since the feature return has only two values, we convert it to boolean
        df_known['return'] = df_known['return'].astype('bool')

    # all dates -> datetime
    for column in ('order_date', 'delivery_date', 'user_dob', 'user_reg_date'):
        df_known[column] = df_known[column].astype('datetime64[ns]')

    return df_known

# `df` is not defined before this point; the loaded frame is `df_known`.
# transform_columns converts the columns of the frame it receives, so a copy is
# passed to keep the raw `df_known` intact and make this cell safe to re-run.
df = transform_columns(df_known.copy(), known_data=True)
df.info()

# <class 'pandas.core.frame.DataFrame'>
# Int64Index: 100000 entries, 1 to 100000
# Data columns (total 13 columns):
#  #   Column         Non-Null Count   Dtype         
# ---  ------         --------------   -----         
#  0   order_date     100000 non-null  datetime64[ns]
#  1   delivery_date  90682 non-null   datetime64[ns]
#  2   item_id        100000 non-null  int32         
#  3   item_size      100000 non-null  category      
#  4   item_color     100000 non-null  category      
#  5   brand_id       100000 non-null  int32         
#  6   item_price     100000 non-null  float32       
#  7   user_id        100000 non-null  int32         
#  8   user_title     100000 non-null  category      
#  9   user_dob       91275 non-null   datetime64[ns]
#  10  user_state     100000 non-null  category      
#  11  user_reg_date  100000 non-null  datetime64[ns]
#  12  return         100000 non-null  bool          
# dtypes: bool(1), category(4), datetime64[ns](4), float32(1), int32(3)
# memory usage: 5.8 MB

### Some Data Description During Exploration

In [None]:
# Collection of exploration snippets.
# NOTE(review): several snippets use names and columns that are not created in
# the visible part of the notebook (`df_corr`, `df1`, 'delivery_date_missing',
# 'delivery_days', 'order_age', 'been_member_for', the '*_weekend' columns), so
# this cell presumably has to run after the feature-engineering steps - TODO confirm.
# Only the last expression of a cell is rendered; split the snippets into separate
# cells (or wrap them in display(...)) to see every result.

# Describe feature
df['order_date'].describe()

# Count "return" according to "delivery_date"
## Bar plot
# Value counts of `return` for the rows whose delivery_date is missing
df.loc[df["delivery_date"].isna(), ['return']]['return'].value_counts()#.plot(kind="bar")

# Head data of multiple features
df[['delivery_date', 'delivery_date_missing', 'return']].head()

# Cross table
# normalize='index' turns every user_title row into shares that sum to 1
pd.crosstab(df['user_title'], df["return"], normalize='index')

# Correlation between features
df[['delivery_date_missing', 'return']].corr()
# Heatmap
# NOTE(review): `df_corr` is not defined in the visible notebook
sns.heatmap(df_corr.corr(),
            annot=True);
# Boxplot
df.boxplot(column='delivery_days')
# Violin plot
# One figure per listed column, split by `return` within each user_title.
# NOTE(review): `df1` is not defined in the visible notebook, and `subplots` does
# not look like a documented seaborn.violinplot argument - confirm it is accepted.
for col in ['order_age','order_age_combine','order_deliver','reg_order']:
    plt.figure()
    sns.violinplot(x='user_title', y=col, hue='return',
                   split=True, inner="quart",
                   data= df1, subplots=True)
                   
# Select specific data according to specific criterion
# Rows with a negative delivery duration
df[df['delivery_days'] < 0]

# Categorical feature distribution
# One count plot for every column of dtype category
# NOTE(review): the figure created on the next line is not the one the loop draws
# into when i > 0 (plt.figure(i) selects/creates figure number i) - check whether it is needed.
plt.figure(figsize=(12,6))
for i, col in enumerate(df.select_dtypes('category').columns):
    plt.figure(i)
    a = sns.countplot(x=col, data=df)
    a.set_xticklabels(a.get_xticklabels(), rotation=50, ha="right", fontsize=11)  

# Stack count plot
# Counts per (return, flag) pair, pivoted so that `return` becomes the stacked series
for i, col in enumerate(['order_date_weekend','delivery_date_weekend',
                         'user_reg_date_weekend']):
    plt.figure(i)
    df1.groupby(['return', col]).size().reset_index().pivot(
        columns='return', index=col, values=0).plot(kind='bar',
                                                    stacked=True)


# WoE (Weight of Evidence, just as an example, it will not be done in this section)
bins_been_member_for = sc.woebin(df, y="return", x='been_member_for')

sc.woebin_plot(bins_been_member_for)

# Numerical feature distribution
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) or sns.displot instead.
sns.distplot(df['item_price'])

# Histogram
plt.figure(figsize=(18,12))  # enlarge the figure
# Histogram of a numeric variable; `bins` sets the number of bins
df['item_price'].hist(bins=20)
# or
sns.distplot(df['been_member_for'])



### Feature selection

In [None]:
# Remove correlated features (negative correlation with 'brand_id_woe').
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone no longer raises a KeyError.
df = df.drop(columns=['brand_id_frequency', 'brand_id_count'], errors='ignore')

# Filter function
class filter_binary_target:
    """Univariate filter scores of every feature against a binary target.

    Discrete features (bool / category) are scored with the Information
    Value (IV), numeric features with the Fisher score.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the binary target column (0/1 or False/True).

    Attributes
    ----------
    data : pandas.DataFrame
        The frame passed to the constructor; all scores are computed on it.
    data_head : pandas.DataFrame
        First rows of ``data`` (kept for backward compatibility).
    """

    def __init__(self, df, target):
        self.target = target
        # Keep the frame itself: previously only the head was stored and the
        # scoring methods silently read a global variable named `df`.
        self.data = df
        self.data_head = df.head()

    def auto_filter_binary_target(self):
        """Score every non-target column and return the results as a table.

        Returns
        -------
        pandas.DataFrame
            Indexed by feature name with the columns 'Data Type', 'Metric'
            and 'Score'. Columns that are neither bool/category nor numeric
            (e.g. datetimes) keep NaN entries.
        """
        print('Data must be in a clean pandas DataFrame. Categorical variables must be of data type bool or category. Continuous variables must be of an integer or float data type.')
        data_no_target = self.data.drop(columns=self.target)
        result = pd.DataFrame(index=data_no_target.columns,
                              columns=['Data Type', 'Metric', 'Score'])

        for col in data_no_target.columns:
            dtype = data_no_target[col].dtype
            if pd.api.types.is_bool_dtype(dtype) or dtype.name == 'category':
                result.loc[col, 'Data Type'] = "discrete"
                result.loc[col, 'Metric'] = "IV"
                result.loc[col, 'Score'] = self.IV_binary_target(feature=col)
            elif pd.api.types.is_numeric_dtype(dtype):
                # Any numeric width counts as continuous, so down-cast columns
                # (int32 / float32) are scored too instead of being skipped.
                result.loc[col, 'Data Type'] = "continuous"
                result.loc[col, 'Metric'] = "Fisher"
                result.loc[col, 'Score'] = self.fisher_binary_target(feature=col)

        return result

    def IV_binary_target(self, feature):
        """Return the Information Value of a discrete ``feature``.

        Per category: WOE = ln(share of goods / share of bads) and
        IV = sum(WOE * (share of goods - share of bads)), where "bad" is
        target == 1. Categories with an infinite WOE (no goods or no bads)
        contribute 0.
        """
        frame = self.data
        stats = frame[feature].value_counts().to_frame('Count')
        # observed=False keeps unobserved categories, matching value_counts
        stats['Bad'] = frame.groupby(feature, observed=False)[self.target].sum()
        stats['Good'] = stats['Count'] - stats['Bad']

        stats["Distribution Bad"] = stats["Bad"] / stats["Bad"].sum()
        stats["Distribution Good"] = stats["Good"] / stats["Good"].sum()

        stats['WOE'] = np.log(stats["Distribution Good"] / stats["Distribution Bad"])
        # replace() returns a new frame; the result has to be assigned,
        # otherwise the infinite values stay in place and make the IV infinite.
        stats = stats.replace({"WOE": {np.inf: 0, -np.inf: 0}})

        stats["IV"] = stats["WOE"] * (stats["Distribution Good"] - stats["Distribution Bad"])

        return stats["IV"].sum()

    def fisher_binary_target(self, feature):
        """Return the Fisher score |mu_0 - mu_1| / sqrt(var_0 + var_1).

        Raises
        ------
        ValueError
            If the target does not have exactly two classes.
        """
        # One grouped aggregation instead of four; groups are sorted by target
        # value, so position 0 is the 0/False class and position 1 the 1/True class.
        per_class = self.data.groupby(self.target)[feature].agg(['mean', 'var'])
        if len(per_class) != 2:
            raise ValueError('Target must contain exactly 2 classes.')

        mu_0, mu_1 = per_class['mean'].iloc[0], per_class['mean'].iloc[1]
        var_0, var_1 = per_class['var'].iloc[0], per_class['var'].iloc[1]

        num = abs(mu_0 - mu_1)
        den = (var_0 + var_1) ** 0.5
        return num / den

    def pearson(self, feature):
        """Return the Pearson correlation between ``feature`` and the target.

        Not needed for a binary target; kept for extensions of this class.
        """
        frame = self.data
        feature_centered = frame[feature] - frame[feature].mean()
        target_centered = frame[self.target] - frame[self.target].mean()
        num = (feature_centered * target_centered).sum()
        den = ((feature_centered ** 2).sum() * (target_centered ** 2).sum()) ** .5
        return num / den

## Pipeline construction
### Class definition

In [None]:
# Keep only the configured columns
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Transformer that returns ``X[columns]`` and learns nothing in ``fit``."""

    def __init__(self, columns):
        # Column label(s) used to index X in transform()
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return the selected columns of ``X``."""
        wanted = self.columns
        return X[wanted]
        
# Drop only specified columns; it will be used after completely preprocessing the data, before training the model
class DropColumns(BaseEstimator, TransformerMixin):
    """Drop columns from the transformed data and relabel the remaining ones.

    Parameters
    ----------
    drop_list : iterable
        Column labels to drop. The input is wrapped in ``pd.DataFrame`` first,
        so for array input these are the integer column positions.
    feature_names : list-like, optional
        Labels assigned to the columns that remain after dropping. When
        omitted, the notebook-level variable ``X_var_labels`` is used, which
        preserves the previous behaviour.
    """

    def __init__(self, drop_list, feature_names=None):
        self.drop_list = drop_list
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` without the dropped columns, relabelled."""
        frame = pd.DataFrame(X)
        reduced = frame.drop(list(self.drop_list), axis=1)

        # Rename: prefer the explicit parameter so the transformer does not
        # depend on hidden notebook state; fall back to the global otherwise.
        labels = self.feature_names
        if labels is None:
            labels = X_var_labels
        reduced.columns = labels

        return reduced

# Weight-of-Evidence
class WoETransformer(BaseEstimator, TransformerMixin):
    """Replace one column by its smoothed Weight-of-Evidence (WoE).

    For every category c of ``feature``:

        WOE_adj(c) = ln( ((good_c + 0.5) / good_total) / ((bad_c + 0.5) / bad_total) )

    where "bad" counts rows with target == 1 and "good" rows with target == 0.

    Parameters
    ----------
    target : str
        Name of the binary target (0/1 or False/True).
    feature : str
        Name of the column to encode.

    Attributes
    ----------
    data : pandas.DataFrame
        Set by ``fit``; per-category 'Count', 'Bad', 'Good' and 'WOE_adj',
        sorted by 'Count' in descending order.
    """

    def __init__(self, target, feature):
        self.target = target
        self.feature = feature

    def fit(self, df, y):
        """Learn the WoE of every category of ``feature``.

        ``y`` may be a DataFrame, Series or array; it is matched to ``df`` by
        position, so a plain array (e.g. ``y.values.ravel()``) works as well.

        Raises
        ------
        AssertionError
            If the target does not consist of the two classes 0 and 1.
        """
        if isinstance(y, pd.DataFrame):
            y = y[self.target] if self.target in y.columns else y.iloc[:, 0]
        labels = pd.Series(np.asarray(y).ravel(), index=df.index, name=self.target)
        frame = pd.concat([df[[self.feature]], labels], axis=1)

        # Count instances of each category, one row per category
        data = frame[self.feature].value_counts().to_frame('Count')
        # Count y=1 instances of that category (observed=False keeps unobserved categories)
        data['Bad'] = frame.groupby(self.feature, observed=False)[self.target].sum()
        # Count y=0 instances of that category
        data['Good'] = data['Count'] - data['Bad']

        try:
            assert data["Bad"].sum() != 0                               # there are y=1 instances in the sample
            assert data["Good"].sum() != 0                              # there are y=0 instances in the sample
            assert np.isin(frame[self.target].unique(), [0, 1]).all()   # target includes only 0,1 or True,False
        except AssertionError:
            print("Error: Target must include 2 binary classes.")
            raise

        # Smoothed distribution ratio
        data["WOE_adj"] = np.log(
            ((data["Good"] + 0.5) / data["Good"].sum()) /
            ((data["Bad"] + 0.5) / data["Bad"].sum())
        )
        # replace() returns a new frame, so the result must be assigned
        data = data.replace({"WOE_adj": {np.inf: 0, -np.inf: 0}})
        self.data = data.sort_values(by=["Count"], ascending=False)
        return self

    def transform(self, df):
        """Return a copy of ``df`` with ``feature`` mapped to its WoE.

        Categories that were not seen during ``fit`` become NaN. The input
        frame is left untouched so other pipeline branches that receive the
        same frame (e.g. in a FeatureUnion) and later re-fits still see the
        original values.
        """
        encoded = df.copy()
        # Assign the column as a whole (not via .loc) so a category-typed
        # column is replaced by a float column.
        encoded[self.feature] = encoded[self.feature].map(self.data["WOE_adj"]).astype(float)
        return encoded


# Score every feature of `df` against the target "return".
# NOTE(review): the name `filter` shadows Python's built-in filter() for the rest
# of the session; consider renaming it once all uses of this variable are known.
filter = filter_binary_target(df=df, target="return")

# Last expression of the cell: the score table is rendered as the cell output
filter.auto_filter_binary_target()

### Preprocessor combination

In [None]:
# Chain of Weight-of-Evidence encoders, one step ('step_1' ... 'step_6') per feature.
# NOTE(review): WoETransformer.transform returns the whole frame, so this branch
# hands every column of X (not only the six encoded ones) to the FeatureUnion -
# confirm that a later step drops or selects columns.
woe_pipe = Pipeline([
    ('step_{}'.format(position), WoETransformer(feature=name, target='return'))
    for position, name in enumerate(
        ['item_id', 'brand_id', 'user_id', 'item_size', 'item_color', 'user_state'],
        start=1)
])

# Standardise the selected numeric columns
std_pipe = Pipeline(steps=[
    ('selector', ColumnSelector(columns=['order_age_combine', 'order_deliver',
                                         'reg_order', 'item_price'])),
    ('scaler', StandardScaler()),
])

# Both branches receive the same input; their outputs are concatenated column-wise
preprocessor = FeatureUnion([('WoE', woe_pipe), ('Std', std_pipe)])

### Final DataFrame (just an example) and train-test splitting

In [None]:
# Target as a one-column boolean frame, features as everything else
y = df[['return']].astype('bool')
X = df.drop(columns=['return'])

# 70/30 split with a fixed seed.
# NOTE(review): .loc slices by index label and includes both ends, so 0:99999
# selects the labels 0..99999 - verify this matches the index of `df`.
X_train, X_test, y_train, y_test = train_test_split(
    X.loc[0:99999, :],
    y.loc[0:99999, :],
    test_size=0.3,
    random_state=888,
)

# Features and target side by side for each partition
Xy_train = pd.concat((X_train, y_train), axis='columns')
Xy_test = pd.concat((X_test, y_test), axis='columns')

print("Remember the shape of our data: ")
print(*(part.shape for part in (X_train, y_train, X_test, y_test, Xy_train, Xy_test)))

### Logistic Regression

In [None]:
model_lr = LogisticRegression(max_iter=1000, C=1.0, fit_intercept=True)

# Search space for GridSearchCV. Every key has to name a step and parameter
# that exists in `clf_lr`. The former 'preprocessor__num__...' entries were
# removed because this preprocessor has no 'num' branch (its branches are
# 'WoE' and 'Std').
lr_param_grid = {
    'select__percentile': [5, 10, 25, 50],
    'classifier__C': [0.1, 1.0, 10, 100],
    'classifier__fit_intercept': [True, False]
}

clf_lr = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        # Keep the given percentage of features with the highest ANOVA F-score.
        # f_classif is the score function for a classification target
        # (f_regression is meant for continuous targets). percentile=100 keeps
        # all features, as k='all' did before, and makes 'select__percentile'
        # in the grid above a valid parameter.
        ('select', SelectPercentile(score_func=f_classif, percentile=100)),
        ('classifier', model_lr)
    ]
)

# gs_lr = GridSearchCV(estimator=clf_lr, param_grid=lr_param_grid, scoring='roc_auc', cv=5, verbose=0)
# gs_lr.fit(X_train, y_train)

clf_lr.fit(X_train, y_train)

# ROC Curve
