### Mount Drive

In [1]:
# Mount Google Drive at /content/drive; the dataset copy cell and the config
# paths further down all point below this mount point.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


### Download Dataset from Kaggle

In [None]:
# Copy the Kaggle API token (kaggle.json, expected in the current working
# directory) into ~/.kaggle before the download cell runs.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Owner read/write only. Use the same ~/.kaggle path as the lines above instead
# of a hard-coded /root so the cell does not depend on which user runs it.
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the dataset archive. The target directory is given explicitly because
# the unzip cell below reads the archive from an absolute /content path.
!kaggle datasets download -d raddar/amex-data-integer-dtypes-parquet-format -p /content

Downloading amex-data-integer-dtypes-parquet-format.zip to /content
100% 4.07G/4.07G [03:26<00:00, 20.4MB/s]
100% 4.07G/4.07G [03:26<00:00, 21.1MB/s]


In [None]:
# Extract train.parquet / test.parquet into /content (the copy cell below reads
# them from there). -n skips files that already exist, so re-running the cell
# does not stall on unzip's interactive "replace?" prompt.
!unzip -n /content/amex-data-integer-dtypes-parquet-format.zip -d /content

Archive:  /content/amex-data-integer-dtypes-parquet-format.zip
  inflating: test.parquet            
  inflating: train.parquet           


In [None]:
# Persist the extracted parquet files on Drive.
# NOTE(review): this destination is MyDrive/fyp/ml/data, while config.data_dir in
# the Config cell is MyDrive/ml/data (no "fyp") - confirm which one is intended.
# Create the directory first: without it, cp would write a plain file named
# "data" and the second copy would overwrite the first.
!mkdir -p '/content/drive/MyDrive/fyp/ml/data'
!cp '/content/test.parquet' '/content/drive/MyDrive/fyp/ml/data/'
!cp '/content/train.parquet' '/content/drive/MyDrive/fyp/ml/data/'

### Install Requirements

In [2]:
# %pip installs into the running kernel's environment; the version is pinned to
# the one this notebook was last run with so results stay reproducible.
%pip install -q catboost==1.1.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp38-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


### Import Libraries

In [3]:
import gc
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold

%matplotlib inline

### Config

In [37]:
class config:
    """Tunable settings read by the data loading, training and output functions."""

    # --- paths -------------------------------------------------------------
    # NOTE(review): the dataset copy cell writes the parquet files to
    # '/content/drive/MyDrive/fyp/ml/data'; confirm data_dir points at the
    # directory that actually holds them.
    data_dir = '/content/drive/MyDrive/ml/data'
    output_dir = '/content/drive/MyDrive/ml/output'

    # --- data --------------------------------------------------------------
    num_transactions = 2  # rows kept per customer_ID when loading the train set
    seed = 42             # random seed passed to the LightGBM / CatBoost models

    # --- model -------------------------------------------------------------
    model = "knn"         # one of: svm, knn, cbc, lgbm, ensemble
    scaler = "standard"   # one of: standard, robust

    # LightGBM only
    n_estimators = 300

    # --- feature selection -------------------------------------------------
    select_feature = True  # keep only features whose correlation passes `threshold`
    threshold = 0.3        # cut-off applied to the correlation with `target`
    plot = False           # draw the correlation heatmap during feature selection

    # --- k-fold ------------------------------------------------------------
    kfold = False          # train through kfold() instead of a single fit
    splits = 2             # number of folds used by kfold()

### Functions

In [38]:
def load_train_dataset(data_dir):
  """Load the training rows joined with their labels.

  Reads `train.parquet` and `train_labels.csv` from `data_dir`, keeps the last
  `config.num_transactions` rows of every customer_ID (in file order), and
  inner-joins the result with the labels on the customer_ID index.

  Args:
    data_dir: directory containing `train.parquet` and `train_labels.csv`.

  Returns:
    DataFrame indexed by customer_ID holding the feature columns plus the
    columns of the labels file.
  """
  statements_path = os.path.join(data_dir, 'train.parquet')
  labels_path = os.path.join(data_dir, 'train_labels.csv')

  statements = pd.read_parquet(statements_path)
  labels = pd.read_csv(labels_path)

  # Last N rows per customer, then index both frames by customer_ID for the join.
  latest_statements = (
      statements
      .groupby('customer_ID')
      .tail(config.num_transactions)
      .set_index('customer_ID', drop=True)
      .sort_index()
  )
  labels_by_customer = labels.set_index('customer_ID', drop=True).sort_index()

  return pd.merge(latest_statements, labels_by_customer, left_index=True, right_index=True)

def load_test_dataset(data_dir):
  """Load the test set with one row per customer.

  Reads `test.parquet` from `data_dir` and keeps only the last row of each
  customer_ID (in file order).

  Args:
    data_dir: directory containing `test.parquet`.

  Returns:
    DataFrame indexed by customer_ID, sorted by that index.
  """
  test_path = os.path.join(data_dir, 'test.parquet')
  statements = pd.read_parquet(test_path)

  last_statement = statements.groupby('customer_ID').tail(1)
  return last_statement.set_index('customer_ID', drop=True).sort_index()

In [39]:
def feature_selection():
  """Select the columns whose Pearson correlation with `target` is strong enough.

  Loads and preprocesses the training set (without feature selection), then
  keeps every column whose absolute correlation with `target` exceeds
  `config.threshold`. The absolute value is used so that strongly negative
  correlations are kept as well as positive ones.

  Returns:
    pandas Index of the selected column names. `target` is the last column of
    the correlated frame, so when it passes the threshold (its correlation with
    itself is 1) it is the last entry of the returned Index.
  """
  # load dataset
  df_train = load_train_dataset(config.data_dir)

  # preprocess dataset (the fitted scaler / imputer are not needed here)
  train_X, train_y, _, _ = preprocess_train_dataset(df_train)

  # train_X is returned with a fresh positional index, so the target is rebuilt
  # positionally as well to line up row by row in the concat.
  target = pd.Series(list(train_y), name="target")
  df = pd.concat([train_X, target], axis=1)

  if config.plot:
    # The heatmap needs the full column-by-column matrix.
    correlation_matrix = df.corr()
    target_correlation = correlation_matrix["target"]

    # plot the heatmap showing calculated correlations
    plt.subplots(figsize=(50, 50))
    plt.title('Pearson Correlation of continous features')
    sns.heatmap(correlation_matrix,
                annot=True,
                linewidths=.5,
                cmap="YlGnBu",
                square=True)
  else:
    # Only the correlation with the target is needed, so skip the full matrix.
    target_correlation = df.corrwith(df["target"])

  # filter on the magnitude of the correlation
  selected = target_correlation[target_correlation.abs() > config.threshold]

  return selected.index

In [40]:
def preprocess_train_dataset(df_train, features = None, select_feature=False):
  """Impute and scale the training features.

  Args:
    df_train: training frame containing a `target` column (and an `S_2` column
      when `select_feature` is False). It is not modified.
    features: pandas Index of column names to keep, including `target`; only
      used when `select_feature` is True.
    select_feature: when True keep only `features`; otherwise drop `S_2` and
      keep everything else.

  Returns:
    (train_X, train_y, scaler, imputer): the imputed and scaled features as a
    DataFrame with a fresh positional index, the `target` column taken from the
    input frame, and the fitted scaler and imputer for reuse on the test set.

  Raises:
    ValueError: if `config.scaler` is not "standard" or "robust", or if
      `select_feature` is True but `features` is None.
  """
  # Resolve the scaler first so a bad config fails before any heavy work.
  if config.scaler == "standard":
    scaler = StandardScaler()
  elif config.scaler == "robust":
    scaler = RobustScaler()
  else:
    raise ValueError(
        f"Unknown config.scaler {config.scaler!r}; expected 'standard' or 'robust'")

  # whether to do feature selection
  if select_feature:
    if features is None:
      raise ValueError("features must be provided when select_feature is True")
    df_train = df_train[features.to_numpy()]
  else:
    # drop unwanted columns; returns a new frame so the caller's data is untouched
    df_train = df_train.drop(columns=['S_2'])

  train_y = df_train['target']
  train_X = df_train.drop('target', axis=1)
  col_names = train_X.columns

  # handling missing values
  imputer = SimpleImputer()
  train_X = pd.DataFrame(imputer.fit_transform(train_X), columns=col_names)

  # scaling, returned as a DataFrame with the original column names
  train_X = pd.DataFrame(scaler.fit_transform(train_X), index=train_X.index, columns=col_names)

  return train_X, train_y, scaler, imputer

In [41]:
def preprocess_test_data(df_test, imputer, scaler, features = None, select_feature=False):
  """Apply the fitted imputer and scaler to the test features.

  Args:
    df_test: test frame indexed by customer id (with an `S_2` column when
      `select_feature` is False). It is not modified.
    imputer: fitted object exposing `transform`, applied first.
    scaler: fitted object exposing `transform`, applied second.
    features: pandas Index of selected column names; a `target` entry, if
      present, is ignored because the test set has no target column.
    select_feature: when True keep only `features`; otherwise drop `S_2` and
      keep everything else.

  Returns:
    (cust_id, df_test): the index of the input frame and the transformed
    features as a DataFrame with a fresh positional index.

  Raises:
    ValueError: if `select_feature` is True but `features` is None.
  """
  cust_id = df_test.index

  # whether to do feature selection
  if select_feature:
    if features is None:
      raise ValueError("features must be provided when select_feature is True")
    # Remove the label by name rather than by position, so the result does not
    # depend on where `target` sits in the Index.
    feature_cols = pd.Index(features).drop('target', errors='ignore')
    df_test = df_test[feature_cols.to_numpy()]
  else:
    # returns a new frame so the caller's data is untouched
    df_test = df_test.drop(columns=['S_2'])

  col_names_test = df_test.columns

  # handling missing values
  df_test = pd.DataFrame(imputer.transform(df_test), columns=col_names_test)

  # scaling
  df_test = pd.DataFrame(scaler.transform(df_test), index=df_test.index, columns=col_names_test)
  return cust_id, df_test

In [42]:
def _build_model(name):
  """Return a new, unfitted classifier for the given model name."""
  if name == "lgbm":
    return LGBMClassifier(n_estimators=config.n_estimators, random_seed=config.seed)
  if name == "cbc":
    return CatBoostClassifier(silent=True, random_seed=config.seed)
  if name == "knn":
    return KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
  if name == "svm":
    # probability=True is required for predict_proba, which test() calls.
    return svm.SVC(decision_function_shape="ovo", probability=True)
  raise ValueError(f"Unknown model {name!r}; expected 'lgbm', 'cbc', 'knn' or 'svm'")


def kfold(x, y, model):
  """Train one classifier per stratified fold and return the most accurate one.

  Args:
    x: feature DataFrame, indexed positionally by the fold splits.
    y: target Series with the same row order as `x`.
    model: model name, one of "lgbm", "cbc", "knn", "svm".

  Returns:
    The fitted classifier with the highest validation accuracy across
    `config.splits` folds.

  Raises:
    ValueError: if `model` is not a supported name.
  """
  kf = StratifiedKFold(n_splits=config.splits)
  best_model = None
  best_accuracy = 0.0

  for train_index, val_index in kf.split(x, y):
    X_train, X_valid = x.iloc[train_index], x.iloc[val_index]
    Y_train, Y_valid = y.iloc[train_index], y.iloc[val_index]

    # A fresh estimator per fold: reusing one object would refit it in place
    # and leave `best_model` pointing at whatever was fitted last.
    estimator = _build_model(model)
    estimator.fit(X_train, Y_train)
    y_pred = estimator.predict(X_valid)

    accuracy = accuracy_score(Y_valid, y_pred)
    if best_model is None or accuracy > best_accuracy:
      best_accuracy = accuracy
      best_model = estimator

  print(f"Best Accuracy of {model} is {str(best_accuracy)}")
  return best_model

In [43]:
def SVM(X,y):
    """Fit a support vector classifier on X, y and return it.

    The local variable must not be called `svm`: assigning to that name inside
    the function makes it local and hides the imported `svm` module, so the
    `svm.SVC` lookup would raise UnboundLocalError.
    """
    # probability=True is required for predict_proba, which test() calls.
    classifier = svm.SVC(decision_function_shape="ovo", probability=True)
    classifier.fit(X,y)
    return classifier

In [44]:
def KNN(X,y):
    """Fit a 5-nearest-neighbours classifier (Minkowski metric, p=2) on X, y and return it."""
    classifier = KNeighborsClassifier(n_neighbors=5, metric="minkowski", p=2)
    # fit() returns the estimator itself
    return classifier.fit(X, y)

In [45]:
def LGBM(X, y):
  """Fit a LightGBM classifier on X, y using the config's n_estimators and seed, and return it."""
  params = {"n_estimators": config.n_estimators, "random_seed": config.seed}
  classifier = LGBMClassifier(**params)
  classifier.fit(X, y)
  return classifier

In [46]:
def CatBoost(X, y):
  """Fit a CatBoost classifier (silent, seeded from the config) on X, y and return it."""
  params = {"silent": True, "random_seed": config.seed}
  classifier = CatBoostClassifier(**params)
  classifier.fit(X, y)
  return classifier

In [47]:
def train():
  """Load and preprocess the training data, then fit the configured model(s).

  `config.model` picks the model; "ensemble" trains both LightGBM and CatBoost.
  `config.kfold` switches between kfold() and a single fit on the whole set.

  Returns:
    (model_1, model_2, scaler, imputer, features). `model_2` is None unless
    `config.model` is "ensemble"; `features` is None unless
    `config.select_feature` is set.

  Raises:
    ValueError: if `config.model` is not a supported name.
  """
  single_fit = {"lgbm": LGBM, "cbc": CatBoost, "svm": SVM, "knn": KNN}

  # Fail before the expensive load instead of returning (None, None, ...) and
  # crashing later at prediction time.
  if config.model != "ensemble" and config.model not in single_fit:
    raise ValueError(
        f"Unknown config.model {config.model!r}; expected one of "
        f"{sorted(single_fit) + ['ensemble']}")

  # feature_selection() loads its own copy of the training set, so run it
  # before loading ours to avoid holding two copies in memory at once.
  features = None
  if config.select_feature:
    features = feature_selection()
    gc.collect()

  # load dataset
  df_train = load_train_dataset(config.data_dir)

  # preprocess dataset
  train_X, train_y, scaler, imputer = preprocess_train_dataset(df_train, features, config.select_feature)

  # model training
  model_names = ["lgbm", "cbc"] if config.model == "ensemble" else [config.model]
  if config.kfold:
    models = [kfold(train_X, train_y, name) for name in model_names]
  else:
    models = [single_fit[name](train_X, train_y) for name in model_names]

  model_1 = models[0]
  model_2 = models[1] if len(models) > 1 else None

  return model_1, model_2, scaler, imputer, features

In [52]:
def test(model_1, model_2, scaler, imputer, features):
  """Predict on the test set with the trained model(s).

  Args:
    model_1: fitted classifier exposing `predict_proba`.
    model_2: second fitted classifier; only used when `config.model` is
      "ensemble".
    scaler: fitted scaler passed to preprocess_test_data.
    imputer: fitted imputer passed to preprocess_test_data.
    features: selected feature names passed to preprocess_test_data.

  Returns:
    (cust_id, output_model_1, output_model_2): the test customer ids and, per
    model, column 1 of its `predict_proba` output as a float64 Series.
    `output_model_2` is None unless `config.model` is "ensemble".
  """
  # load and preprocess dataset
  raw_test = load_test_dataset(config.data_dir)
  cust_id, test_X = preprocess_test_data(raw_test, imputer, scaler, features, config.select_feature)

  def proba_column_1(model):
    # column 1 of the (n_samples, n_classes) probability matrix
    return pd.DataFrame(model.predict_proba(test_X), dtype='float64')[1]

  output_model_1 = proba_column_1(model_1)
  output_model_2 = proba_column_1(model_2) if config.model == "ensemble" else None

  return cust_id, output_model_1, output_model_2

In [49]:
def write_output(cust_id, output_model_1, output_model_2):
  """Write the prediction CSV(s) to `config.output_dir`.

  For a single model this writes `<config.model>_submission.csv`. For
  "ensemble" it writes `lgbm_submission.csv`, `cbc_submission.csv` and
  `avg_submission.csv`, the last holding the row-wise mean of the two
  predictions.

  Args:
    cust_id: customer ids, in the same order as the predictions.
    output_model_1: predictions of the first model.
    output_model_2: predictions of the second model; only used for "ensemble".
  """
  # to_csv does not create missing directories, so make sure the target exists.
  os.makedirs(config.output_dir, exist_ok=True)

  df_model_1 = pd.DataFrame({'customer_ID': cust_id, 'prediction': output_model_1})

  if config.model != "ensemble":
    df_model_1.to_csv(os.path.join(config.output_dir, f'{config.model}_submission.csv'), index=False)
    return

  df_model_2 = pd.DataFrame({'customer_ID': cust_id, 'prediction': output_model_2})

  # write individual outputs to csv
  df_model_1.to_csv(os.path.join(config.output_dir, 'lgbm_submission.csv'), index=False)
  df_model_2.to_csv(os.path.join(config.output_dir, 'cbc_submission.csv'), index=False)

  # merge both outputs on the customer id and average the two predictions
  combined = df_model_2.merge(df_model_1, on='customer_ID', suffixes=('_cbc', '_lgbm'))
  combined['prediction'] = combined[['prediction_cbc', 'prediction_lgbm']].mean(axis=1)
  combined = combined[["customer_ID", "prediction"]]
  combined.to_csv(os.path.join(config.output_dir, 'avg_submission.csv'), index=False)

### Train and Evaluate

In [50]:
def main():
  """Run the whole pipeline: train, predict on the test set, write the CSV output."""
  # train() returns (model_1, model_2, scaler, imputer, features), which is
  # exactly the argument list of test(); test() returns the three arguments of
  # write_output().
  trained = train()
  predictions = test(*trained)
  write_output(*predictions)

In [None]:
# Run the full pipeline (train -> predict on the test set -> write CSV output)
# using the current values in `config`.
main()