In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **IMPORTS**

In [2]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import ast
from datetime import datetime

# **ALL MODULAR FUNCTIONS**

In [6]:
# 1. Loading and preprocessing the data
def load_and_preprocess_data(file_paths):
    """Read several parquet files into one cleaned DataFrame.

    Args:
        file_paths: Iterable of parquet file paths; each is read with
            ``pd.read_parquet``.

    Returns:
        A single DataFrame with a fresh 0..n-1 index in which the ``address``
        column is removed, ``churn_due_to_fraud`` is cast to ``int`` and the
        ``date`` / ``date_of_birth`` columns are converted with
        ``pd.to_datetime``.
    """
    frames = [pd.read_parquet(path) for path in file_paths]
    combined = pd.concat(frames, ignore_index=True)

    # The address column is discarded before any further processing.
    combined = combined.drop(columns=['address'])

    # Store the fraud-churn flag as an integer.
    combined['churn_due_to_fraud'] = combined['churn_due_to_fraud'].astype(int)

    # Both date-like columns become datetime64 values.
    for date_column in ('date', 'date_of_birth'):
        combined[date_column] = pd.to_datetime(combined[date_column])

    return combined

# who are the churners?

def create_target_variable(df):
    """Add a binary ``churn`` column to ``df`` (in place) and return it.

    Every row starts with ``churn = 0``. Among the rows dated before
    2023-01-01, the row holding each customer's latest ``date`` is then
    flagged with ``churn = 1``.

    Args:
        df: DataFrame with ``customer_id`` and datetime ``date`` columns.

    Returns:
        The same DataFrame object, now carrying the ``churn`` column.
    """
    df['churn'] = 0

    # Only interactions strictly before the cutoff can be labelled.
    is_before_cutoff = df['date'] < '2023-01-01'
    last_interaction_idx = (
        df[is_before_cutoff]
        .groupby('customer_id')['date']
        .idxmax()
    )

    df.loc[last_interaction_idx, 'churn'] = 1
    return df


# some features
def _parse_touchpoints(value):
    """Return one row's touchpoints as a plain list (empty when missing)."""
    if isinstance(value, str):
        # Unchanged contract: a malformed string literal raises here.
        value = ast.literal_eval(value)
    if isinstance(value, np.ndarray):
        # NOTE(review): parquet list columns may arrive as NumPy arrays rather
        # than Python lists — confirm against the actual data.
        return value.ravel().tolist()
    if isinstance(value, (list, tuple)):
        return list(value)
    return []


def _parse_csat(value):
    """Return one row's CSAT mapping as a dict, or None if missing/unparseable."""
    if isinstance(value, str):
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError):
            return None
    return value if isinstance(value, dict) else None


def feature_engineering(df):
    """Derive time, volume, touchpoint and CSAT features.

    The new columns are written onto ``df`` itself; the nested
    ``touchpoints`` and ``csat_scores`` columns are then dropped from the
    returned frame.

    Args:
        df: DataFrame with ``customer_id``, datetime ``date`` and
            ``date_of_birth``, the four ``*_volume`` columns, and the nested
            ``touchpoints`` / ``csat_scores`` columns (each either a Python
            literal string or an already-parsed container).

    Returns:
        DataFrame with the engineered columns and without the two nested
        columns. ``touchpoint_*`` and ``csat_*`` columns are emitted in sorted
        order so that separate calls (e.g. train and test) produce the same
        column order for the same set of names.

    Raises:
        ValueError, SyntaxError: if a ``touchpoints`` string is not a valid
            Python literal.
    """
    # Time-based features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['quarter'] = df['date'].dt.quarter
    df['year'] = df['date'].dt.year
    df['days_since_start'] = (df['date'] - df.groupby('customer_id')['date'].transform('min')).dt.days
    df['days_since_last_activity'] = (df.groupby('customer_id')['date'].transform('max') - df['date']).dt.days

    age_years = (df['date'] - df['date_of_birth']).dt.days / 365.25
    if age_years.notna().all():
        df['age'] = age_years.astype(int)
    else:
        # astype(int) raises on NaN; keep missing ages as NaN (float column)
        # while truncating the known ones exactly as the integer cast would.
        df['age'] = np.trunc(age_years)

    # Interaction features
    df['total_transfer_volume'] = df['bank_transfer_in_volume'] + df['bank_transfer_out_volume']
    df['crypto_transfer_volume'] = df['crypto_in_volume'] + df['crypto_out_volume']
    # +1 on both sides avoids division by zero.
    df['volume_ratio'] = (df['crypto_transfer_volume'] + 1) / (df['total_transfer_volume'] + 1)

    # Touchpoint features: parse each row once, then count occurrences per type
    # (e.g. touchpoint_email = 2, touchpoint_sms = 1 for a single row).
    touchpoint_lists = [_parse_touchpoints(value) for value in df['touchpoints']]
    unique_touchpoints = set()
    for row_touchpoints in touchpoint_lists:
        unique_touchpoints.update(row_touchpoints)
    # Sorting (by string form, to tolerate mixed types) makes column order
    # deterministic instead of depending on set iteration order.
    for touchpoint in sorted(unique_touchpoints, key=str):
        df[f'touchpoint_{touchpoint}'] = [row.count(touchpoint) for row in touchpoint_lists]

    # CSAT features: parse each row once, then expose one column per channel.
    csat_dicts = [_parse_csat(value) for value in df['csat_scores']]
    unique_channels = set()
    for row_scores in csat_dicts:
        if row_scores is not None:
            unique_channels.update(row_scores.keys())
    for channel in sorted(unique_channels, key=str):
        df[f'csat_{channel}'] = [
            row_scores.get(channel, np.nan) if row_scores is not None else np.nan
            for row_scores in csat_dicts
        ]

    # Drop nested columns
    df = df.drop(columns=['touchpoints', 'csat_scores'])

    return df


# preprocessing pipeline
def get_preprocessing_pipeline(numerical_features, categorical_features):
    """Build the (unfitted) column-wise preprocessing transformer.

    Args:
        numerical_features: Column names routed through median imputation
            followed by standard scaling.
        categorical_features: Column names routed through most-frequent
            imputation followed by one-hot encoding (unknown categories are
            ignored at transform time).

    Returns:
        A ``ColumnTransformer`` applying the two pipelines; every column not
        listed in either group is passed through unchanged.
    """
    numeric_steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
    categorical_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]

    return ColumnTransformer(
        transformers=[
            ('numerical', Pipeline(numeric_steps), numerical_features),
            ('categorical', Pipeline(categorical_steps), categorical_features),
        ],
        remainder='passthrough',
    )


# Model factory: builds the XGBoost classifier from caller-supplied hyperparameters
def get_model(params):
    """Create an XGBoost binary classifier from a hyperparameter dict.

    ``objective='binary:logistic'`` and ``eval_metric='logloss'`` are applied
    as defaults; entries in ``params`` take precedence, so supplying either
    key no longer raises a duplicate-keyword ``TypeError``.

    Args:
        params: Mapping of ``XGBClassifier`` keyword arguments (``None`` is
            treated as empty).

    Returns:
        An unfitted ``xgb.XGBClassifier``.
    """
    settings = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
    settings.update(params or {})
    # `use_label_encoder` is intentionally not passed: it is obsolete in the
    # XGBoost releases that accept the `device` parameter used by this file.
    return xgb.XGBClassifier(**settings)




#2. Model Training and Evaluation
def train_models(df, preprocessor, numerical_features, categorical_features):
    """Fit the preprocessor and an XGBoost classifier on pre-2023 rows.

    Rows are ordered by ``date`` and split chronologically (no shuffling):
    the first 80% train the model and the last 20% are the early-stopping
    validation set. The preprocessor is fitted on the training part only, so
    validation rows do not influence imputation or scaling statistics.

    Args:
        df: Feature-engineered DataFrame containing ``churn``, ``date`` and
            the identifier columns dropped below.
        preprocessor: Unfitted transformer exposing ``fit_transform`` and
            ``transform``; it is fitted in place.
        numerical_features: Unused here; kept for interface compatibility.
        categorical_features: Unused here; kept for interface compatibility.

    Returns:
        A 3-tuple ``(model_all, None, None)``. Only the first element carries
        a value; the other two are explicit placeholders that keep the tuple
        shape without reading a leaked global ``_``.
    """
    df_train = df[df['date'].dt.year < 2023]
    df_train = df_train.sort_values(by='date')
    features = df_train.drop(columns=['churn', 'customer_id', 'name', 'date', 'date_of_birth', 'Id'])
    target = df_train['churn']

    # Chronological hold-out: split the raw frame first ...
    X_train_raw, X_val_raw, y_train_all, y_val_all = train_test_split(
        features, target, test_size=0.2, random_state=42, shuffle=False
    )

    # ... then fit the preprocessing on the training part only.
    X_train_all = preprocessor.fit_transform(X_train_raw)
    X_val_all = preprocessor.transform(X_val_raw)

    xgb_params = {
        'n_estimators': 500,
        'learning_rate': 0.05,
        'max_depth': 5,
        # NOTE(review): presumably requires a CUDA-capable runtime — confirm.
        'device': 'cuda',
        'tree_method': 'hist',
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': 42,
        'early_stopping_rounds': 10
    }

    model_all = get_model(xgb_params)

    model_all.fit(X_train_all, y_train_all, eval_set=[(X_val_all, y_val_all)], verbose=50)

    return model_all, None, None

def predict_with_orchestration(df, preprocessor, model_all):
    """Return the positive-class probability for every row of ``df``.

    Args:
        df: DataFrame containing the feature columns plus the identifier
            columns dropped below.
        preprocessor: Fitted transformer exposing ``transform``.
        model_all: Fitted classifier exposing ``predict_proba``, which returns
            an ``(n_samples, 2)`` array of ``(prob(0), prob(1))``.

    Returns:
        1-D float NumPy array with one probability per row, in row order.
    """
    X_test = df.drop(columns=['customer_id', 'name', 'date', 'date_of_birth', 'Id'])
    X_test = preprocessor.transform(X_test)

    # One batched call instead of one call per row; this also removes the
    # dependence on the DataFrame index being 0..n-1.
    probabilities = np.asarray(model_all.predict_proba(X_test), dtype=float)
    preds = probabilities[:, 1]

    return preds


def evaluate_model(y_true, y_pred):
    """Return the log loss of the predictions ``y_pred`` against ``y_true``."""
    return log_loss(y_true, y_pred)


# 3. Prediction and Submission
def create_submission_file(test_df, predictions, file_path):
    """Write a two-column (``Id``, ``churn``) CSV submission file.

    Args:
        test_df: DataFrame providing the ``Id`` column.
        predictions: Churn predictions, one per row of ``test_df``.
        file_path: Destination path of the CSV; the index is not written.
    """
    submission_columns = {'Id': test_df['Id'], 'churn': predictions}
    pd.DataFrame(submission_columns).to_csv(file_path, index=False)


# --- Main Execution ---
# --- Main Execution ---
# Loads the yearly training files and the test file, labels and engineers
# features, trains the classifier and writes `submission.csv`.
if __name__ == "__main__":
    data_dir = '/kaggle/input/neo-bank-non-sub-churn-prediction'

    # One parquet file per year, 2008 through 2023 inclusive.
    train_file_paths = [f'{data_dir}/train_{year}.parquet' for year in range(2008, 2024)]
    test_file_path = f'{data_dir}/test.parquet'

    # Load Data
    train_df = load_and_preprocess_data(train_file_paths)
    test_df = load_and_preprocess_data([test_file_path])

    # Target Variable
    train_df = create_target_variable(train_df)

    # Feature Engineering
    train_df = feature_engineering(train_df)
    test_df = feature_engineering(test_df)

    # Define Numerical and Categorical features
    numerical_features = ['interest_rate', 'atm_transfer_in', 'atm_transfer_out', 'bank_transfer_in',
                          'bank_transfer_out', 'crypto_in', 'crypto_out', 'bank_transfer_in_volume',
                          'bank_transfer_out_volume', 'crypto_in_volume', 'crypto_out_volume', 'complaints',
                          'tenure', 'days_since_start', 'days_since_last_activity', 'age', 'total_transfer_volume',
                          'crypto_transfer_volume', 'volume_ratio']

    categorical_features = ['country', 'from_competitor', 'job', 'day_of_week', 'month', 'quarter', 'year']

    # Engineered columns are treated as numerical only when present in the
    # training frame.
    numerical_features.extend(
        f'touchpoint_{touchpoint}'
        for touchpoint in ['app', 'email', 'phone', 'chat']
        if f'touchpoint_{touchpoint}' in train_df.columns
    )
    numerical_features.extend(
        f'csat_{channel}'
        for channel in ['email', 'phone', 'app', 'chat']
        if f'csat_{channel}' in train_df.columns
    )

    # Create Preprocessing pipeline
    preprocessor = get_preprocessing_pipeline(numerical_features, categorical_features)

    # Model Training
    # train_models returns a tuple whose first element is the fitted model;
    # passing the whole tuple on would fail at predict_proba.
    training_result = train_models(train_df, preprocessor, numerical_features, categorical_features)
    model_all = training_result[0] if isinstance(training_result, tuple) else training_result

    # Prediction on test data
    test_predictions = predict_with_orchestration(test_df, preprocessor, model_all)

    # Create Submission
    create_submission_file(test_df, test_predictions, 'submission.csv')

    print("Submission file created.")
