In [124]:
from IPython.display import display

import pandas as pd
import numpy as np

In [125]:
# Configuration: logging verbosity and degree of parallelism for the notebook.
import logging
import sys
from multiprocessing import cpu_count

# Root logger writes everything from DEBUG upwards to stderr.
logging.basicConfig(level=logging.DEBUG, stream=sys.stderr)

# One worker per CPU core reported by the operating system.
n_jobs = cpu_count()

In [126]:
# Load Data

import pandas as pd

# Folder holding the competition CSV files.
DATA_DIR = '/kaggle/input/datathon-cleaned/inter-uni-datathon-2024-nsw'

# train.csv layout: columns 0-22 are the features, column 23 is the IsFraud label.
N_FEATURE_COLUMNS = 23

# Parse train.csv once and split it by column position instead of reading the
# same file twice. `.copy()` gives independent frames so later column
# assignments do not write into (or warn about) a slice of `train`.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
x_train = train.iloc[:, :N_FEATURE_COLUMNS].copy()
y_train = train.iloc[:, [N_FEATURE_COLUMNS]].copy()  # kept as a one-column DataFrame

# The test file gets the same leading feature columns.
x_test = pd.read_csv(f'{DATA_DIR}/test.csv', usecols=range(N_FEATURE_COLUMNS))

# Sample submission, indexed by its first column.
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv', index_col=0)

display(x_train.head())
display(y_train.head())

display(x_test.head())

display(submission.head())

Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,...,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure
0,8765,70,37,Female,Professional,Bachelor,Widowed,3,28884.43,14610.61,...,258.14,M006,Withdrawal,Adelaide,mobile,-31.840233,145.612793,jon44@disposable.com,False,113
1,9645,3386,34,Male,Student,High School,Married,4,54919.07,39169.49,...,34.94,M002,Withdrawal,Canberra,mobile,-37.0201,144.9646,emilyreese@gmail.com,False,104
2,1145,2971,25,Male,Unemployed,Master,Married,2,74728.57,55873.76,...,323.82,M008,Purchase,Brisbane,mobile,-31.840233,145.612793,fordevan@gmail.com,False,105
3,15308,2925,25,Male,Professional,High School,Married,3,55712.62,34963.1256,...,12.6711,M001,Purchase,Darwin,mobile,-37.0201,144.9646,kathleenlewis@tempmail.com,False,70
4,14967,2339,38,Male,Professional,High School,Single,4,53004.7,17004.3978,...,444.8925,M001,Withdrawal,MLB,tablet,-37.0201,144.9646,kristinawhite@gmail.com,False,27


Unnamed: 0,IsFraud
0,1
1,1
2,0
3,1
4,0


Unnamed: 0,TransactionNumber,UserID,Age,Gender,Occupation,EducationLevel,MaritalStatus,NumDependents,Income,Expenditure,...,TransactionAmount,MerchantID,TransactionType,TransactionLocation,DeviceType,Latitude,Longitude,EmailDomain,Terrorism,UserTenure
0,11854,963,35,man,Student,Bachelor,Single,4,53733.41,29296.02,...,225.64,M005,Payment,Darwin,Desktop,-37.0201,144.9646,chapmangabriel@outlook.com,True,39
1,2647,1693,23,Male,Professional,Master,Single,2,54856.77,34628.31,...,658.1,M003,Purchase,Darwin,Desktop,-37.0201,144.9646,sjones@gmail.com,True,65
2,5945,4383,44,Male,Student,Bachelor,Single,2,59011.72,33312.46,...,133.59,M004,Purchase,Adelaide,mob,-30.000233,136.209152,woodmaria@yahoo.com,False,95
3,6798,1350,40,Male,Student,High School,Married,3,128795.4,67049.0,...,6.74,M008,Withdrawal,Canberra,Mobile,-37.0201,144.9646,rthornton@gmail.com,False,85
4,12985,4145,18,Male,Professional,Bachelor,Married,3,44506.03,22856.31,...,15.67,M008,Transfer,Perth,Tablet,-20.917574,142.702789,daniel61@outlook.com,True,102


Unnamed: 0_level_0,IsFraud
TransactionNumber,Unnamed: 1_level_1
1,0
2,0
3,1
4,1
5,0


In [127]:
# Preprocessing: clean the raw columns and encode them as numbers

import pandas as pd
from sklearn.impute import SimpleImputer

def _normalized_text(series):
    """Return ``series`` as stripped, lower-cased strings for case-insensitive lookups."""
    return series.astype(str).str.strip().str.lower()


def _time_of_day_bucket(time_val):
    """Convert a clock-time value into a coarse time-of-day code.

    Both 12-hour strings (e.g. '8:23:18 AM') and 24-hour strings
    (e.g. '17:51:01') are accepted.

    Returns:
        4 for 00:00-05:59 (late night), 1 for 06:00-11:59 (morning),
        2 for 12:00-17:59 (afternoon), 3 for 18:00-23:59 (evening),
        and 0 when the value is missing or matches neither format.
    """
    for fmt in ('%I:%M:%S %p', '%H:%M:%S'):
        try:
            parsed = pd.to_datetime(time_val, format=fmt)
        except (ValueError, TypeError):
            continue
        if pd.isna(parsed):
            # Missing input parses to NaT/None, which carries no usable hour.
            continue
        hour = parsed.hour
        if hour < 6:
            return 4  # Late Night
        if hour < 12:
            return 1  # Morning
        if hour < 18:
            return 2  # Afternoon
        return 3  # Evening
    return 0  # Unparseable / missing time


def _date_to_int(date_str):
    """Turn a 'YYYY-MM-DD' string into the integer YYYYMMDD; anything else becomes 0."""
    try:
        return int(date_str.replace('-', ''))
    except (ValueError, AttributeError):
        # ValueError: not purely digits after stripping dashes.
        # AttributeError: non-string input such as NaN/None.
        return 0


def preprocess(df):
    """Clean the raw transaction features and encode every column numerically.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: DataFrame containing at least the columns Gender, Occupation,
            EducationLevel, MaritalStatus, Age, DeviceType,
            TransactionLocation, TransactionTime, TransactionType,
            MerchantID, Terrorism, TransactionDate and EmailDomain.
            Any further columns must already be numeric.

    Returns:
        A DataFrame with the same columns, a fresh RangeIndex and float64
        values. Remaining missing values are replaced by the column mean
        (or 0.0 when a column has no observed value at all).

    Raises:
        KeyError: if one of the required columns is absent.
        ValueError: if a column is still non-numeric after encoding.
    """
    # New frame with a clean positional index: the caller's data is never
    # mutated and the result is indexed 0..n-1.
    df = df.reset_index(drop=True)

    # Gender: fold the known spelling variants into Female=0 / Male=1.
    # Matching is case-insensitive; unknown values stay NaN and are imputed below.
    gender_codes = {
        'female': 0, 'she': 0, 'fem': 0, 'woman': 0, 'isnotmale': 0,
        'male': 1, 'he': 1, 'man': 1, 'isnotfemale': 1,
    }
    df['Gender'] = _normalized_text(df['Gender']).map(gender_codes)

    # Ordinal codes for the remaining demographic categories.
    df['Occupation'] = df['Occupation'].map({'Professional': 0, 'Student': 1, 'Unemployed': 2, 'Retired': 3})
    df['EducationLevel'] = df['EducationLevel'].map({'Bachelor': 0, 'High School': 1, 'Master': 2, 'PhD': 3})
    df['MaritalStatus'] = df['MaritalStatus'].map({'Widowed': 0, 'Married': 1, 'Single': 2, 'Divorced': 3})

    # Age: absolute value, then keep the first two digits of the integer part
    # (e.g. -37 -> 37, 340 -> 34). Missing or non-numeric ages stay NaN for
    # imputation instead of crashing the integer conversion.
    age = pd.to_numeric(df['Age'], errors='coerce').abs()
    leading_digits = age.dropna().astype('int64').astype(str).str[:2].astype(float)
    df['Age'] = leading_digits.reindex(df.index)

    # DeviceType: Mobile=1, Desktop=2, Tablet=3, unknown/missing=0.
    # The lookup is case-insensitive so that 'mobile', 'Mobile' and 'MOBILE'
    # all land in the same bucket.
    device_codes = {
        'mobile': 1, 'mob': 1, 'galaxys7': 1, 'iphone 15': 1, 'smartphone': 1, 'android': 1,
        'desktop': 2, 'laptop': 2,
        'tablet': 3, 'ipad': 3,
    }
    df['DeviceType'] = _normalized_text(df['DeviceType']).map(device_codes).fillna(0)

    # TransactionLocation: expand abbreviations, then encode (unknown -> 0).
    df['TransactionLocation'] = df['TransactionLocation'].replace({
        'Cbr': 'Canberra', 'Melb': 'Melbourne', 'Mel': 'Melbourne', 'MLB': 'Melbourne', 'BNE': 'Brisbane',
        'Melburn': 'Melbourne', 'adl': 'Adelaide', 'Hbt': 'Hobart', 'SYD': 'Sydney', 'Drw': 'Darwin',
        'Pth': 'Perth', 'HBT': 'Hobart', 'Bne': 'Brisbane', 'DRW': 'Darwin', 'Adl': 'Adelaide',
        'CBR': 'Canberra', 'PTH': 'Perth', 'Syd': 'Sydney'
    })
    location_mapping = {
        'Adelaide': 1, 'Canberra': 2, 'Brisbane': 3, 'Darwin': 4, 'Melbourne': 5,
        'Sydney': 6, 'Hobart': 7, 'Adelaide Sydney': 8, 'Perth': 9
    }
    df['TransactionLocation'] = df['TransactionLocation'].map(location_mapping).fillna(0).astype(int)

    # TransactionTime: repair slash-separated times, then bucket by time of day.
    df['TransactionTime'] = df['TransactionTime'].replace({
        r'(\d{2})/(\d{2})/(\d{4})': r'\1:\2:00',  # 04/23/2003 -> 04:23:00
        r'(\d{2})/(\d{2})/(\d{2})': r'\1:\2:\3'   # 04/38/42 -> 04:38:42
    }, regex=True)
    df['TransactionTime'] = df['TransactionTime'].apply(_time_of_day_bucket)

    # TransactionType: ordinal code per type.
    df['TransactionType'] = df['TransactionType'].map({'Payment': 0, 'Purchase': 1, 'Withdrawal': 2, 'Transfer': 3})

    # MerchantID: keep the numeric part of the identifier ('M006' -> 6).
    # IDs without digits become NaN and are imputed below.
    merchant_digits = df['MerchantID'].astype(str).str.extract(r'(\d+)', expand=False)
    df['MerchantID'] = pd.to_numeric(merchant_digits, errors='coerce')

    # Terrorism flag: True -> 1, False -> 0.
    df['Terrorism'] = df['Terrorism'].map({True: 1, False: 0})

    # TransactionDate: 'YYYY-MM-DD' -> integer YYYYMMDD, invalid/missing -> 0.
    df['TransactionDate'] = df['TransactionDate'].apply(_date_to_int)

    # EmailDomain: keep everything from '@' onwards and encode known domains;
    # any other domain (or a missing address) becomes 0.
    email_domain_mapping = {
        '@gmail.com': 1, '@outlook.com': 2, '@yahoo.com': 3,
        '@securemail.com': 4, '@tempmail.com': 5, '@disposable.com': 6
    }
    domains = df['EmailDomain'].astype(str).str.extract(r'(@.*)', expand=False)
    df['EmailDomain'] = domains.map(email_domain_mapping).fillna(0)

    # Mean-impute whatever is still missing. Done with pandas so that a column
    # with no observed values is kept (filled with 0.0) rather than dropped,
    # which would break the column labels.
    numeric = df.astype('float64')
    return numeric.fillna(numeric.mean()).fillna(0.0)

# Encode both feature frames.
# NOTE: x_train / x_test are rebound to their processed versions here, so
# re-running this cell without re-running the load cell would feed already
# encoded data back into preprocess().
x_train = preprocess(x_train)
x_test = preprocess(x_test)

# Show the first rows with the notebook's rich table rendering, as the
# load cell does.
display(x_train.head())
display(x_test.head())

# Persist the processed features for reuse outside the notebook.
x_train.to_csv('x_train_processed.csv', index=False)
x_test.to_csv('x_test_processed.csv', index=False)

   TransactionNumber  UserID   Age  Gender  Occupation  EducationLevel  \
0             8765.0    70.0  37.0     0.0         0.0             0.0   
1             9645.0  3386.0  34.0     1.0         1.0             1.0   
2             1145.0  2971.0  25.0     1.0         2.0             2.0   
3            15308.0  2925.0  25.0     1.0         0.0             1.0   
4            14967.0  2339.0  38.0     1.0         0.0             1.0   

   MaritalStatus  NumDependents    Income  Expenditure  ...  \
0            0.0            3.0  28884.43   14610.6100  ...   
1            1.0            4.0  54919.07   39169.4900  ...   
2            1.0            2.0  74728.57   55873.7600  ...   
3            1.0            3.0  55712.62   34963.1256  ...   
4            2.0            4.0  53004.70   17004.3978  ...   

   TransactionAmount  MerchantID  TransactionType  TransactionLocation  \
0           258.1400         6.0              2.0                  1.0   
1            34.9400        

In [128]:
# # PCA - not suitable: accuracy was lower than on the original features
# from sklearn.decomposition import PCA
# from sklearn.preprocessing import StandardScaler
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import accuracy_score

# # Step 1: Standardize the data
# scaler = StandardScaler()
# x_train_scaled = scaler.fit_transform(x_train)

# # Step 2: Apply PCA
# pca = PCA(n_components=0.95)  # Retain 95% variance
# x_train_pca = pca.fit_transform(x_train_scaled)

# # Step 3: Train a classifier (Random Forest) on original and PCA-reduced data
# # Split the data into training and validation sets for both versions
# x_train_orig, x_val_orig, y_train_orig, y_val_orig = train_test_split(x_train, y_train, test_size=0.2, random_state=42)
# x_train_pca_split, x_val_pca, y_train_pca, y_val_pca = train_test_split(x_train_pca, y_train, test_size=0.2, random_state=42)

# # Random Forest on original data
# clf_orig = RandomForestClassifier(random_state=42)
# clf_orig.fit(x_train_orig, y_train_orig)
# y_pred_orig = clf_orig.predict(x_val_orig)
# accuracy_orig = accuracy_score(y_val_orig, y_pred_orig)

# # Random Forest on PCA-reduced data
# clf_pca = RandomForestClassifier(random_state=42)
# clf_pca.fit(x_train_pca_split, y_train_pca)
# y_pred_pca = clf_pca.predict(x_val_pca)
# accuracy_pca = accuracy_score(y_val_pca, y_pred_pca)

# print(f"Accuracy on original data: {accuracy_orig:.4f}")
# print(f"Accuracy on PCA-reduced data: {accuracy_pca:.4f}")


In [129]:
# # Random Forest test - grid search for the best hyperparameters


# import pandas as pd
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, log_loss, roc_auc_score, f1_score

# def eval_model(model, x_train, y_train, x_val, y_val):
#     model.fit(x_train, y_train.values.ravel())  # Ensure y_train is 1D
#     y_val_pred = model.predict(x_val)
    
#     # Convert y_val to Series if it's a DataFrame
#     if isinstance(y_val, pd.DataFrame):
#         y_val = y_val.iloc[:, 0]
    
#     # Calculate metrics
#     auc = roc_auc_score(y_val, y_val_pred, multi_class='ovr') if len(y_val.unique()) > 2 else roc_auc_score(y_val, y_val_pred)
#     f1 = f1_score(y_val, y_val_pred, average='weighted')
#     ll = log_loss(y_val, model.predict_proba(x_val))
    
#     return ll, auc, f1

# def random_forest_grid_search(x_train, y_train, x_test, test_size=0.2, random_state=42):
#     # Split the training data into training and validation sets
#     x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
#         x_train, y_train, test_size=test_size, random_state=random_state
#     )

#     # Convert y_train_split and y_val_split to 1D if they are DataFrames
#     if isinstance(y_train_split, pd.DataFrame):
#         y_train_split = y_train_split.iloc[:, 0]
#     if isinstance(y_val_split, pd.DataFrame):
#         y_val_split = y_val_split.iloc[:, 0]
    
#     # Define the parameter grid
#     param_grid = {
#         'n_estimators': [50, 200, 500],
#         'max_depth': [1, 3, 6, 10]
#     }

#     # Create GridSearchCV object
#     grid_search = GridSearchCV(
#         estimator=RandomForestClassifier(random_state=43),
#         param_grid=param_grid,
#         cv=3,
#         scoring='f1_weighted',
#         n_jobs=-1,
#         verbose=2
#     )

#     # Fit the model on the training data
#     grid_search.fit(x_train_split, y_train_split.values.ravel())  # Ensure y_train_split is 1D
    
#     # Print the best parameters and best score
#     print(f"Best parameters found: {grid_search.best_params_}")
#     print(f"Best cross-validation score: {grid_search.best_score_}")

#     # Test the model on the validation set
#     best_model = grid_search.best_estimator_
#     ll, auc, f1 = eval_model(best_model, x_train_split, y_train_split, x_val_split, y_val_split)

#     print(f"Validation Log Loss: {ll:.4f}")
#     print(f"Validation AUC: {auc:.4f}")
#     print(f"Validation F1 Score: {f1:.4f}")

#     # Generate predictions on the test data
#     y_test_pred = best_model.predict(x_test)
#     y_test_pred_df = pd.DataFrame(y_test_pred, index=x_test.index, columns=['Prediction'])

#     # Print a classification report on test data predictions
#     if 'y_test' in globals():
#         print("Classification Report on Test Data:\n", classification_report(y_val_split, y_test_pred))
#     else:
#         print("y_test is not provided for classification report.")

#     # Return predictions for further use
#     return y_test_pred_df

# y_test_pred_df = random_forest_grid_search(x_train, y_train, x_test)

# print("Test Predictions:\n", y_test_pred_df)


In [130]:
# # Using n_estimators=200, max_depth=10

# import pandas as pd
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import classification_report, log_loss, roc_auc_score, f1_score

# def eval_model(model, x_train, y_train, x_val, y_val):
#     model.fit(x_train, y_train.values.ravel())  # Ensure y_train is 1D
#     y_val_pred = model.predict(x_val)
    
#     # Convert y_val to Series if it's a DataFrame
#     if isinstance(y_val, pd.DataFrame):
#         y_val = y_val.iloc[:, 0]
    
#     # Calculate metrics
#     auc = roc_auc_score(y_val, y_val_pred, multi_class='ovr') if len(y_val.unique()) > 2 else roc_auc_score(y_val, y_val_pred)
#     f1 = f1_score(y_val, y_val_pred, average='weighted')
#     ll = log_loss(y_val, model.predict_proba(x_val))
    
#     return ll, auc, f1

# def random_forest_evaluation(x_train, y_train, x_test, test_size=0.2, random_state=42):
#     # Separate the TransactionNumber for later use
#     transaction_numbers = x_test['TransactionNumber']
    
#     # Drop TransactionNumber for training and prediction
#     x_train_clean = x_train.drop(columns=['TransactionNumber'], errors='ignore')
#     x_test_clean = x_test.drop(columns=['TransactionNumber'], errors='ignore')

#     # Split the training data into training and validation sets
#     x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
#         x_train_clean, y_train, test_size=test_size, random_state=random_state
#     )

#     # Convert y_train_split and y_val_split to 1D if they are DataFrames
#     if isinstance(y_train_split, pd.DataFrame):
#         y_train_split = y_train_split.iloc[:, 0]
#     if isinstance(y_val_split, pd.DataFrame):
#         y_val_split = y_val_split.iloc[:, 0]
    
#     model = RandomForestClassifier(n_estimators=200, max_depth=10, random_state=43, min_samples_split=10)
    
#     # Fit the model on the training data
#     model.fit(x_train_split, y_train_split.values.ravel())  # Ensure y_train_split is 1D
    
#     # Evaluate the model on the validation set
#     ll, auc, f1 = eval_model(model, x_train_split, y_train_split, x_val_split, y_val_split)

#     print(f"Validation Log Loss: {ll:.4f}")
#     print(f"Validation AUC: {auc:.4f}")
#     print(f"Validation F1 Score: {f1:.4f}")

#     # Generate predictions on the test data
#     y_test_pred = model.predict(x_test_clean)

#     # Include TransactionNumber in the output and ensure it is an integer
#     y_test_pred_df = pd.DataFrame({
#         'TransactionNumber': transaction_numbers.astype(int),  # Convert to integer
#         'IsFraud': y_test_pred
#     })

#     y_test_pred_df.to_csv('predictions.csv', index=False)

#     return y_test_pred_df

# y_test_pred_df = random_forest_evaluation(x_train, y_train, x_test)

# print("Test Predictions:\n", y_test_pred_df.head())

In [131]:
# XGBoost implementation (best-performing implementation)

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, log_loss, roc_auc_score, f1_score

def xgboost_evaluation(x_train, y_train, x_test, test_size=0.2, random_state=42,
                       output_path='predday2.csv'):
    """Train an XGBoost classifier, report validation metrics and predict the test set.

    Args:
        x_train: Processed training features (DataFrame). A 'TransactionNumber'
            column, if present, is excluded from the model inputs.
        y_train: Binary labels as a one-column DataFrame, a Series or a 1-D
            array. For a DataFrame only the first column is used.
        x_test: Processed test features; must contain 'TransactionNumber',
            which is carried into the output as the row identifier.
        test_size: Fraction of the training rows held out for validation.
        random_state: Seed used for both the split and the model.
        output_path: CSV file the predictions are written to. Pass None to
            skip writing. Defaults to the original 'predday2.csv'.

    Returns:
        DataFrame with columns 'TransactionNumber' (int) and 'IsFraud'
        (predicted class) for every row of ``x_test``.

    Raises:
        KeyError: if ``x_test`` has no 'TransactionNumber' column.
    """
    id_column = 'TransactionNumber'

    # Keep the identifiers for the output before removing them from the features.
    transaction_numbers = x_test[id_column]

    # The identifier is not a model input.
    x_train_clean = x_train.drop(columns=[id_column], errors='ignore')
    x_test_clean = x_test.drop(columns=[id_column], errors='ignore')

    # Flatten the labels to a 1-D array once, so DataFrame, Series and plain
    # array inputs are all handled the same way downstream.
    labels = y_train.iloc[:, 0] if isinstance(y_train, pd.DataFrame) else y_train
    labels = np.asarray(labels).ravel()

    # Hold out part of the training data for validation.
    x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
        x_train_clean, labels, test_size=test_size, random_state=random_state
    )

    # Create the XGBoost model
    model = xgb.XGBClassifier(
        objective='binary:logistic',
        eval_metric='logloss',
        use_label_encoder=False,
        n_estimators=1000,
        max_depth=10,
        learning_rate=0.1,
        random_state=random_state
    )

    # Fit on the training portion only.
    model.fit(x_train_split, y_train_split)

    # Validation metrics: AUC and log loss use the positive-class probability,
    # F1 uses the predicted class labels.
    y_val_pred = model.predict(x_val_split)
    y_val_pred_proba = model.predict_proba(x_val_split)[:, 1]
    auc = roc_auc_score(y_val_split, y_val_pred_proba)
    f1 = f1_score(y_val_split, y_val_pred, average='weighted')
    ll = log_loss(y_val_split, y_val_pred_proba)

    print(f"Validation Log Loss: {ll:.4f}")
    print(f"Validation AUC: {auc:.4f}")
    print(f"Validation F1 Score: {f1:.4f}")

    # Predict classes for the test rows.
    y_test_pred = model.predict(x_test_clean)

    # Pair each prediction with its transaction identifier (as an integer).
    y_test_pred_df = pd.DataFrame({
        'TransactionNumber': transaction_numbers.astype(int),
        'IsFraud': y_test_pred
    })

    # Save predictions to a CSV file unless the caller opted out.
    if output_path is not None:
        y_test_pred_df.to_csv(output_path, index=False)

    return y_test_pred_df

# Train, validate and predict with the default split/seed settings.
y_test_pred_df = xgboost_evaluation(x_train=x_train, y_train=y_train, x_test=x_test)

# Preview the first predictions.
print("Test Predictions:\n", y_test_pred_df.head())

In [132]:
# # Hyperparameter tuning with XGBoost

# from sklearn.model_selection import GridSearchCV
# import xgboost as xgb

# def xgboost_hyperparameter_tuning(x_train, y_train, x_test, test_size=0.2, random_state=42):
#     # Split the training data into training and validation sets
#     x_train_split, x_val_split, y_train_split, y_val_split = train_test_split(
#         x_train, y_train, test_size=test_size, random_state=random_state
#     )

#     # Define the parameter grid
#     param_grid = {
#         'n_estimators': [100, 300, 500],
#         'max_depth': [6, 10, 15],
#         'learning_rate': [0.01, 0.1, 0.2],
#         'subsample': [0.8, 1.0],
#         'colsample_bytree': [0.8, 1.0]
#     }

#     # Create the XGBoost model
#     model = xgb.XGBClassifier(
#         objective='binary:logistic',
#         eval_metric='logloss',
#         use_label_encoder=False,
#         random_state=random_state
#     )
    
#     # Create GridSearchCV object
#     grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    
#     # Fit the model on the training data
#     grid_search.fit(x_train_split, y_train_split.values.ravel())  # Ensure y_train_split is 1D
    
#     # Print the best parameters and best score
#     print(f"Best parameters found: {grid_search.best_params_}")
#     print(f"Best cross-validation score: {grid_search.best_score_}")

#     # Use the best model to make predictions on the test data
#     best_model = grid_search.best_estimator_
#     y_test_pred = best_model.predict(x_test)
    
#     # Include TransactionNumber in the output and ensure it is an integer
#     transaction_numbers = x_test['TransactionNumber'].astype(int)
#     y_test_pred_df = pd.DataFrame({
#         'TransactionNumber': transaction_numbers,
#         'IsFraud': y_test_pred
#     })
    
#     # Save predictions to a CSV file
#     y_test_pred_df.to_csv('xgboost_optimized_predictions.csv', index=False)

#     return y_test_pred_df


# y_test_pred_df = xgboost_hyperparameter_tuning(x_train, y_train, x_test)

# print("Test Predictions:\n", y_test_pred_df.head())