In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# CSV files read further down are reachable through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Folder on the mounted Drive that holds the four input CSV files. Keeping it
# in one constant means a relocated dataset needs a single edit instead of four.
DATA_DIR = '/content/drive/MyDrive/Colab/Fraud Detection (1)'

# Transaction and identity tables for the train and test splits; they are
# joined on TransactionID in the next cell.
train_transaction = pd.read_csv(f'{DATA_DIR}/train_transaction.csv')
train_identity = pd.read_csv(f'{DATA_DIR}/train_identity.csv')
test_transaction = pd.read_csv(f'{DATA_DIR}/test_transaction.csv')
test_identity = pd.read_csv(f'{DATA_DIR}/test_identity.csv')

In [None]:
# Left-join the identity table onto the transaction table for each split, so
# every transaction row is kept even when it has no matching identity row.
train_df = train_transaction.merge(train_identity, how='left', on='TransactionID')
test_df = test_transaction.merge(test_identity, how='left', on='TransactionID')

# The four source frames are no longer needed once merged; drop the names and
# ask the garbage collector to reclaim the memory.
del train_transaction, train_identity, test_transaction, test_identity
gc.collect()

178

In [None]:
# Preview the first rows of the merged training frame as the cell's output.
train_df.head()

In [None]:
# The arithmetic below treats TransactionDT as a count of seconds.
SECONDS_PER_HOUR = 60 * 60
SECONDS_PER_DAY = 24 * SECONDS_PER_HOUR

# Apply the same derived features to the train and test frames, in place.
for frame in (train_df, test_df):
    # day & hour: whole days elapsed, and the hour-of-day bucket (0-23)
    elapsed = frame['TransactionDT']
    frame['Transaction_day'] = elapsed // SECONDS_PER_DAY
    frame['Transaction_hour'] = (elapsed // SECONDS_PER_HOUR) % 24

    # domain: keep only the text after the last '.' of each e-mail domain
    for email_col in ('P_emaildomain', 'R_emaildomain'):
        frame[email_col] = frame[email_col].str.split('.').str[-1]

In [None]:
# Replace every missing value in both frames with the sentinel -999, in place.
for frame in (train_df, test_df):
    frame.fillna(-999, inplace=True)

In [None]:
cat_cols = ['ProductCD', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain'] + [f'card{i}' for i in range(1, 7)]

# Label encoding for categorical columns.
# One encoder per column is fitted on the union of the train and test values,
# so a given category maps to the same integer code in both frames.
for col in cat_cols:
    # Cast to str so the encoder sees one comparable type even where the
    # column holds the -999 fill value alongside text.
    train_values = train_df[col].astype(str)
    test_values = test_df[col].astype(str)

    le = LabelEncoder()
    le.fit(pd.concat([train_values, test_values]).unique())

    train_df[col] = le.transform(train_values)
    # Encode the test frame with the same fitted encoder. Without this step
    # test_df keeps its raw values while train_df holds integer codes, and the
    # model would be scored on features it was never trained on.
    test_df[col] = le.transform(test_values)

del le
gc.collect()

35

In [None]:
# Drop non features
X = train_df.drop(['isFraud', 'TransactionID'], axis=1)
y = train_df['isFraud']

del train_df

X_test = test_df.drop(['TransactionID'], axis=1)

common_cols = list(set(X.columns) & set(X_test.columns))

X = X[common_cols]
X_test = X_test[common_cols]

object_cols = X.select_dtypes(include=['object']).columns.tolist()
for col in object_cols:
    X[col] = pd.to_numeric(X[col], errors='coerce')
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')

X.fillna(-999, inplace=True)
X_test.fillna(-999, inplace=True)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()

del X
del y

gc.collect()

0

In [None]:
# Both datasets are created with free_raw_data=True.
dataset_kwargs = dict(free_raw_data=True)

# The validation set references the training set.
train_data = lgb.Dataset(X_train, label=y_train, **dataset_kwargs)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, **dataset_kwargs)

# Drop the notebook-level names for the feature frames and collect garbage.
del X_train, X_val
gc.collect()

0

In [None]:
# Binary-classification GBDT evaluated by AUC; is_unbalance asks LightGBM to
# compensate for the skew between the two classes.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'auc',
    'is_unbalance': True,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'device': 'cpu'
}

# Number of consecutive rounds without validation improvement after which
# training stops.
EARLY_STOPPING_ROUNDS = 50

# The validation set was previously only monitored: all 1000 rounds always
# ran, and lgb_model.best_iteration (used by the prediction cell) never
# pointed at a real best round. The early-stopping callback halts training
# once the validation metric stalls and records the best iteration.
lgb_model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=1000,
    categorical_feature=cat_cols,
    callbacks=[lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS)]
)

In [None]:
# Predictions
for col in X_test.select_dtypes(include=['object']).columns:
    X_test[col] = pd.to_numeric(X_test[col], errors='coerce')
    # Replace inf and -inf with large values before filling NaN
    X_test[col] = X_test[col].replace([np.inf, -np.inf], 999999)
    # Fill NaN and convert to int
    X_test[col] = X_test[col].fillna(-999).astype(int)

test_predictions = lgb_model.predict(X_test, num_iteration=lgb_model.best_iteration)

In [None]:
# Two-column submission table: the test TransactionID next to its predicted
# isFraud score, written to submission.csv without the index column.
submission = test_df[['TransactionID']].assign(isFraud=test_predictions)

submission.to_csv('submission.csv', index=False)