In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session



DATA LOADING & PLOTTING

In [None]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Rules applied per column:

    * object / pandas string columns are converted to ``category``;
    * NumPy integer columns (signed or unsigned) are cast to the narrowest of
      int8/int16/int32/int64 whose range contains the column's min and max
      (bounds are inclusive); a column that fits none of them is left as is;
    * NumPy float columns are cast to the narrowest of float16/float32 whose
      finite range contains the min and max, otherwise to float64;
    * every other dtype (bool, datetime, existing categoricals, nullable
      extension dtypes, ...) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    allow_float16 : bool, default True
        When False, float columns are never narrowed below float32. float16
        keeps only about three significant decimal digits, so disable it when
        the values must stay precise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Extension dtypes (category, nullable Int64, ...) have no safe
        # min/max based downcast here, so they are skipped.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            candidates, limits_of, fallback = int_types, np.iinfo, None
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of, fallback = float_types, np.finfo, np.float64
        else:
            # bool, datetime64, timedelta64, complex: nothing to downcast.
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        # Pick the first (narrowest) candidate whose inclusive range holds the
        # data. NaN min/max (empty or all-NaN column) fails every comparison
        # and therefore ends up on the fallback.
        target = fallback
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                target = candidate
                break

        if target is not None:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Load the four competition CSV files and downcast each frame right after it
# is read, so the full-width version never stays in memory for long.
file_path = '../input/ieee-fraud-detection'

train_identity = pd.read_csv(f'{file_path}/train_identity.csv')
train_identity = reduce_mem_usage(train_identity)

train_transaction = pd.read_csv(f'{file_path}/train_transaction.csv')
train_transaction = reduce_mem_usage(train_transaction)

test_identity = pd.read_csv(f'{file_path}/test_identity.csv')
# Keep the result bound to test_identity: binding it to train_identity would
# silently replace the training identity table with the test one.
test_identity = reduce_mem_usage(test_identity)

test_transaction = pd.read_csv(f'{file_path}/test_transaction.csv')
test_transaction = reduce_mem_usage(test_transaction)



In [None]:
# Attach identity features to every transaction; a left join keeps the
# transactions that have no identity record.
train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Give the test frame the training column labels (all but the target). This is
# a positional rename, so it relies on both merges yielding the same column
# order. Only the labels are read, which avoids copying the training frame.
test.columns = train.columns.drop('isFraud')

del train_identity
del train_transaction
del test_identity
del test_transaction
gc.collect()

In [None]:
# Start the submission frame from the test transaction IDs; the predicted
# column is attached once the model has been trained.
submission = test[['TransactionID']].copy()
print(submission.shape)

In [None]:
import seaborn as sns
#from matplotlib import pyplot as plt
#plt.figure(figsize=(10,10))

#sns.histplot(data= train,  x="TransactionDT", hue = "isFraud")

In [None]:
#plt.figure(figsize=(10,10))

#sns.histplot(data= train,  x="TransactionAmt",bins = 30, hue = "isFraud", log_scale = True)



Choose the dataset and standardize the data

In [None]:

# Order the training rows by TransactionDT, then stack the train and test
# features so that both go through exactly the same cleaning steps.
train.sort_values(by='TransactionDT', inplace=True)
X_test = test
del test
gc.collect()
X = train.drop(columns='isFraud')
# Number of leading rows in the stacked frame that belong to train/validation.
length_X_train_val = len(X)
X_train_test_combined = pd.concat(
    [frame.drop(columns='TransactionID') for frame in (X, X_test)]
)
# Fraction of missing values per column; columns missing in more than 20% of
# the stacked rows are discarded.
mv = X_train_test_combined.isna().sum() / len(X_train_test_combined)
sparse_columns = mv[mv > 0.2].index
X_train_test_combined = X_train_test_combined.drop(columns=sparse_columns)
print(X_train_test_combined.shape)



In [None]:
from sklearn.impute import SimpleImputer
# Numeric columns: replace NaN with the column median.
X_train_test_num = X_train_test_combined.select_dtypes(include=np.number)
print(X_train_test_num.shape)
imp_median = SimpleImputer(strategy='median', missing_values=np.nan)
X_num_df = pd.DataFrame(
    data=imp_median.fit_transform(X_train_test_num),
    columns=X_train_test_num.columns,
)
del X_train_test_num
print(X_num_df.shape)

# Non-numeric columns: replace NaN with the most frequent value of the column.
X_train_test_cat = X_train_test_combined.select_dtypes(exclude=np.number)
print(X_train_test_cat.shape)
imp_most_freq = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
X_cat_df = pd.DataFrame(
    data=imp_most_freq.fit_transform(X_train_test_cat),
    columns=X_train_test_cat.columns,
)
del X_train_test_cat
print(X_cat_df.shape)
gc.collect()


In [None]:
# Put the imputed numeric and categorical halves back side by side.
X_data_after_cleaning = pd.concat(objs=[X_num_df, X_cat_df], axis='columns')
del X_num_df
del X_cat_df


In [None]:
from sklearn.preprocessing import StandardScaler
# One-hot encode the categorical columns. The dummy dtype is pinned to uint8:
# pandas >= 2.0 produces bool dummies by default, and a float/bool column mix
# makes the `.values` calls below return object arrays instead of numeric ones.
X_data_encoded = pd.get_dummies(X_data_after_cleaning, drop_first=True, dtype=np.uint8)

# Separate the train/validation rows from the test rows; the stacked frame
# holds the training rows first, followed by the test rows.
X_train_val = X_data_encoded.iloc[:length_X_train_val]
X_test_data = X_data_encoded.iloc[length_X_train_val:]
print(X_test_data.shape)

# `train` was sorted by TransactionDT before the features were extracted, so
# the target lines up positionally with X_train_val.
y = train['isFraud']

# Positional hold-out: the first 70% of the rows are used for training and the
# remaining 30% for validation.
splitting_index = int(0.70 * len(X_train_val))
print("splitting index:", splitting_index)
X_train = X_train_val.iloc[:splitting_index].values
X_val = X_train_val.iloc[splitting_index:].values
y_train = y.iloc[:splitting_index].values
y_val = y.iloc[splitting_index:].values
X_test_data = X_test_data.values
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
del y, train
del X_data_encoded


In [None]:

from sklearn.preprocessing import StandardScaler
# Fit the scaler on the training rows only and reuse its statistics for the
# validation and test rows, so no information from them enters the fit.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test_data)

# Class balance of the training target. The top-level pd.value_counts() is
# deprecated in recent pandas, so the Series method is used instead.
pd.Series(y_train).value_counts()

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the minority class of the training split. A fixed random_state
# makes the synthetic samples (and everything trained on them) reproducible.
smote = SMOTE(random_state=0)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)
print(X_train_smote.shape, y_train_smote.shape)
del X_train_scaled, y_train
# Class balance after resampling (Series method; the top-level
# pd.value_counts() is deprecated in recent pandas).
pd.Series(y_train_smote).value_counts()

In [None]:
# Use parameter search for XGBoost
# Disabled: the whole search is wrapped in a string literal, so nothing below
# is executed; the cell only evaluates to that string.
# It describes a RandomizedSearchCV (25 sampled candidates) over tree depth,
# learning rate, row/column subsampling and number of trees, fitted on the
# SMOTE-resampled training data.
# NOTE(review): scoring is 'neg_mean_squared_error' and the result is printed
# as "RMSE" although the estimator is a classifier - confirm the intended
# metric before re-enabling this cell.
"""from sklearn.model_selection import RandomizedSearchCV

import xgboost as xgb
params = { 'max_depth': [3, 5, 6, 10, 15, 20],
           'learning_rate': [0.01, 0.1, 0.2, 0.3],
           'subsample': np.arange(0.5, 1.0, 0.1),
           'colsample_bytree': np.arange(0.4, 1.0, 0.1),
           'colsample_bylevel': np.arange(0.4, 1.0, 0.1),
           'n_estimators': [100, 500, 1000]}
xgbr = xgb.XGBClassifier(seed = 20)
clf = RandomizedSearchCV(estimator=xgbr,
                         param_distributions=params,
                         scoring='neg_mean_squared_error',
                         n_iter=25,
                         verbose=1)
clf.fit(X_train_smote, y_train_smote)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))"""


In [None]:
# Disabled: a hand-configured XGBClassifier (2000 trees, depth 12, learning
# rate 0.02, AUC metric) kept as a string literal, so it is not executed.
"""import xgboost as xgb
clf = xgb.XGBClassifier( 
        n_estimators=2000,
        max_depth=12, 
        learning_rate=0.02, 
        subsample=0.8,
        colsample_bytree=0.4, 
        missing=-1, 
        eval_metric='auc',
        # USE CPU
        nthread=4,
        tree_method='hist' 
        # USE GPU
        #tree_method='gpu_hist' 
    )
clf.fit(X_train_smote, y_train_smote)"""

In [None]:
# Bayesian optimization
import xgboost as xgb
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Hyperopt search space: sampled ranges for the tree-shape and regularisation
# parameters, plus a fixed number of trees and a fixed seed.
space = dict(
    max_depth=hp.quniform("max_depth", 3, 18, 1),
    gamma=hp.uniform('gamma', 1, 9),
    reg_alpha=hp.quniform('reg_alpha', 40, 180, 1),
    reg_lambda=hp.uniform('reg_lambda', 0, 1),
    colsample_bytree=hp.uniform('colsample_bytree', 0.5, 1),
    min_child_weight=hp.quniform('min_child_weight', 0, 10, 1),
    n_estimators=180,
    seed=0,
)

In [None]:
from sklearn.metrics import accuracy_score
# Disabled: the hyperopt objective is kept as a string literal and is not
# executed. As written it would train an XGBClassifier with the sampled
# parameters, score it on the validation split and return the negated
# accuracy as the loss to minimise.
# NOTE(review): before re-enabling, check the following points:
#   * int(space['colsample_bytree']) truncates a value sampled from the
#     0.5-1 range, so the sampled fraction is not what reaches the model;
#   * reg_lambda and seed are part of the search space but are never passed
#     to the classifier;
#   * the variable named y_predproba holds the output of clf.predict(), not of
#     clf.predict_proba() (which the later AUC cell uses).
"""def objective(space):
    clf=xgb.XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( X_train_smote, y_train_smote), ( X_val_scaled, y_val)]
    
    clf.fit(X_train_smote, y_train_smote,
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=100,verbose=False)
    

    y_predproba = clf.predict(X_val_scaled)
    accuracy = accuracy_score(y_val, y_predproba>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }"""

In [None]:
# Disabled: the hyperopt driver (100 evaluations of `objective` over `space`
# with tpe.suggest) is kept as a string literal and is not executed.
"""trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)"""

In [None]:
# Disabled: would print the result of the hyperopt search; kept as a string
# literal, so it is not executed.
"""print("The best hyperparameters are : ","\n")
print(best_hyperparams)"""

In [None]:
# Record of tuned hyperparameters, kept as an inert string literal (presumably
# the output of the disabled hyperopt run - confirm). The XGBClassifier call
# later in the notebook hard-codes rounded versions of these values, except
# reg_lambda, which is not passed there.
"""{'colsample_bytree': 0.5410395070950723, 'gamma': 2.7097670582231093, 'max_depth': #18.0, 'min_child_weight': 2.0, 'reg_alpha': 41.0, 'reg_lambda': 0.4283447596194464}"""

In [None]:
# Final model with hard-coded hyperparameters. eval_metric and
# early_stopping_rounds are set on the estimator itself: passing them to fit()
# is deprecated since XGBoost 1.6 and no longer accepted by XGBoost 2.x.
clf = xgb.XGBClassifier(
    n_estimators=180,
    max_depth=18,
    gamma=2.709767,
    reg_alpha=41,
    min_child_weight=2,
    colsample_bytree=0.541,
    eval_metric="auc",
    early_stopping_rounds=100,
)

# Both splits are monitored; XGBoost uses the last eval_set entry (the
# validation split) to decide when to stop early.
evaluation = [(X_train_smote, y_train_smote), (X_val_scaled, y_val)]

clf.fit(X_train_smote, y_train_smote, eval_set=evaluation, verbose=False)

# predict() returns hard class labels, so they are compared to the target
# directly instead of being thresholded like probabilities.
y_val_labels = clf.predict(X_val_scaled)
accuracy = accuracy_score(y_val, y_val_labels)
print("SCORE:", accuracy)


In [None]:
from sklearn.metrics import roc_auc_score

# Validation AUC from the predicted probability of the positive class
# (second column of predict_proba).
val_proba = clf.predict_proba(X_val_scaled)
y_predproba = val_proba[:, 1]
val_auc = roc_auc_score(y_val, y_predproba)
print(f'Validation AUC={val_auc}')



Prediction Submission 

In [None]:
# Predict the fraud probability (positive-class column) for the test set and
# write the submission file.
y_pred_test = clf.predict_proba(X_test_scaled)[:, 1]
submission['isFraud'] = y_pred_test
print(submission.shape)
submission.to_csv('submission.csv', index=False)
print('Submission is successful!')
# Kept as the last expression of the cell so the preview is actually rendered;
# a bare expression in the middle of a cell displays nothing.
submission.head()
