In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, accuracy_score
from imblearn.combine import SMOTETomek
import joblib

In [2]:
# Load the train/test CSVs with identical reader options.
# NOTE(review): on_bad_lines='skip' silently drops malformed rows, so the
# loaded row counts may be smaller than the files on disk.
csv_read_options = {'on_bad_lines': 'skip'}
train_data = pd.read_csv('/content/fraudTrain.csv', **csv_read_options)
test_data = pd.read_csv('/content/fraudTest.csv', **csv_read_options)

In [3]:
# Remove the 'trans_num' column from both splits before modelling.
dropped_column = 'trans_num'
train_data = train_data.drop(dropped_column, axis=1)
test_data = test_data.drop(dropped_column, axis=1)

In [4]:
# Display the training frame after 'trans_num' has been dropped
# (the notebook renders a truncated head/tail view for long frames).
train_data

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,zip,lat,long,city_pop,job,dob,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2.703186e+15,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,28654.0,36.0788,-81.1781,3495.0,"Psychologist, counselling",1988-03-09,1.325376e+09,36.011293,-82.048315,0.0
1,1,2019-01-01 00:00:44,6.304233e+11,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,99160.0,48.8878,-118.2105,149.0,Special educational needs teacher,1978-06-21,1.325376e+09,49.159047,-118.186462,0.0
2,2,2019-01-01 00:00:51,3.885949e+13,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,83252.0,42.1808,-112.2620,4154.0,Nature conservation officer,1962-01-19,1.325376e+09,43.150704,-112.154481,0.0
3,3,2019-01-01 00:01:16,3.534094e+15,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.00,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,59632.0,46.2306,-112.1138,1939.0,Patent attorney,1967-01-12,1.325376e+09,47.034331,-112.561071,0.0
4,4,2019-01-01 00:03:06,3.755342e+14,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,24433.0,38.4207,-79.4629,99.0,Dance movement psychotherapist,1986-03-28,1.325376e+09,38.674999,-78.632459,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19471,19471,2019-01-12 20:36:38,3.564839e+15,fraud_Altenwerth-Kilback,home,18.44,Robert,Ashley,M,1250 Christopher Prairie Suite 016,...,94569.0,38.0460,-122.1866,198.0,Armed forces training and education officer,1959-03-31,1.326401e+09,38.076435,-122.601640,0.0
19472,19472,2019-01-12 20:36:55,4.836999e+15,"fraud_Mosciski, Ziemann and Farrell",shopping_net,9.33,Susan,Hardy,F,516 Brown Parks,...,49854.0,46.0062,-86.2555,6469.0,Trade mark attorney,1979-04-12,1.326401e+09,46.011639,-85.711983,0.0
19473,19473,2019-01-12 20:37:04,3.050162e+13,fraud_Stark-Koss,home,21.82,Amanda,Smith,F,180 Graves Shore,...,29939.0,32.6786,-81.2455,302.0,Magazine features editor,1973-05-04,1.326401e+09,33.048939,-80.887348,0.0
19474,19474,2019-01-12 20:38:55,5.456713e+15,fraud_Gerhold LLC,home,4.52,Pamela,Matthews,F,36659 Smith Club Apt. 080,...,87543.0,36.1486,-105.6648,247.0,Architectural technologist,1961-10-24,1.326401e+09,36.567927,-105.635082,0.0


In [5]:
# List the remaining column names; 'is_fraud' is the target used below.
train_data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'unix_time', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')

In [6]:
# Replace the transaction timestamp strings with a numeric value so the column
# is picked up as a numeric feature later on.
# Presumably the int64 view is nanoseconds since the epoch, making the result
# seconds -- confirm against the installed pandas version.
EPOCH_DIVISOR = 10**9

train_timestamps = pd.to_datetime(train_data['trans_date_trans_time'])
test_timestamps = pd.to_datetime(test_data['trans_date_trans_time'])

train_data['trans_date_trans_time'] = train_timestamps.astype('int64') / EPOCH_DIVISOR
test_data['trans_date_trans_time'] = test_timestamps.astype('int64') / EPOCH_DIVISOR

In [7]:
# Remove exact duplicate rows from the training frame only; the test frame is
# left untouched.
train_data = train_data.drop_duplicates()

In [8]:
# Split each frame into features (X) and the 'is_fraud' target (y).
#
# Rows whose label is missing are dropped instead of being filled with the
# most frequent class: imputing the target invents labels (effectively marking
# unlabeled transactions as the majority class) and, on the test split, scores
# the model against fabricated ground truth.
train_labeled = train_data.dropna(subset=['is_fraud'])
test_labeled = test_data.dropna(subset=['is_fraud'])

X_train = train_labeled.drop('is_fraud', axis=1)
y_train = train_labeled['is_fraud']
X_test = test_labeled.drop('is_fraud', axis=1)
y_test = test_labeled['is_fraud']

In [9]:
# Partition the feature columns by dtype: numeric columns go through the
# numeric pipeline, object-typed columns through the categorical one.
numeric_columns = X_train.select_dtypes(include=[np.number]).columns
object_columns = X_train.select_dtypes(include=[object]).columns

numeric_features = list(numeric_columns)
categorical_features = list(object_columns)

In [10]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

In [11]:
# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode; categories unseen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [12]:
# Route each column group to its own sub-pipeline.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [13]:
# Fit the preprocessor on the training features only, then reuse the fitted
# transform for the test features so no test statistics leak into fitting.
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_test_preprocessed = preprocessor.transform(X_test)

In [14]:
# Draw a reproducible 10% sample of the training frame and split it into
# features and target; missing labels in the sample are filled with the
# sample's own label median.
# NOTE(review): none of these names are referenced by the later cells visible
# here except the transform in the next cell -- confirm whether the sample is
# still needed.
train_data_sample = train_data.sample(frac=0.1, random_state=42)
sample_labels = train_data_sample['is_fraud']

X_train_sample = train_data_sample.drop(columns=['is_fraud'])
y_train_sample = sample_labels.fillna(sample_labels.median())

In [15]:
# Apply the already-fitted preprocessor to the sampled features (no refit).
# NOTE(review): the result is not used by any later cell visible here.
X_train_sample_preprocessed = preprocessor.transform(X_train_sample)

In [16]:
from imblearn.over_sampling import ADASYN

# ADASYN generates synthetic minority-class samples using randomness. Seed it
# with the same value used by the model and the search (42) so the resampled
# training set -- and therefore the reported metrics and saved model -- is
# reproducible across kernel restarts.
adasyn = ADASYN(random_state=42)

In [17]:
# Oversample the minority class on the full preprocessed training matrix.
# NOTE(review): the search below cross-validates on this already-resampled
# data, so its validation folds contain synthetic samples and the CV F1 scores
# are likely optimistic; resampling inside each fold would avoid that.
X_train_resampled, y_train_resampled  = adasyn.fit_resample(X_train_preprocessed, y_train)

In [18]:
# Candidate hyper-parameter values sampled by the randomized search
# (3 x 3 x 3 = 27 possible combinations).
param_dist = dict(
    n_estimators=[100, 200, 300],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.1, 0.2],
)

In [19]:
# Base estimator handed to the hyper-parameter search; seeded for
# reproducibility and evaluated with log-loss during boosting.
xgb = XGBClassifier(
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)

In [20]:
from sklearn.model_selection import RandomizedSearchCV

In [21]:
# Randomized search over param_dist: 20 sampled settings, 3-fold CV, ranked by
# F1, using all available cores.
random_search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=20,
    scoring='f1',
    cv=3,
    random_state=42,
    n_jobs=-1,
)

In [22]:
# Run the hyper-parameter search on the resampled training data.
# Only the fit is guarded, so the "RandomizedSearchCV failed" message is
# accurate, and the exception is re-raised so the notebook shows the full
# traceback instead of continuing silently without a model.
try:
    random_search.fit(X_train_resampled, y_train_resampled)
except Exception as e:
    print(f"RandomizedSearchCV failed: {e}")
    raise

best_model = random_search.best_estimator_

# Evaluate on the held-out (non-resampled) test set. With a rare positive
# class, overall accuracy is dominated by the majority class, so the
# per-class report below is the more informative view.
predictions = best_model.predict(X_test_preprocessed)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

report = classification_report(y_test, predictions)
print("Classification Report:")
print(report)

# Persist the model together with the fitted preprocessor; both are needed to
# score new data. The trailing ';' suppresses the echoed return value.
joblib.dump(best_model, 'credit_card_fraud_model.pkl')
joblib.dump(preprocessor, 'preprocessor.pkl');

Accuracy: 0.9973286756395767
Classification Report:
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     19398
         1.0       0.69      0.43      0.53        68

    accuracy                           1.00     19466
   macro avg       0.84      0.71      0.76     19466
weighted avg       1.00      1.00      1.00     19466

