In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.base import TransformerMixin
import warnings
warnings.filterwarnings("ignore")

In [2]:
creditcard_dataset1 = pd.read_csv('fraudTest(CreditCard).csv')

In [3]:
creditcard_dataset1.shape

(555719, 23)

In [4]:
creditcard_dataset2 = pd.read_csv('fraudTrain(CreditCard).csv')

In [5]:
creditcard_dataset2.shape

(1296675, 23)

In [6]:
creditcard_dataset3 = pd.concat([creditcard_dataset1, creditcard_dataset2], axis=0)

In [7]:
creditcard_dataset3.shape

(1852394, 23)

In [8]:
creditcard_dataset3.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [9]:
creditcard_dataset3.columns = creditcard_dataset3.columns.str.strip()

In [10]:
creditcard_dataset3['is_fraud'].value_counts()

is_fraud
0    1842743
1       9651
Name: count, dtype: int64

In [11]:
if 'trans_date_trans_time' in creditcard_dataset3.columns:
    creditcard_dataset3[['trans_date', 'trans_time']] = creditcard_dataset3['trans_date_trans_time'].str.split(' ', expand=True)

In [12]:
numerical_features = creditcard_dataset3.select_dtypes(include=np.number).columns.tolist()
categorical_features = creditcard_dataset3.select_dtypes(include='object').columns.tolist()

In [13]:
if 'is_fraud' in numerical_features:
    numerical_features.remove('is_fraud')

In [14]:
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=10))  
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [15]:
X = creditcard_dataset3.drop(columns=['is_fraud'],axis=1)
y = creditcard_dataset3['is_fraud']

In [16]:
X[categorical_features] = X[categorical_features].replace({r'-': '', r':': '', r'.':'', r',': ''}, regex=True)
X[categorical_features] = X[categorical_features].astype(str)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

In [18]:
print(X)

         Unnamed: 0 trans_date_trans_time               cc_num merchant  \
0                 0                           2291163933867244            
1                 1                           3573030041201292            
2                 2                           3598215285024754            
3                 3                           3591919803438423            
4                 4                           3526826139003047            
...             ...                   ...                  ...      ...   
1296670     1296670                             30263540414123            
1296671     1296671                           6011149206456997            
1296672     1296672                           3514865930894695            
1296673     1296673                           2720012583106919            
1296674     1296674                        4292902571056973207            

        category     amt first last gender street  ...      long city_pop  \
0                   2.

In [19]:
# Check for problematic columns
for column in X_train.columns:
    unique_types = set(type(value) for value in X_train[column].dropna())
    if len(unique_types) > 1:
        print(f"Column '{column}' has mixed types: {unique_types}")

In [20]:
# Convert problematic columns to strings
for column in X_train.columns:
    if len(set(type(value) for value in X_train[column].dropna())) > 1:
        X_train[column] = X_train[column].astype(str)
        X_test[column] = X_test[column].astype(str)

In [21]:
X

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,trans_date,trans_time
0,0,,2291163933867244,,,2.86,,,,,...,-80.9355,333497,,,,1371816865,33.986391,-81.200714,,
1,1,,3573030041201292,,,29.84,,,,,...,-110.4360,302,,,,1371816873,39.450498,-109.960431,,
2,2,,3598215285024754,,,41.28,,,,,...,-73.5365,34496,,,,1371816893,40.495810,-74.196111,,
3,3,,3591919803438423,,,60.05,,,,,...,-80.8191,54767,,,,1371816915,28.812398,-80.883061,,
4,4,,3526826139003047,,,3.19,,,,,...,-85.0170,1126,,,,1371816917,44.959148,-85.884734,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,1296670,,30263540414123,,,15.56,,,,,...,-112.4777,258,,,,1371816728,36.841266,-111.690765,,
1296671,1296671,,6011149206456997,,,51.70,,,,,...,-77.5101,100,,,,1371816739,38.906881,-78.246528,,
1296672,1296672,,3514865930894695,,,105.93,,,,,...,-105.8189,899,,,,1371816752,33.619513,-105.130529,,
1296673,1296673,,2720012583106919,,,74.90,,,,,...,-102.5411,1126,,,,1371816816,42.788940,-103.241160,,


In [22]:
# Create and fit the pipeline with Gaussian Naive Bayes
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', GaussianNB())
])
# Fit the model
model.fit(X_train, y_train)

test_data_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(y_test, test_data_prediction)
print("Accuarcy on test data:",test_data_accuracy)

Accuarcy on test data: 0.9916540553769081


In [23]:
# Create and fit the pipeline with Logistic Regression
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression())
])
# Fit the model
model.fit(X_train, y_train)

test_data_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(test_data_prediction,y_test)
print("Accuarcy on test data:",test_data_accuracy)

Accuarcy on test data: 0.9943712559764917


In [None]:
# Create and fit the pipeline with svm
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear'))
])
# Fit the model
model.fit(X_train, y_train)

test_data_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(test_data_prediction,y_test)
print("Accuarcy on test data:",test_data_accuracy)

In [None]:
k=5
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('to_dense', DenseTransformer()),  # Ensure the data is dense
    ('classifier', KNeighborsClassifier(n_neighbors=k))
])
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

test_data_prediction=model.predict(X_test)
test_data_accuracy=accuracy_score(test_data_prediction,y_test)
print("Accuarcy on test data:",test_data_accuracy)

In [None]:
input_data = (0, "21-06-2020", "12:14", "2.29116E+15", "fraud_Kirlin and Sons", "personal_care", 2.86, "Jeff", "Elliott", "M", "351 Darlene Green", "Columbia", "SC", "29209", 33.9659, -80.9355, 333497, "Mechanical engineer", "19-03-1968", "2da90c7d74bd46a0caf3777415b3ebd3", 1371816865, 33.986391, -81.200714, 0)
input_data_as_numpy_array=np.asarray(input_data)
input_data_reshaped=input_data_as_numpy_array.reshape(1,-1)
prediction=model.predict(input_data_reshaped)
print(prediction)
if(prediction[0]==1):
    print("Transaction is fraudlent")
else:
    print("Transaction is legitimate")