# CREDIT CARD FRAUD DETECTION
## To Do:
### Build a model to detect fraudulent credit card transactions. Use a dataset containing information about credit card transactions, and experiment with algorithms like Logistic Regression, Decision Trees, or Random Forests to classify transactions as fraudulent or legitimate.

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [2]:
#define path
Test_path = pd.read_csv("C:/Users/CHARAN/Downloads/Credit Card Transactions Fraud Detection Dataset/fraudTrain.csv")
Train_path = pd.read_csv("C:/Users/CHARAN/Downloads/Credit Card Transactions Fraud Detection Dataset/fraudTest.csv")

In [3]:
#by changing frac value you can reduce the Test data size
Test_sample = Test_path.sample(frac = 1)
Train_sample = Train_path.sample(frac = 1)

In [4]:
# check details
Test_sample.head()
Test_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1296675 entries, 213906 to 61455
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long           

In [5]:
#define non numeric columns
non_numeric_columns = Train_sample.select_dtypes(include=['object']).columns
non_numeric_columns

Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'],
      dtype='object')

In [6]:
# Create a copy of Test_sample
Train_sample_num = Train_sample.copy()
Test_sample_num = Test_sample.copy()

In [7]:
#drop all object columns
Train_sample_num = Train_sample_num.select_dtypes(exclude=['object'])
Test_sample_num = Test_sample_num.select_dtypes(exclude=['object'])

In [8]:
Train_sample_num.head()
Test_sample_num.head()

Unnamed: 0.1,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long,is_fraud
213906,213906,4874017206859125,130.75,97034,45.4093,-122.6847,42817,1334841714,44.817806,-123.427253,0
232018,232018,373905417449658,56.38,76665,31.929,-97.6443,2526,1335529002,31.377478,-97.162615,0
1105033,1105033,4671727014157745,151.02,46702,40.8618,-85.6067,2304,1365135654,41.743377,-85.049163,0
1135147,1135147,4792627764422477317,100.81,66018,38.9462,-94.9714,5760,1366262377,39.072818,-95.879294,0
872109,872109,38199021865320,70.96,3601,43.196,-72.3001,477,1356032427,42.792485,-71.743069,0


In [9]:
#Distribution of No Fraud vs Fraud
Train_sample_num["is_fraud"].value_counts()

0    553574
1      2145
Name: is_fraud, dtype: int64

In [10]:
#This dataset is Higly Unbalanced
#Separationg data for analysis
legit = Train_sample_num[Train_sample.is_fraud == 0]
fraud = Train_sample_num[Train_sample.is_fraud == 1]

In [11]:
print (legit.shape)
print (fraud.shape)

(553574, 11)
(2145, 11)


In [12]:
#stratistical measures for data
legit.amt.describe()
fraud.amt.describe()

count    2145.000000
mean      528.356494
std       392.747594
min         1.780000
25%       214.510000
50%       371.940000
75%       907.770000
max      1320.920000
Name: amt, dtype: float64

In [13]:
#compare the value for both transfaction
Train_sample_num.groupby('is_fraud').mean()

Unnamed: 0_level_0,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,277997.72599,4.179643e+17,67.614408,48846.424285,38.541406,-90.230495,88313.692365,1380683000.0,38.540954,-90.230474
1,242057.088112,3.854274e+17,528.356494,47862.900233,39.019971,-90.445556,64529.321678,1379587000.0,39.018618,-90.465222


In [14]:
#Under Sampling and create new dataset
print ("fraudalant tranfaction :"+str(fraud.shape[0]))

fraudalant tranfaction :2145


In [15]:
#randomly select legimate transaction data equal to fraudalant data present in dataset
legit_sample = legit.sample(n=fraud.shape[0])

In [16]:
#concatenating two dataset

New_train_dataset = pd.concat([legit_sample,fraud], axis = 0)

New_train_dataset.head()

New_train_dataset["is_fraud"].value_counts()

New_train_dataset.select_dtypes(include=['number']).groupby('is_fraud').mean()

Unnamed: 0_level_0,Unnamed: 0,cc_num,amt,zip,lat,long,city_pop,unix_time,merch_lat,merch_long
is_fraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,282557.201399,4.473403e+17,79.457054,49839.053613,38.717813,-90.599997,84072.37669,1380829000.0,38.709127,-90.601849
1,242057.088112,3.854274e+17,528.356494,47862.900233,39.019971,-90.445556,64529.321678,1379587000.0,39.018618,-90.465222


In [17]:
X_train= New_train_dataset.drop(columns = 'is_fraud', axis = 1)
Y_train = New_train_dataset['is_fraud']

X_test = Test_sample_num.drop(columns = 'is_fraud', axis = 1)
Y_test = Test_sample_num['is_fraud']

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

Decision_prediction = model.predict(X_test)

Accuracy=accuracy_score(Y_test, Decision_prediction)
Con_matrix=confusion_matrix(Y_test, Decision_prediction)
report=classification_report(Y_test, Decision_prediction)
print("Accuracy of test data for Decision Tree Regressor : ",Accuracy )
print("Confution Matrix Score : ",Con_matrix)
print("Report : ",report)

Accuracy of test data for Decision Tree Regressor :  0.3507363063219388
Confution Matrix Score :  [[451545 837624]
 [  4260   3246]]
Report :                precision    recall  f1-score   support

           0       0.99      0.35      0.52   1289169
           1       0.00      0.43      0.01      7506

    accuracy                           0.35   1296675
   macro avg       0.50      0.39      0.26   1296675
weighted avg       0.98      0.35      0.51   1296675



In [18]:
# Create Text File with Report and Accuracy
result_path = 'C:/Users/CHARAN/Downloads/Credit Card Transactions Fraud Detection Dataset/Decision Tree Regressor_Report.txt'
with open(result_path,'w') as f:
    f.write("Classification Report:\n")
    f.write(str(report))
    f.write("\n")
    f.write(str(Accuracy))

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Initialize the Logistic Regression model
model = LogisticRegression()

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the test data
Logistic_prediction = model.predict(X_test)

# Calculate accuracy
Accuracy = accuracy_score(Y_test, Logistic_prediction)

# Calculate confusion matrix
Con_matrix = confusion_matrix(Y_test, Logistic_prediction)

# Generate classification report
report = classification_report(Y_test, Logistic_prediction)

# Print the results
print("Accuracy of test data for Logistic Regression: ", Accuracy)
print("Confusion Matrix: ", Con_matrix)
print("Classification Report: ", report)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Accuracy of test data for Logistic Regression:  0.9942113482561166
Confusion Matrix:  [[1289169       0]
 [   7506       0]]
Classification Report:                precision    recall  f1-score   support

           0       0.99      1.00      1.00   1289169
           1       0.00      0.00      0.00      7506

    accuracy                           0.99   1296675
   macro avg       0.50      0.50      0.50   1296675
weighted avg       0.99      0.99      0.99   1296675



  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
# Create Text File with Report and Accuracy
result_path = 'C:/Users/CHARAN/Downloads/Credit Card Transactions Fraud Detection Dataset/Logistic Regression_Report.txt'
with open(result_path,'w') as f:
    f.write("Classification Report:\n")
    f.write(str(report))
    f.write("\n")
    f.write(str(Accuracy))

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Splitting features and target variable for training and testing datasets
X_train = New_train_dataset.drop(columns='is_fraud', axis=1)
Y_train = New_train_dataset['is_fraud']

X_test = Test_sample_num.drop(columns='is_fraud', axis=1)
Y_test = Test_sample_num['is_fraud']

# Initialize the Random Forest model
model = RandomForestClassifier()

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the test data
RandomForest_prediction = model.predict(X_test)

# Calculate accuracy
Accuracy = accuracy_score(Y_test, RandomForest_prediction)

# Calculate confusion matrix
Con_matrix = confusion_matrix(Y_test, RandomForest_prediction)

# Generate classification report
report = classification_report(Y_test, RandomForest_prediction)

# Print the results
print("Accuracy of test data for Random Forest: ", Accuracy)
print("Confusion Matrix: ", Con_matrix)
print("Classification Report: ", report)


Accuracy of test data for Random Forest:  0.9646457284978889
Confusion Matrix:  [[1245987   43182]
 [   2661    4845]]
Classification Report:                precision    recall  f1-score   support

           0       1.00      0.97      0.98   1289169
           1       0.10      0.65      0.17      7506

    accuracy                           0.96   1296675
   macro avg       0.55      0.81      0.58   1296675
weighted avg       0.99      0.96      0.98   1296675



In [22]:
# Create Text File with Report and Accuracy
result_path = 'C:/Users/CHARAN/Downloads/Credit Card Transactions Fraud Detection Dataset/Random Forest_Report.txt'
with open(result_path,'w') as f:
    f.write("Classification Report:\n")
    f.write(str(report))
    f.write("\n")
    f.write(str(Accuracy))