In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

#### Loading the Dataset

In [3]:
credit_card_data = pd.read_csv('creditcard.csv')
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


#### Statistical Measures

In [4]:
# Number of Fraud Transactions and Non-Fraud Transactions
credit_card_data['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [5]:
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [6]:
legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [7]:
fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

#### Under-Sampling (Since the data is highly unbalanced)

In [8]:
legit_sample = legit.sample(n=492)

In [9]:
new_dataset = pd.concat([legit_sample,fraud],axis=0)
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
130456,79357.0,1.032844,-0.147349,1.082795,1.240146,-0.692818,0.29885,-0.520149,0.162095,0.316336,...,0.201554,0.636924,-0.205751,0.063338,0.532856,-0.24967,0.059056,0.035061,65.0,0
132748,80097.0,1.317125,0.351211,-0.039948,0.493385,0.058738,-0.587546,0.083376,-0.162911,0.055334,...,-0.340647,-0.965296,0.02351,-0.489438,0.336211,0.145794,-0.022521,0.022663,0.99,0
54854,46657.0,-1.725472,-0.824792,1.255792,0.464317,-0.445934,0.508098,-0.551182,0.764524,-1.605812,...,0.113244,0.378944,0.214219,-0.294288,-0.072484,-0.154745,0.3093,-0.010515,168.0,0
169701,119815.0,1.903792,-1.009659,-2.762209,-0.350076,0.141054,-1.34873,0.724782,-0.604459,-1.041313,...,-0.009829,0.105206,-0.215717,0.789028,0.426745,0.969525,-0.168843,-0.054586,198.0,0
114946,73685.0,-0.788285,0.941577,0.740152,-0.17836,0.285403,0.881584,-0.041754,0.827194,-0.128724,...,0.017034,0.249993,-0.26029,-1.095404,-0.075816,0.45118,0.242658,0.112069,3.83,0


In [10]:
new_dataset['Class'].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [11]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,92500.50813,0.102984,0.129442,0.062932,-0.044251,0.104381,-0.053103,-0.0041,0.026987,-0.007595,...,0.003838,0.013662,-0.044987,-0.0127,0.014727,0.038577,-0.005192,-0.001347,-0.001814,62.721911
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


#### Splitting the Data into Features and Target

In [12]:
X = new_dataset.drop(columns='Class', axis =1)
Y = new_dataset['Class']

#### Splitting Training Data and Testing Data

In [13]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,stratify=Y,random_state=2)
print(X.shape,X_train.shape,X_test.shape)

(984, 30) (787, 30) (197, 30)


#### Training the Logistic Regression Model with Training Data 

In [14]:
model = LogisticRegression()
model.fit(X_train,Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### Evaluation

In [41]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction,Y_train)

In [42]:
print('Accuracy on Training Data :',training_data_accuracy)

Accuracy on Training Data : 0.9453621346886912


In [44]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction,Y_test)

In [45]:
print('Accuracy on Testing Data :',test_data_accuracy)

Accuracy on Testing Data : 0.9187817258883249


In [46]:
def predict_from_file(file_path):
   
    try:
       
        input_data = pd.read_csv(file_path)
        
        required_columns = X.columns
        if not set(required_columns).issubset(input_data.columns):
            raise ValueError("The uploaded dataset does not have the required columns for prediction.")
        
        input_data = input_data[required_columns]
        
        
        predictions = model.predict(input_data)
        
        input_data['Prediction'] = predictions
        input_data['Prediction'] = input_data['Prediction'].map({0: 'Legitimate', 1: 'Fraud'})
        
        output_file = "predictions.csv"
        input_data.to_csv(output_file, index=False)
        print(f"Predictions saved to {output_file}.")
        
        return input_data
    
    except Exception as e:
        print(f"An error occurred: {e}")

In [47]:
file_path = 'creditcard.csv'
predictions = predict_from_file(file_path)

Predictions saved to predictions.csv.


In [36]:
print(predictions.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 