## Import the dependencies

In [1]:
import pandas as pd #for data manipulation
from sklearn.model_selection import train_test_split #to split data into training and testing sets
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler #for scaling numerical features
from sklearn.model_selection import GridSearchCV #for hyperparameter tuning of the machine learning model
from sklearn.compose import make_column_transformer, make_column_selector #for selecting specific columns from a dataframe
from sklearn.pipeline import Pipeline #for chaining multiple preprocessing steps and a machine learning model

## Data collection and analysis

In [2]:
# Lift the column cap so wide dataframes are rendered without truncation
pd.options.display.max_columns = None

# Read the credit card transactions from the CSV file into a dataframe
card = pd.read_csv(filepath_or_buffer="creditcard.csv")
# Report the (rows, columns) dimensions, then preview the first five rows
print(card.shape)
card.head(n=5)

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
# Per-class averages: one row for each value of "Class",
# holding the mean of every other column within that class
card.groupby(by="Class").mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,0.009824,-0.006576,0.010832,0.000189,0.012064,0.000161,0.007164,0.011535,0.003887,-0.001178,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,-5.676883,3.800173,-6.259393,-0.109334,-6.971723,-0.092929,-4.139946,-6.665836,-2.246308,0.680659,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


### Unbalanced data
#### 0 --> Legit Transactions
#### 1 --> Fraudulent Transactions


In [4]:
# Class balance check:
# count how many rows carry each "Class" label (0 or 1) and print the tally
print(card["Class"].value_counts())

0    284315
1       492
Name: Class, dtype: int64


## Pre-processing

In [5]:
# Partition the rows by label: Class 0 goes to `legit`, Class 1 to `fraud`
legit = card.loc[card["Class"] == 0]
fraud = card.loc[card["Class"] == 1]
# Show the (rows, columns) size of each partition, one per line
print(legit.shape, fraud.shape, sep="\n")

# Undersample the legit rows: draw exactly as many as there are fraud rows
# (seeded), stack them with the fraud rows, then shuffle the result (seeded)
legit_sample = legit.sample(n=fraud.shape[0], random_state=34)
new_data = (
    pd.concat([legit_sample, fraud])
    .sample(frac=1, random_state=2)
)
print(new_data.head(), new_data.shape, sep="\n")

# Tally the rows per class in the resampled dataframe
print(new_data["Class"].value_counts())

(284315, 31)
(492, 31)
            Time         V1        V2         V3         V4         V5  \
204952  135520.0   2.219705 -1.679526  -0.468777  -1.598043  -1.616793   
11343    19762.0 -14.179165  7.421370 -21.405836  11.927512  -7.974281   
150715   93965.0 -11.397727  7.763953 -18.572307   6.711855 -10.174216   
280363  169479.0  -1.874677  2.118053  -0.711324  -0.139199  -0.926492   
58761    48533.0   1.243848  0.524526  -0.538884   1.209196   0.479538   

              V6         V7        V8        V9        V10        V11  \
204952 -0.089089  -1.658064  0.069597 -0.911575   1.695043   0.051015   
11343  -2.202710 -15.471612 -0.356595 -6.380125 -13.348278  10.187587   
150715 -4.395918 -15.893788  2.083013 -4.988837 -15.346099   7.093182   
280363 -0.825773  -0.723654  1.232934  0.543196  -0.861266  -1.006420   
58761  -0.197429   0.049166  0.037792  0.128119  -0.552903  -0.668359   

              V12       V13        V14       V15        V16        V17  \
204952  -0.423192  

## Instantiate preprocessors and Logistic Regression Classifier

In [6]:
# Logistic regression classifier plus a standard scaler for its inputs.
# max_iter is set to 250 and the solver to "saga". Because "saga" shuffles
# the training data internally, random_state is pinned so that repeated runs
# produce the same fitted model and the same cross-validation scores.
clf = LogisticRegression(max_iter=250, solver="saga", random_state=0)
scaler = StandardScaler()

# Select columns by data type: the selector picks the numeric columns of
# whichever dataframe the transformer is fitted on
num_col = make_column_selector(dtype_include="number")
# Use make_column_transformer to apply the scaler to the selected columns
ct = make_column_transformer((scaler, num_col))

# Chain the column transformer and the classifier into one estimator.
# The step names are the prefixes of the tunable hyperparameter names
# (e.g. "classifier__C"); call pipe.get_params() to list all of them.
pipe = Pipeline([("C_transformer", ct), ("classifier", clf)])

## Split the data into features and targets
# `columns=` already identifies the axis, so no separate `axis` argument is needed
X = new_data.drop(columns="Class")
y = new_data["Class"]

## Model training and Evaluation

In [7]:
# Hyperparameter grid for GridSearchCV, written as a list of sub-grids so that
# every candidate only sets the parameters its penalty actually uses:
#   - "l1" / "l2":   regularisation strength C
#   - "elasticnet":  C plus the l1/l2 mixing weight l1_ratio
#   - no penalty:    neither C nor l1_ratio applies, so it is a single candidate
# A single fully crossed grid would refit the same model many times under
# different parameter labels and emit "parameter is ignored" warnings.
params = [
    dict(classifier__penalty=["l1", "l2"],
         classifier__C=[0.001, 0.01, 0.1, 1, 10]),
    dict(classifier__penalty=["elasticnet"],
         classifier__C=[0.001, 0.01, 0.1, 1, 10],
         classifier__l1_ratio=[.1, .25, .4, .65, .80, .95]),
    # None selects the unpenalised model; the string "none" used by older
    # scikit-learn releases was deprecated in 1.2 and later removed.
    # NOTE(review): assumes scikit-learn >= 1.2 - confirm the installed version.
    dict(classifier__penalty=[None]),
]

# Search the grid over the pipeline with 5-fold cross-validation, ranking
# candidates by ROC AUC. refit=True retrains the best candidate on all of
# (X, y) afterwards; n_jobs=-1 runs the fits in parallel on all cores.
grid = GridSearchCV(pipe, params, cv=5, scoring="roc_auc", refit=True, n_jobs=-1)
# Fit the GridSearchCV object to the data
grid.fit(X, y)

# Best mean cross-validated ROC AUC and the parameter setting that achieved it
print(grid.best_score_)
print(grid.best_params_)

0.9784559463859756
{'classifier__C': 0.1, 'classifier__l1_ratio': 0.25, 'classifier__penalty': 'elasticnet'}
