# Pipelines

# Importing necessary libaries

In [241]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier

In [278]:
df=pd.read_csv('lux.csv')
df.drop(['Transaction_ID','Customer_ID','Store_ID','Product_SKU'],axis='columns',inplace=True)
df.head()

Unnamed: 0,Transaction_Date,Transaction_Time,Customer_Age,Customer_Loyalty_Tier,Location,Product_Category,Purchase_Amount,Payment_Method,Device_Type,IP_Address,Fraud_Flag,Footfall_Count
0,2025-07-27,04:04:15,56.0,Silver,San Francisco,Concealer,158.24,Mobile Payment,Desktop,239.249.58.237,0,333
1,2025-03-14,20:23:23,46.0,Platinum,Zurich,Lipstick,86.03,Credit Card,Tablet,84.49.227.90,0,406
2,2025-02-20,12:36:02,32.0,Silver,Milan,Mascara,255.69,Gift Card,Desktop,79.207.35.55,0,96
3,2025-04-25,19:09:43,60.0,Bronze,London,Serum,282.76,Gift Card,Mobile,176.194.167.253,0,186
4,2025-04-17,14:23:23,,Platinum,Miami,Serum,205.86,Gift Card,Mobile,166.31.46.111,0,179


In [237]:
df['Transaction_Date']=pd.to_datetime(df['Transaction_Date'])
df['isweekend']=df['Transaction_Date'].dt.dayofweek.apply(lambda x:1 if x>=5 else 0)
df['isweekend']

0       1
1       0
2       0
3       0
4       0
       ..
2128    1
2129    1
2130    0
2131    0
2132    1
Name: isweekend, Length: 2133, dtype: int64

In [238]:
df['hour']=pd.to_datetime(df['Transaction_Time'],format='%H:%M:%S').dt.hour
df['hour']

0        4
1       20
2       12
3       19
4       14
        ..
2128    22
2129    13
2130    10
2131     7
2132    21
Name: hour, Length: 2133, dtype: int32

In [279]:
fdf=df.drop(['Transaction_Date','Transaction_Time','IP_Address',],axis='columns')
xtrain,xtest,ytrain,ytest=train_test_split(fdf.drop('Fraud_Flag',axis='columns'),fdf['Fraud_Flag'],test_size=0.5,random_state=10)
len(ytrain[ytrain==1])

33

## Filling null values

t1=ColumnTransformer([
    ( 'num-fill',SimpleImputer(),[0] ),
    ( 'cat-fill',SimpleImputer(strategy='most_frequent'),[1,5] )
],remainder='passthrough')

xt1=t1.fit_transform(xtrain)
pd.DataFrame(xt1).head()

## OneHotEncoder - removes the dummy variables

In [281]:
t2=ColumnTransformer([
    ('ohe',OneHotEncoder(sparse_output=False,handle_unknown='ignore'),[1,2,3,4,6])
],remainder='passthrough')
xt2=t2.fit_transform(xt1)
pd.DataFrame(xt2).head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,36,37,38,39,40,41,42,43,44,45
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,62.0,268.31,99
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,54.0,247.8,99
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,28.0,190.38,62
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,45.0,259.38,259
4,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,52.0,160.63,224


## Scaling the data - get the values in the range between 0 - 1

In [282]:
t3=MinMaxScaler()
xt3=t3.fit_transform(xt2)
xt3

array([[0.        , 0.        , 0.        , ..., 0.93617021, 0.87370277,
        0.10888889],
       [0.        , 0.        , 0.        , ..., 0.76595745, 0.79152142,
        0.10888889],
       [0.        , 0.        , 1.        , ..., 0.21276596, 0.56144569,
        0.02666667],
       ...,
       [0.        , 0.        , 0.        , ..., 0.95744681, 0.97591858,
        0.53333333],
       [0.        , 0.        , 1.        , ..., 0.63829787, 0.37845094,
        0.51333333],
       [1.        , 0.        , 0.        , ..., 0.14893617, 0.82249469,
        0.44222222]])

## Principal Component Analysis(PCA) - use the columns that give 90% of the information

In [317]:
t4=PCA(0.90)
t4.fit_transform(xt3)
print(t4.explained_variance_ratio_)
sum(t4.explained_variance_ratio_)

[0.0791494  0.06656863 0.06081842 0.06011231 0.05820743 0.05488205
 0.05258603 0.04994531 0.02670244 0.026301   0.02578085 0.02522954
 0.02480987 0.0229541  0.02207055 0.02169211 0.021377   0.02046449
 0.01905402 0.01817063 0.0178971  0.01321675 0.01297757 0.01244078
 0.01231361 0.01216929 0.01193218 0.01187384 0.01166179 0.01153776
 0.01147517 0.01140669]


np.float64(0.9077787091867615)

## Logistic Regression Model

In [318]:
t5=LogisticRegression(max_iter=1000)

## Implementing the pipeline with all transformers

In [298]:
pipe=Pipeline([
    ('na-fillers',t1),
    ('ohe',t2),
    ('scalers',t3),
    ('pca-selectors',t4),
    ('model',t5)
])

## PIPELINE EXPLORATION

In [286]:
pipe.fit(xtrain,ytrain)

In [287]:
ypred=pipe.predict(xtest)
ypred

array([0, 0, 0, ..., 0, 0, 0])

In [319]:
pipe.score(xtest,ytest)

0.9690721649484536