In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("../data/raw/data.csv")

In [3]:
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13T09:54:09Z,2,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13T09:54:25Z,2,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13T09:54:35Z,2,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13T10:01:10Z,2,0


# 1: Create Aggregate Features
Aggregate the transaction data by customer (CustomerId) to get meaningful statistics:

In [4]:
# Total Transaction Amount per Customer
df_total_amount = df.groupby('CustomerId')['Amount'].sum().reset_index()
df_total_amount.columns = ['CustomerId', 'TotalTransactionAmount']

In [5]:
# Average Transaction Amount per Customer
df_avg_amount = df.groupby('CustomerId')['Amount'].mean().reset_index()
df_avg_amount.columns = ['CustomerId', 'AvgTransactionAmount']

In [6]:
# Transaction Count per Customer
df_transaction_count = df.groupby('CustomerId')['TransactionId'].count().reset_index()
df_transaction_count.columns = ['CustomerId', 'TransactionCount']

In [7]:
# Standard Deviation of Transaction Amounts per Customer
df_std_amount = df.groupby('CustomerId')['Amount'].std().reset_index()
df_std_amount.columns = ['CustomerId', 'StdTransactionAmount']

In [8]:
# Merge these aggregate features back into the original dataframe (if needed)
df = df.merge(df_total_amount, on='CustomerId').merge(df_avg_amount, on='CustomerId')\
                  .merge(df_transaction_count, on='CustomerId').merge(df_std_amount, on='CustomerId')

In [9]:
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0,109921.75,923.712185,119,3042.294251
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0,109921.75,923.712185,119,3042.294251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0,1000.00,500.000000,2,0.000000
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0,228727.20,6019.136842,38,17169.241610
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0,228727.20,6019.136842,38,17169.241610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000,2019-02-13T09:54:09Z,2,0,2438140.00,4255.043630,573,22554.029939
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2019-02-13T09:54:25Z,2,0,58499.60,1360.455814,43,2274.756582
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2019-02-13T09:54:35Z,2,0,58499.60,1360.455814,43,2274.756582
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000,2019-02-13T10:01:10Z,2,0,851985.00,1625.925573,524,3207.920536


## 2: Extract Features
Extract useful time-based features from the TransactionStartTime column.

In [10]:
# Convert 'TransactionStartTime' to datetime if it's not already
df['TransactionStartTime'] = pd.to_datetime(df['TransactionStartTime'])

In [11]:
# Extract Hour, Day, Month, and Year
df['TransactionHour'] = df['TransactionStartTime'].dt.hour
df['TransactionDay'] = df['TransactionStartTime'].dt.day
df['TransactionMonth'] = df['TransactionStartTime'].dt.month
df['TransactionYear'] = df['TransactionStartTime'].dt.year

In [12]:
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,PricingStrategy,FraudResult,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,2,0,1000.00,500.000000,2,0.000000,2,15,11,2018
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,2,0,228727.20,6019.136842,38,17169.241610,3,15,11,2018
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,228727.20,6019.136842,38,17169.241610,3,15,11,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,2438140.00,4255.043630,573,22554.029939,9,13,2,2019
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,...,2,0,58499.60,1360.455814,43,2274.756582,9,13,2,2019
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,58499.60,1360.455814,43,2274.756582,9,13,2,2019
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,...,2,0,851985.00,1625.925573,524,3207.920536,10,13,2,2019


## 3: Encode Categorical Variables

In [13]:
# One-Hot Encoding for categorical variables with binary (0 and 1) output
df = pd.get_dummies(df, columns=['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId'], dtype=int)

In [14]:
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,...,0,0,0,0,0,0,0,0,1,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,-20.0,20,2018-11-15 02:19:08+00:00,2,0,...,0,0,0,0,0,0,0,1,0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,500.0,500,2018-11-15 02:44:21+00:00,2,0,...,0,0,0,0,0,0,0,0,1,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,...,0,0,0,0,0,1,0,0,1,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,-644.0,644,2018-11-15 03:34:21+00:00,2,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,-1000.0,1000,2019-02-13 09:54:09+00:00,2,0,...,0,0,0,0,0,0,0,1,0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,1000.0,1000,2019-02-13 09:54:25+00:00,2,0,...,0,0,0,0,0,0,0,0,1,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,-20.0,20,2019-02-13 09:54:35+00:00,2,0,...,0,0,0,0,0,0,0,1,0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,3000.0,3000,2019-02-13 10:01:10+00:00,2,0,...,0,0,0,0,1,0,0,0,1,0


In [15]:
# List of columns to convert
columns_to_convert = ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId']

# Loop through each column, convert to string, extract numeric part, and convert to float
for col in columns_to_convert:
    # Ensure the column is treated as strings
    df[col] = df[col].astype(str)  # Convert to string if not already
    # Extract the numeric part and convert to float
    df[col + '_numeric'] = df[col].str.extract('(\d+)', expand=False).astype(float)

# Display the updated DataFrame
df

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,TransactionId_numeric,BatchId_numeric,AccountId_numeric,SubscriptionId_numeric,CustomerId_numeric
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,1000.0,1000,2018-11-15 02:18:49+00:00,2,0,...,0,0,0,1,0,76871.0,36123.0,3957.0,887.0,4406.0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,-20.0,20,2018-11-15 02:19:08+00:00,2,0,...,0,0,1,0,0,73770.0,15642.0,4841.0,3829.0,4406.0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,500.0,500,2018-11-15 02:44:21+00:00,2,0,...,0,0,0,1,0,26203.0,53941.0,4229.0,222.0,4683.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,20000.0,21800,2018-11-15 03:32:55+00:00,2,0,...,1,0,0,1,0,380.0,102363.0,648.0,2185.0,988.0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,-644.0,644,2018-11-15 03:34:21+00:00,2,0,...,0,0,1,0,0,28195.0,38780.0,4841.0,3829.0,988.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,-1000.0,1000,2019-02-13 09:54:09+00:00,2,0,...,0,0,1,0,0,89881.0,96668.0,4841.0,3829.0,3078.0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,1000.0,1000,2019-02-13 09:54:25+00:00,2,0,...,0,0,0,1,0,91597.0,3503.0,3439.0,2643.0,3874.0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,-20.0,20,2019-02-13 09:54:35+00:00,2,0,...,0,0,1,0,0,82501.0,118602.0,4841.0,3829.0,3874.0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,3000.0,3000,2019-02-13 10:01:10+00:00,2,0,...,0,0,0,1,0,136354.0,70924.0,1346.0,652.0,1709.0


In [16]:
df.columns

Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy',
       'FraudResult', 'TotalTransactionAmount', 'AvgTransactionAmount',
       'TransactionCount', 'StdTransactionAmount', 'TransactionHour',
       'TransactionDay', 'TransactionMonth', 'TransactionYear',
       'CurrencyCode_UGX', 'CountryCode_256', 'ProviderId_ProviderId_1',
       'ProviderId_ProviderId_2', 'ProviderId_ProviderId_3',
       'ProviderId_ProviderId_4', 'ProviderId_ProviderId_5',
       'ProviderId_ProviderId_6', 'ProductId_ProductId_1',
       'ProductId_ProductId_10', 'ProductId_ProductId_11',
       'ProductId_ProductId_12', 'ProductId_ProductId_13',
       'ProductId_ProductId_14', 'ProductId_ProductId_15',
       'ProductId_ProductId_16', 'ProductId_ProductId_19',
       'ProductId_ProductId_2', 'ProductId_ProductId_20',
       'ProductId_ProductId_21', 'ProductId_ProductId_22',
       'ProductId_ProductId_23', 'Prod

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 67 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   TransactionId                       95662 non-null  object             
 1   BatchId                             95662 non-null  object             
 2   AccountId                           95662 non-null  object             
 3   SubscriptionId                      95662 non-null  object             
 4   CustomerId                          95662 non-null  object             
 5   Amount                              95662 non-null  float64            
 6   Value                               95662 non-null  int64              
 7   TransactionStartTime                95662 non-null  datetime64[ns, UTC]
 8   PricingStrategy                     95662 non-null  int64              
 9   FraudResult                         956

In [18]:
# Identify int32 columns
int32_columns = df.select_dtypes(include=['int32']).columns

# Convert int32 columns to int64
df[int32_columns] = df[int32_columns].astype('int64')

# Save the modified dataset
df.to_csv('../data/processed/int64_converted_data_file.csv', index=False)

# Display the data types after conversion
df.dtypes

TransactionId              object
BatchId                    object
AccountId                  object
SubscriptionId             object
CustomerId                 object
                           ...   
TransactionId_numeric     float64
BatchId_numeric           float64
AccountId_numeric         float64
SubscriptionId_numeric    float64
CustomerId_numeric        float64
Length: 67, dtype: object

In [19]:
# Identify columns with 'object' data type
object_columns = df.select_dtypes(include=['object']).columns

# Convert object columns to numeric where applicable
for col in object_columns:
    try:
        df[col] = pd.to_numeric(df[col], errors='coerce')
    except Exception as e:
        print(f"Could not convert {col}: {e}")

# Display the data types after conversion
print(df.dtypes)

# Save the updated dataset
df.to_csv('../data/processed/numeric_converted_data_file.csv', index=False)

TransactionId             float64
BatchId                   float64
AccountId                 float64
SubscriptionId            float64
CustomerId                float64
                           ...   
TransactionId_numeric     float64
BatchId_numeric           float64
AccountId_numeric         float64
SubscriptionId_numeric    float64
CustomerId_numeric        float64
Length: 67, dtype: object


## 5: Normalize/Standardize Numerical Features

In [20]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler


# List of numerical columns to be scaled
numerical_cols = ['Amount', 'Value', 'TotalTransactionAmount', 'AvgTransactionAmount', 'TransactionCount', 'StdTransactionAmount']

# 1. Normalization (Min-Max Scaling) to range [0, 1]
min_max_scaler = MinMaxScaler()

# Apply MinMaxScaler to the numerical columns
df_normalized = df.copy()
df_normalized[numerical_cols] = min_max_scaler.fit_transform(df[numerical_cols])

# Save the normalized dataset
df_normalized.to_csv('../data/processed/normalized_data_file.csv', index=False)

# 2. Standardization (Z-score scaling)
standard_scaler = StandardScaler()

# Apply StandardScaler to the numerical columns
df_standardized = df.copy()
df_standardized[numerical_cols] = standard_scaler.fit_transform(df[numerical_cols])

# Save the standardized dataset
df_standardized.to_csv('../data/processed/standardized_data_file.csv', index=False)

# Display the first few rows of the standardized dataframe
df_standardized

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,TransactionId_numeric,BatchId_numeric,AccountId_numeric,SubscriptionId_numeric,CustomerId_numeric
0,,,,,,-0.046371,-0.072291,2018-11-15 02:18:49+00:00,2,0,...,0,0,0,1,0,76871.0,36123.0,3957.0,887.0,4406.0
1,,,,,,-0.054643,-0.080251,2018-11-15 02:19:08+00:00,2,0,...,0,0,1,0,0,73770.0,15642.0,4841.0,3829.0,4406.0
2,,,,,,-0.050426,-0.076352,2018-11-15 02:44:21+00:00,2,0,...,0,0,0,1,0,26203.0,53941.0,4229.0,222.0,4683.0
3,,,,,,0.107717,0.096648,2018-11-15 03:32:55+00:00,2,0,...,1,0,0,1,0,380.0,102363.0,648.0,2185.0,988.0
4,,,,,,-0.059704,-0.075183,2018-11-15 03:34:21+00:00,2,0,...,0,0,1,0,0,28195.0,38780.0,4841.0,3829.0,988.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,,,,,,-0.062591,-0.072291,2019-02-13 09:54:09+00:00,2,0,...,0,0,1,0,0,89881.0,96668.0,4841.0,3829.0,3078.0
95658,,,,,,-0.046371,-0.072291,2019-02-13 09:54:25+00:00,2,0,...,0,0,0,1,0,91597.0,3503.0,3439.0,2643.0,3874.0
95659,,,,,,-0.054643,-0.080251,2019-02-13 09:54:35+00:00,2,0,...,0,0,1,0,0,82501.0,118602.0,4841.0,3829.0,3874.0
95660,,,,,,-0.030151,-0.056047,2019-02-13 10:01:10+00:00,2,0,...,0,0,0,1,0,136354.0,70924.0,1346.0,652.0,1709.0


In [21]:
df_standardized.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 67 columns):
 #   Column                              Non-Null Count  Dtype              
---  ------                              --------------  -----              
 0   TransactionId                       0 non-null      float64            
 1   BatchId                             0 non-null      float64            
 2   AccountId                           0 non-null      float64            
 3   SubscriptionId                      0 non-null      float64            
 4   CustomerId                          0 non-null      float64            
 5   Amount                              95662 non-null  float64            
 6   Value                               95662 non-null  float64            
 7   TransactionStartTime                95662 non-null  datetime64[ns, UTC]
 8   PricingStrategy                     95662 non-null  int64              
 9   FraudResult                         956

# 4: Handle Missing Values

In [22]:
# Drop columns where all values are missing
df = df_standardized.dropna(axis=1, how='all')

# Display the cleaned DataFrame
print("\nDataFrame after dropping columns with all missing values:")
df


DataFrame after dropping columns with all missing values:


Unnamed: 0,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,TransactionHour,...,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,TransactionId_numeric,BatchId_numeric,AccountId_numeric,SubscriptionId_numeric,CustomerId_numeric
0,-0.046371,-0.072291,2018-11-15 02:18:49+00:00,2,0,0.170118,-0.067623,-0.311831,-0.167922,2,...,0,0,0,1,0,76871.0,36123.0,3957.0,887.0,4406.0
1,-0.054643,-0.080251,2018-11-15 02:19:08+00:00,2,0,0.170118,-0.067623,-0.311831,-0.167922,2,...,0,0,1,0,0,73770.0,15642.0,4841.0,3829.0,4406.0
2,-0.050426,-0.076352,2018-11-15 02:44:21+00:00,2,0,0.165122,-0.072568,-0.444993,-0.201992,2,...,0,0,0,1,0,26203.0,53941.0,4229.0,222.0,4683.0
3,0.107717,0.096648,2018-11-15 03:32:55+00:00,2,0,0.175567,-0.008155,-0.404020,-0.009717,3,...,1,0,0,1,0,380.0,102363.0,648.0,2185.0,988.0
4,-0.059704,-0.075183,2018-11-15 03:34:21+00:00,2,0,0.175567,-0.008155,-0.404020,-0.009717,3,...,0,0,1,0,0,28195.0,38780.0,4841.0,3829.0,988.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,-0.062591,-0.072291,2019-02-13 09:54:09+00:00,2,0,0.276904,-0.028743,0.204885,0.050586,9,...,0,0,1,0,0,89881.0,96668.0,4841.0,3829.0,3078.0
95658,-0.046371,-0.072291,2019-02-13 09:54:25+00:00,2,0,0.167759,-0.062526,-0.398330,-0.176518,9,...,0,0,0,1,0,91597.0,3503.0,3439.0,2643.0,3874.0
95659,-0.054643,-0.080251,2019-02-13 09:54:35+00:00,2,0,0.167759,-0.062526,-0.398330,-0.176518,9,...,0,0,1,0,0,82501.0,118602.0,4841.0,3829.0,3874.0
95660,-0.030151,-0.056047,2019-02-13 10:01:10+00:00,2,0,0.204153,-0.059427,0.149116,-0.166068,10,...,0,0,0,1,0,136354.0,70924.0,1346.0,652.0,1709.0


In [23]:
# Save the final dataset
df.to_csv('../data/processed/final-data.csv', index=False)