In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


In [3]:
# Load the dataset
data = pd.read_csv("C:/Users/LENOVO/Downloads/bank_transactions_data_2.csv")

# Display basic info
print(data.info())
print(data.describe())

# Count missing values
missing_values = data.isnull().sum()
print("Missing values:\n", missing_values)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2512 entries, 0 to 2511
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   TransactionID            2512 non-null   object 
 1   AccountID                2512 non-null   object 
 2   TransactionAmount        2512 non-null   float64
 3   TransactionDate          2512 non-null   object 
 4   TransactionType          2512 non-null   object 
 5   Location                 2512 non-null   object 
 6   DeviceID                 2512 non-null   object 
 7   IP Address               2512 non-null   object 
 8   MerchantID               2512 non-null   object 
 9   Channel                  2512 non-null   object 
 10  CustomerAge              2512 non-null   int64  
 11  CustomerOccupation       2512 non-null   object 
 12  TransactionDuration      2512 non-null   int64  
 13  LoginAttempts            2512 non-null   int64  
 14  AccountBalance          

There are no missing values. It was checked to see if there are missing values. But it was mentioned that there are no missing values. Eventhough there are no missing values , I'm going to set the median of the each column to the missing values if they are found.

In [4]:
# Replace missing values with the median of respective columns
data.fillna(data.median(numeric_only=True), inplace=True)

# Verify no missing values remain
print("Missing values after replacement:\n", data.isnull().sum())


Missing values after replacement:
 TransactionID              0
AccountID                  0
TransactionAmount          0
TransactionDate            0
TransactionType            0
Location                   0
DeviceID                   0
IP Address                 0
MerchantID                 0
Channel                    0
CustomerAge                0
CustomerOccupation         0
TransactionDuration        0
LoginAttempts              0
AccountBalance             0
PreviousTransactionDate    0
dtype: int64


Clearly shown that there are no missing values now.

In [8]:
print(data.columns)




Index(['TransactionAmount', 'CustomerAge', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'TransactionID_TX000002',
       'TransactionID_TX000003', 'TransactionID_TX000004',
       'TransactionID_TX000005', 'TransactionID_TX000006',
       ...
       'PreviousTransactionDate_2024-11-04 08:12:14',
       'PreviousTransactionDate_2024-11-04 08:12:15',
       'PreviousTransactionDate_2024-11-04 08:12:16',
       'PreviousTransactionDate_2024-11-04 08:12:17',
       'PreviousTransactionDate_2024-11-04 08:12:18',
       'PreviousTransactionDate_2024-11-04 08:12:19',
       'PreviousTransactionDate_2024-11-04 08:12:20',
       'PreviousTransactionDate_2024-11-04 08:12:21',
       'PreviousTransactionDate_2024-11-04 08:12:22',
       'PreviousTransactionDate_2024-11-04 08:12:23'],
      dtype='object', length=7298)


In [13]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Define relevant columns for logistic regression
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance','CustomerAge','TimeSinceLastTransaction']
target = 'Fraud'

# Prepare the dataset
X = data[features]
y = data[target]


# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)




# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)
data['Fraud'] |= data['LogReg_Fraud']



# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)

KeyError: "['TimeSinceLastTransaction'] not in index"

In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
# Load the dataset
file_path = "C:/Users/LENOVO/Downloads/bank_transactions_data_2.csv"
data = pd.read_csv(file_path)

# Display initial information about the dataset
print(data.head())      # Preview first few rows
print(data.info())      # Structure and types of data
print(data.describe())  # Summary statistics for numerical columns

  TransactionID AccountID  TransactionAmount      TransactionDate  \
0      TX000001   AC00128              14.09  2023-04-11 16:29:14   
1      TX000002   AC00455             376.24  2023-06-27 16:44:19   
2      TX000003   AC00019             126.29  2023-07-10 18:16:08   
3      TX000004   AC00070             184.50  2023-05-05 16:32:11   
4      TX000005   AC00411              13.45  2023-10-16 17:51:24   

  TransactionType   Location DeviceID      IP Address MerchantID Channel  \
0           Debit  San Diego  D000380  162.198.218.92       M015     ATM   
1           Debit    Houston  D000051     13.149.61.4       M052     ATM   
2           Debit       Mesa  D000235  215.97.143.157       M009  Online   
3           Debit    Raleigh  D000187  200.13.225.150       M002  Online   
4          Credit    Atlanta  D000308    65.164.3.100       M091  Online   

   CustomerAge CustomerOccupation  TransactionDuration  LoginAttempts  \
0           70             Doctor                   81 

In [16]:

# Check for missing values
print("Missing values:\n", data.isnull().sum())

# Fill or drop missing values (example: fill missing numeric columns with median)
numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
data[numeric_columns] = data[numeric_columns].fillna(data[numeric_columns].median())

# Handle categorical data (example: fill missing with mode)
categorical_columns = data.select_dtypes(include=['object']).columns
for col in categorical_columns:
    data[col] = data[col].fillna(data[col].mode()[0])

# Convert date columns to datetime format (if applicable)
if 'date' in data.columns:
    data['date'] = pd.to_datetime(data['date'])

# Select only numeric columns
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# Drop columns with too many missing values (optional)
numeric_data = numeric_data.dropna(axis=1, thresh=len(numeric_data) * 0.5)  # Keep columns with >50% non-NaN values

# Fill remaining missing values with median (optional)
numeric_data = numeric_data.fillna(numeric_data.median())


Missing values:
 TransactionID              0
AccountID                  0
TransactionAmount          0
TransactionDate            0
TransactionType            0
Location                   0
DeviceID                   0
IP Address                 0
MerchantID                 0
Channel                    0
CustomerAge                0
CustomerOccupation         0
TransactionDuration        0
LoginAttempts              0
AccountBalance             0
PreviousTransactionDate    0
dtype: int64


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Define relevant columns for logistic regression
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance','CustomerAge','TimeSinceLastTransaction']
target = 'Fraud'

# Prepare the dataset
X = data[features]
y = data[target]


# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)




# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)
data['Fraud'] |= data['LogReg_Fraud']



# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)

KeyError: "['TimeSinceLastTransaction'] not in index"

In [18]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# Define relevant columns for logistic regression
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']
target = 'Fraud'

# Prepare the dataset
X = data[features]
y = data[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# If you want to set 'Fraud' column to True wherever the model predicts fraud
data['Fraud'] = data['Fraud'].fillna(0)  # Assuming Fraud column exists and has NaN values
data['Fraud'] |= data['LogReg_Fraud']

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points.head())


KeyError: "['TimeSinceLastTransaction'] not in index"

In [19]:
print(data.columns)


Index(['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate',
       'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID',
       'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate'],
      dtype='object')


In [20]:
# Check the columns in your dataset
print(data.columns)

# Update the feature list if necessary
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']

# Make sure these columns exist in your DataFrame, or update them accordingly
X = data[features]  # This should now work if the columns are present


Index(['TransactionID', 'AccountID', 'TransactionAmount', 'TransactionDate',
       'TransactionType', 'Location', 'DeviceID', 'IP Address', 'MerchantID',
       'Channel', 'CustomerAge', 'CustomerOccupation', 'TransactionDuration',
       'LoginAttempts', 'AccountBalance', 'PreviousTransactionDate'],
      dtype='object')


KeyError: "['TimeSinceLastTransaction'] not in index"

In [21]:
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']


In [22]:
# Convert date columns to datetime
data['TransactionDate'] = pd.to_datetime(data['TransactionDate'])
data['PreviousTransactionDate'] = pd.to_datetime(data['PreviousTransactionDate'])

# Calculate time difference in seconds (or any other time unit you prefer)
data['TimeSinceLastTransaction'] = (data['TransactionDate'] - data['PreviousTransactionDate']).dt.total_seconds()

# Now you can include 'TimeSinceLastTransaction' in your features list
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']


In [23]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare the dataset
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']  # Update this as needed
target = 'TransactionType'

X = data[features]
y = data[target].map({'Non-Fraud': 0, 'Fraud': 1})  # Encode target variable

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)
data['Fraud'] |= data['LogReg_Fraud']

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


ValueError: Input y contains NaN.

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Drop irrelevant columns
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=columns_to_drop, axis=1)

# Encode the target variable
data['TransactionType'] = data['TransactionType'].map({'Non-Fraud': 0, 'Fraud': 1})

# One-hot encode categorical columns
categorical_columns = ['Location', 'MerchantID', 'Channel', 'CustomerOccupation']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values in features
data = data.fillna(data.mean())  # Fill missing numeric features with the mean

# Ensure 'Fraud' is numeric
y = data['TransactionType'].astype(int)

# Define features for logistic regression (use only numeric columns after encoding)
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']
features.extend([col for col in data.columns if col not in ['TransactionType']])  # Add one-hot encoded columns

# Prepare the dataset for modeling
X = data[features]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = 'log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


TypeError: Could not convert ['AC00128AC00455AC00019AC00070AC00411AC00393AC00199AC00069AC00135AC00385AC00150AC00459AC00392AC00264AC00085AC00270AC00317AC00359AC00242AC00285AC00002AC00014AC00095AC00453AC00241AC00041AC00441AC00057AC00390AC00313AC00367AC00291AC00060AC00359AC00365AC00267AC00404AC00202AC00478AC00405AC00421AC00480AC00120AC00117AC00011AC00401AC00282AC00439AC00296AC00471AC00159AC00482AC00115AC00055AC00419AC00374AC00271AC00456AC00143AC00427AC00438AC00002AC00425AC00325AC00434AC00242AC00319AC00460AC00461AC00395AC00217AC00053AC00282AC00110AC00265AC00239AC00007AC00022AC00464AC00014AC00406AC00445AC00236AC00220AC00140AC00098AC00303AC00373AC00438AC00437AC00106AC00310AC00149AC00373AC00425AC00010AC00348AC00011AC00453AC00178AC00265AC00018AC00075AC00464AC00170AC00235AC00013AC00397AC00423AC00154AC00068AC00318AC00396AC00121AC00336AC00468AC00282AC00498AC00085AC00092AC00295AC00358AC00311AC00066AC00340AC00284AC00208AC00176AC00053AC00320AC00382AC00384AC00203AC00498AC00349AC00177AC00284AC00272AC00405AC00385AC00403AC00114AC00163AC00108AC00397AC00434AC00385AC00161AC00333AC00279AC00072AC00242AC00303AC00442AC00087AC00174AC00421AC00111AC00275AC00492AC00380AC00227AC00257AC00062AC00383AC00182AC00478AC00332AC00202AC00331AC00474AC00268AC00247AC00298AC00106AC00035AC00363AC00273AC00156AC00010AC00036AC00149AC00225AC00077AC00440AC00321AC00099AC00258AC00132AC00113AC00396AC00099AC00265AC00047AC00005AC00363AC00426AC00107AC00237AC00358AC00113AC00010AC00499AC00380AC00452AC00020AC00181AC00370AC00453AC00339AC00249AC00479AC00301AC00380AC00130AC00006AC00096AC00108AC00070AC00417AC00391AC00414AC00004AC00047AC00198AC00314AC00212AC00115AC00055AC00427AC00401AC00430AC00299AC00345AC00430AC00175AC00177AC00225AC00394AC00026AC00076AC00215AC00202AC00400AC00004AC00260AC00415AC00337AC00253AC00126AC00304AC00470AC00041AC00442AC00430AC00258AC00073AC00430AC00363AC00307AC00085AC00448AC00194AC00319AC00366AC00338AC00178AC00424AC00147AC00171AC00023AC00263AC00423AC00455AC00454AC00055AC00403AC00220AC00009AC00445AC00133AC00362AC00026AC00387AC00451AC00033AC00087AC00159AC00177AC00093AC00427AC00284AC00189AC00495AC00148AC00272AC00381AC00110AC00495AC00035AC00119AC00488AC00466AC00489AC00306AC00431AC00233AC00389AC00067AC00300AC00386AC00285AC00219AC00445AC00371AC00164AC00200AC00310AC00106AC00444AC00441AC00300AC00194AC00248AC00286AC00049AC00471AC00434AC00264AC00316AC00087AC00140AC00418AC00187AC00077AC00141AC00332AC00439AC00055AC00299AC00107AC00171AC00493AC00232AC00156AC00442AC00375AC00034AC00376AC00029AC00040AC00482AC00365AC00340AC00349AC00374AC00261AC00459AC00331AC00165AC00397AC00192AC00063AC00478AC00272AC00092AC00332AC00087AC00327AC00269AC00177AC00322AC00069AC00264AC00014AC00316AC00131AC00232AC00410AC00488AC00329AC00021AC00188AC00341AC00090AC00456AC00322AC00135AC00279AC00210AC00251AC00116AC00131AC00265AC00326AC00411AC00039AC00293AC00306AC00285AC00145AC00128AC00292AC00020AC00229AC00053AC00401AC00208AC00177AC00136AC00029AC00210AC00421AC00400AC00495AC00301AC00434AC00274AC00365AC00480AC00318AC00235AC00333AC00262AC00267AC00490AC00229AC00196AC00182AC00357AC00443AC00499AC00285AC00475AC00217AC00452AC00357AC00080AC00493AC00420AC00040AC00351AC00070AC00273AC00280AC00480AC00122AC00072AC00093AC00407AC00495AC00119AC00271AC00235AC00264AC00307AC00434AC00290AC00021AC00499AC00148AC00200AC00074AC00417AC00182AC00177AC00063AC00300AC00218AC00036AC00371AC00398AC00213AC00464AC00219AC00464AC00442AC00054AC00392AC00350AC00306AC00229AC00148AC00305AC00329AC00400AC00148AC00128AC00463AC00256AC00166AC00318AC00213AC00075AC00498AC00039AC00114AC00326AC00450AC00325AC00433AC00284AC00103AC00080AC00052AC00095AC00327AC00491AC00353AC00266AC00167AC00153AC00012AC00469AC00127AC00480AC00092AC00122AC00067AC00014AC00098AC00345AC00360AC00043AC00295AC00313AC00173AC00364AC00339AC00265AC00369AC00033AC00077AC00437AC00152AC00161AC00125AC00311AC00183AC00407AC00093AC00314AC00467AC00316AC00145AC00045AC00005AC00456AC00041AC00257AC00356AC00465AC00158AC00086AC00322AC00433AC00083AC00402AC00317AC00030AC00114AC00301AC00216AC00223AC00413AC00086AC00216AC00240AC00298AC00460AC00424AC00434AC00471AC00461AC00188AC00013AC00345AC00013AC00296AC00192AC00378AC00205AC00475AC00022AC00150AC00362AC00170AC00429AC00418AC00144AC00449AC00057AC00426AC00039AC00051AC00471AC00039AC00274AC00095AC00211AC00184AC00436AC00107AC00203AC00398AC00312AC00225AC00150AC00413AC00479AC00078AC00242AC00149AC00084AC00466AC00359AC00023AC00366AC00170AC00130AC00183AC00219AC00354AC00290AC00425AC00370AC00454AC00403AC00358AC00296AC00393AC00223AC00487AC00215AC00275AC00482AC00357AC00026AC00416AC00058AC00170AC00262AC00032AC00131AC00176AC00139AC00056AC00276AC00447AC00143AC00139AC00208AC00426AC00423AC00381AC00458AC00014AC00071AC00054AC00182AC00257AC00006AC00424AC00131AC00133AC00393AC00361AC00092AC00049AC00023AC00323AC00372AC00497AC00130AC00034AC00286AC00232AC00182AC00136AC00101AC00328AC00356AC00312AC00242AC00123AC00071AC00370AC00330AC00218AC00437AC00287AC00418AC00465AC00011AC00432AC00240AC00497AC00026AC00277AC00116AC00367AC00030AC00111AC00220AC00295AC00137AC00004AC00442AC00201AC00462AC00390AC00297AC00173AC00382AC00175AC00035AC00219AC00416AC00467AC00465AC00102AC00119AC00418AC00237AC00279AC00067AC00043AC00012AC00244AC00395AC00253AC00394AC00457AC00046AC00011AC00045AC00181AC00149AC00288AC00014AC00153AC00164AC00159AC00360AC00326AC00110AC00110AC00257AC00145AC00455AC00282AC00045AC00390AC00032AC00153AC00460AC00154AC00190AC00156AC00005AC00126AC00201AC00165AC00420AC00329AC00037AC00293AC00481AC00233AC00235AC00324AC00029AC00093AC00141AC00061AC00092AC00354AC00358AC00078AC00013AC00030AC00074AC00399AC00323AC00107AC00211AC00297AC00102AC00196AC00342AC00027AC00339AC00468AC00262AC00425AC00161AC00417AC00208AC00304AC00166AC00279AC00114AC00411AC00380AC00272AC00202AC00053AC00316AC00422AC00083AC00125AC00227AC00292AC00319AC00431AC00365AC00050AC00129AC00366AC00452AC00339AC00389AC00188AC00039AC00319AC00077AC00131AC00180AC00249AC00454AC00076AC00386AC00444AC00307AC00397AC00078AC00348AC00169AC00007AC00009AC00065AC00362AC00444AC00106AC00241AC00092AC00173AC00363AC00185AC00370AC00124AC00412AC00068AC00321AC00302AC00217AC00298AC00327AC00054AC00403AC00397AC00383AC00450AC00086AC00156AC00418AC00263AC00009AC00436AC00399AC00182AC00482AC00475AC00253AC00376AC00058AC00304AC00035AC00366AC00362AC00094AC00225AC00474AC00448AC00439AC00133AC00480AC00265AC00176AC00278AC00216AC00431AC00063AC00090AC00133AC00357AC00404AC00390AC00083AC00161AC00425AC00363AC00385AC00045AC00480AC00369AC00109AC00264AC00076AC00438AC00453AC00222AC00179AC00386AC00171AC00336AC00272AC00402AC00136AC00280AC00352AC00495AC00061AC00129AC00237AC00230AC00224AC00354AC00479AC00329AC00160AC00050AC00200AC00254AC00456AC00320AC00446AC00292AC00111AC00157AC00429AC00182AC00166AC00005AC00459AC00432AC00317AC00377AC00130AC00236AC00243AC00017AC00295AC00187AC00395AC00225AC00355AC00217AC00282AC00477AC00158AC00174AC00124AC00247AC00297AC00114AC00170AC00185AC00487AC00280AC00017AC00496AC00304AC00475AC00111AC00316AC00007AC00033AC00453AC00147AC00401AC00336AC00281AC00082AC00247AC00278AC00176AC00159AC00035AC00463AC00339AC00239AC00090AC00144AC00488AC00039AC00475AC00368AC00339AC00297AC00345AC00458AC00062AC00070AC00183AC00190AC00255AC00311AC00075AC00013AC00247AC00329AC00257AC00414AC00011AC00416AC00174AC00218AC00323AC00209AC00337AC00083AC00246AC00194AC00361AC00214AC00204AC00394AC00002AC00337AC00184AC00258AC00164AC00032AC00092AC00485AC00067AC00005AC00291AC00284AC00024AC00310AC00246AC00463AC00318AC00474AC00192AC00100AC00407AC00276AC00420AC00228AC00022AC00052AC00151AC00317AC00348AC00191AC00120AC00487AC00173AC00089AC00456AC00025AC00158AC00041AC00458AC00248AC00356AC00065AC00249AC00497AC00045AC00442AC00248AC00078AC00345AC00331AC00332AC00448AC00102AC00200AC00200AC00086AC00139AC00154AC00403AC00330AC00362AC00423AC00286AC00332AC00080AC00150AC00441AC00196AC00444AC00295AC00157AC00456AC00271AC00462AC00310AC00189AC00328AC00057AC00360AC00209AC00390AC00061AC00345AC00254AC00271AC00451AC00061AC00245AC00105AC00122AC00202AC00423AC00362AC00426AC00204AC00369AC00365AC00063AC00455AC00415AC00279AC00262AC00306AC00164AC00236AC00095AC00016AC00386AC00032AC00152AC00020AC00360AC00286AC00241AC00331AC00258AC00040AC00143AC00068AC00052AC00352AC00254AC00100AC00412AC00012AC00353AC00445AC00281AC00211AC00379AC00108AC00243AC00032AC00460AC00459AC00309AC00109AC00393AC00175AC00010AC00120AC00477AC00410AC00044AC00490AC00342AC00462AC00267AC00363AC00211AC00204AC00456AC00016AC00093AC00401AC00366AC00107AC00060AC00384AC00483AC00126AC00437AC00382AC00339AC00498AC00178AC00201AC00124AC00404AC00347AC00480AC00303AC00249AC00263AC00108AC00490AC00411AC00087AC00196AC00267AC00267AC00125AC00428AC00477AC00417AC00170AC00147AC00097AC00433AC00381AC00398AC00029AC00291AC00431AC00275AC00093AC00442AC00465AC00158AC00334AC00093AC00229AC00373AC00328AC00160AC00014AC00090AC00053AC00478AC00356AC00244AC00103AC00053AC00151AC00494AC00219AC00202AC00254AC00285AC00071AC00073AC00383AC00326AC00153AC00460AC00439AC00070AC00086AC00330AC00027AC00020AC00232AC00247AC00136AC00322AC00461AC00079AC00176AC00417AC00260AC00471AC00231AC00184AC00475AC00329AC00431AC00398AC00221AC00299AC00266AC00297AC00373AC00489AC00128AC00409AC00410AC00360AC00422AC00225AC00261AC00468AC00138AC00013AC00275AC00125AC00200AC00213AC00035AC00070AC00275AC00060AC00304AC00372AC00498AC00465AC00396AC00083AC00479AC00091AC00024AC00366AC00470AC00032AC00136AC00001AC00187AC00452AC00036AC00125AC00403AC00264AC00123AC00298AC00235AC00259AC00329AC00482AC00020AC00453AC00367AC00167AC00110AC00078AC00212AC00051AC00250AC00304AC00007AC00017AC00158AC00110AC00175AC00079AC00403AC00228AC00344AC00466AC00204AC00030AC00455AC00094AC00185AC00351AC00335AC00409AC00312AC00097AC00447AC00280AC00023AC00077AC00113AC00391AC00136AC00204AC00089AC00305AC00338AC00268AC00363AC00316AC00167AC00405AC00098AC00116AC00267AC00447AC00042AC00074AC00063AC00085AC00126AC00471AC00304AC00242AC00261AC00338AC00292AC00024AC00441AC00412AC00315AC00120AC00462AC00481AC00166AC00438AC00090AC00041AC00141AC00410AC00362AC00315AC00092AC00133AC00069AC00498AC00006AC00104AC00311AC00144AC00157AC00171AC00374AC00219AC00405AC00075AC00136AC00056AC00368AC00261AC00090AC00297AC00144AC00207AC00443AC00179AC00175AC00386AC00067AC00369AC00257AC00059AC00341AC00357AC00455AC00346AC00118AC00275AC00028AC00179AC00090AC00367AC00441AC00266AC00072AC00245AC00119AC00061AC00080AC00254AC00218AC00268AC00460AC00257AC00398AC00162AC00015AC00462AC00218AC00217AC00460AC00068AC00214AC00266AC00193AC00132AC00241AC00015AC00352AC00396AC00174AC00024AC00250AC00373AC00030AC00422AC00229AC00002AC00227AC00393AC00042AC00111AC00178AC00218AC00198AC00060AC00397AC00338AC00101AC00384AC00368AC00233AC00375AC00277AC00202AC00142AC00178AC00130AC00018AC00293AC00449AC00496AC00370AC00341AC00400AC00301AC00249AC00239AC00419AC00435AC00012AC00438AC00345AC00026AC00416AC00272AC00468AC00040AC00302AC00271AC00495AC00246AC00139AC00298AC00246AC00323AC00310AC00083AC00222AC00459AC00091AC00122AC00432AC00218AC00294AC00407AC00480AC00367AC00324AC00253AC00344AC00153AC00016AC00291AC00362AC00202AC00163AC00123AC00054AC00172AC00176AC00093AC00310AC00127AC00286AC00497AC00475AC00470AC00245AC00480AC00011AC00362AC00015AC00112AC00057AC00393AC00233AC00466AC00453AC00203AC00228AC00131AC00247AC00425AC00123AC00139AC00379AC00059AC00233AC00007AC00028AC00428AC00242AC00447AC00081AC00358AC00254AC00241AC00140AC00132AC00035AC00172AC00016AC00382AC00413AC00297AC00207AC00411AC00105AC00002AC00410AC00492AC00095AC00058AC00405AC00258AC00257AC00317AC00409AC00337AC00273AC00056AC00259AC00102AC00165AC00455AC00030AC00114AC00331AC00305AC00015AC00270AC00063AC00087AC00130AC00324AC00220AC00257AC00141AC00422AC00334AC00175AC00380AC00118AC00196AC00358AC00465AC00126AC00041AC00403AC00155AC00261AC00480AC00291AC00154AC00106AC00378AC00303AC00004AC00157AC00434AC00438AC00279AC00458AC00166AC00129AC00292AC00185AC00498AC00171AC00456AC00459AC00090AC00289AC00063AC00394AC00241AC00296AC00495AC00368AC00017AC00458AC00335AC00344AC00002AC00285AC00091AC00399AC00133AC00322AC00247AC00332AC00230AC00019AC00446AC00219AC00050AC00449AC00426AC00354AC00313AC00442AC00318AC00232AC00424AC00393AC00175AC00108AC00067AC00356AC00244AC00244AC00411AC00294AC00205AC00155AC00305AC00048AC00366AC00185AC00215AC00283AC00197AC00390AC00320AC00020AC00500AC00289AC00121AC00023AC00490AC00197AC00494AC00193AC00451AC00261AC00246AC00120AC00189AC00499AC00287AC00087AC00333AC00423AC00108AC00099AC00387AC00010AC00087AC00294AC00408AC00495AC00304AC00053AC00432AC00200AC00438AC00151AC00305AC00142AC00356AC00155AC00265AC00122AC00426AC00111AC00405AC00128AC00050AC00141AC00010AC00259AC00302AC00124AC00464AC00473AC00486AC00348AC00398AC00267AC00076AC00022AC00181AC00071AC00369AC00358AC00021AC00156AC00212AC00327AC00349AC00183AC00478AC00398AC00380AC00030AC00341AC00162AC00319AC00338AC00374AC00451AC00005AC00263AC00432AC00274AC00103AC00146AC00359AC00164AC00079AC00209AC00018AC00401AC00179AC00274AC00354AC00025AC00493AC00339AC00454AC00152AC00314AC00054AC00036AC00397AC00355AC00188AC00222AC00094AC00004AC00021AC00102AC00367AC00137AC00289AC00458AC00493AC00059AC00281AC00020AC00396AC00402AC00385AC00167AC00349AC00153AC00197AC00219AC00255AC00035AC00003AC00136AC00337AC00265AC00356AC00305AC00300AC00301AC00154AC00201AC00004AC00237AC00304AC00086AC00483AC00288AC00134AC00130AC00393AC00029AC00228AC00131AC00233AC00017AC00071AC00128AC00073AC00353AC00209AC00396AC00439AC00063AC00203AC00394AC00258AC00144AC00169AC00285AC00047AC00460AC00073AC00253AC00320AC00462AC00071AC00363AC00448AC00206AC00181AC00213AC00470AC00451AC00192AC00378AC00143AC00499AC00168AC00085AC00144AC00185AC00338AC00384AC00336AC00003AC00378AC00217AC00280AC00184AC00456AC00151AC00366AC00383AC00076AC00290AC00347AC00034AC00373AC00382AC00463AC00030AC00431AC00243AC00499AC00065AC00492AC00062AC00026AC00201AC00377AC00269AC00239AC00070AC00284AC00337AC00051AC00328AC00064AC00231AC00362AC00363AC00196AC00360AC00499AC00213AC00165AC00299AC00119AC00409AC00100AC00244AC00382AC00389AC00350AC00275AC00231AC00449AC00248AC00301AC00023AC00272AC00377AC00276AC00427AC00220AC00370AC00187AC00293AC00203AC00016AC00189AC00005AC00190AC00480AC00407AC00195AC00260AC00006AC00294AC00327AC00013AC00386AC00256AC00439AC00088AC00303AC00245AC00188AC00225AC00278AC00311AC00206AC00329AC00338AC00409AC00114AC00460AC00373AC00021AC00235AC00261AC00456AC00332AC00245AC00388AC00008AC00066AC00227AC00352AC00481AC00341AC00065AC00248AC00177AC00139AC00252AC00286AC00001AC00453AC00291AC00113AC00390AC00345AC00011AC00106AC00087AC00441AC00069AC00020AC00172AC00237AC00186AC00165AC00438AC00435AC00472AC00446AC00427AC00230AC00172AC00103AC00467AC00094AC00231AC00032AC00241AC00263AC00012AC00009AC00041AC00317AC00475AC00181AC00120AC00331AC00500AC00292AC00436AC00476AC00391AC00267AC00034AC00425AC00048AC00110AC00427AC00483AC00310AC00179AC00363AC00080AC00158AC00033AC00492AC00228AC00058AC00427AC00236AC00144AC00256AC00054AC00492AC00498AC00005AC00204AC00428AC00019AC00029AC00449AC00213AC00076AC00442AC00228AC00081AC00072AC00261AC00082AC00334AC00251AC00481AC00239AC00394AC00374AC00054AC00281AC00140AC00058AC00066AC00190AC00452AC00099AC00126AC00021AC00353AC00482AC00427AC00430AC00043AC00186AC00248AC00069AC00002AC00090AC00210AC00014AC00039AC00021AC00242AC00413AC00082AC00304AC00098AC00372AC00274AC00080AC00064AC00241AC00055AC00215AC00037AC00325AC00394AC00164AC00096AC00402AC00077AC00116AC00145AC00457AC00021AC00110AC00284AC00012AC00065AC00439AC00157AC00013AC00355AC00155AC00105AC00216AC00297AC00139AC00205AC00148AC00064AC00344AC00448AC00446AC00460AC00020AC00202AC00458AC00139AC00210AC00399AC00267AC00414AC00272AC00220AC00276AC00084AC00189AC00209AC00324AC00159AC00497AC00257AC00353AC00115AC00478AC00497AC00131AC00059AC00101AC00418AC00384AC00496AC00145AC00131AC00224AC00219AC00385AC00133AC00056AC00358AC00419AC00464AC00430AC00071AC00404AC00349AC00235AC00228AC00024AC00330AC00290AC00042AC00203AC00248AC00140AC00095AC00225AC00445AC00385AC00143AC00448AC00273AC00291AC00337AC00332AC00040AC00089AC00231AC00277AC00465AC00205AC00036AC00227AC00251AC00298AC00315AC00292AC00252AC00400AC00384AC00329AC00033AC00003AC00429AC00488AC00325AC00359AC00478AC00236AC00085AC00349AC00314AC00056AC00399AC00324AC00095AC00086AC00279AC00431AC00369AC00089AC00035AC00416AC00056AC00197AC00396AC00295AC00268AC00357AC00490AC00367AC00093AC00177AC00118AC00183AC00137AC00225AC00191AC00373AC00167AC00149AC00453AC00253AC00149AC00359AC00050AC00054AC00388AC00322AC00140AC00138AC00408AC00219AC00339AC00446AC00038AC00387AC00126AC00155AC00494AC00202AC00489AC00378AC00255AC00331AC00243AC00066AC00166AC00288AC00005AC00191AC00443AC00096AC00225AC00042AC00460AC00378AC00322AC00328AC00337AC00003AC00146AC00004AC00386AC00187AC00415AC00315AC00297AC00304AC00016AC00048AC00485AC00064AC00081AC00228AC00204AC00144AC00420AC00448AC00363AC00396AC00456AC00363AC00326AC00500AC00076AC00063AC00376AC00392AC00282AC00238AC00130AC00194AC00132AC00071AC00457AC00153AC00003AC00327AC00362AC00195AC00010AC00032AC00202AC00262AC00158AC00465AC00260AC00130AC00500AC00384AC00027AC00176AC00150AC00443AC00098AC00452AC00119AC00342AC00190AC00075AC00450AC00300AC00183AC00079AC00004AC00063AC00345AC00177AC00165AC00097AC00241AC00308AC00308AC00088AC00370AC00073AC00452AC00111AC00150AC00103AC00176AC00194AC00225AC00267AC00129AC00362AC00155AC00019AC00028AC00459AC00069AC00046AC00067AC00257AC00215AC00152AC00306AC00340AC00116AC00306AC00032AC00374AC00303AC00464AC00245AC00373AC00386AC00078AC00424AC00033AC00046AC00298AC00432AC00439AC00407AC00004AC00079AC00356AC00269AC00439AC00014AC00140AC00460AC00494AC00325AC00150AC00242AC00222AC00170AC00043AC00487AC00315AC00312AC00483AC00430AC00320AC00464AC00436AC00279AC00436AC00251AC00412AC00284AC00054AC00128AC00236AC00275AC00122AC00302AC00058AC00058AC00042AC00276AC00275AC00070AC00317AC00347AC00179AC00172AC00051AC00202AC00125AC00311AC00116AC00265AC00493AC00057AC00279AC00322AC00182AC00432AC00030AC00407AC00089AC00375AC00216AC00069AC00427AC00047AC00377AC00258AC00297AC00322AC00095AC00118AC00009'] to numeric

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Drop irrelevant columns
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=columns_to_drop, axis=1)

# Encode the target variable
# Ensure that 'TransactionType' is properly mapped and handle missing values before conversion
data['TransactionType'] = data['TransactionType'].map({'Non-Fraud': 0, 'Fraud': 1})

# Check if any missing values in 'TransactionType'
if data['TransactionType'].isnull().any():
    print("Missing values found in 'TransactionType', filling with 0 (Non-Fraud).")
    data['TransactionType'].fillna(0, inplace=True)  # Fill missing values with 0 for non-fraud

# One-hot encode categorical columns
categorical_columns = ['Location', 'MerchantID', 'Channel', 'CustomerOccupation']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values in features
data = data.fillna(data.mean())  # Fill missing numeric features with the mean

# Ensure 'Fraud' column is numeric and free of issues
y = data['TransactionType'].astype(int)

# Define features for logistic regression (use only numeric columns after encoding)
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']
features.extend([col for col in data.columns if col not in ['TransactionType']])  # Add one-hot encoded columns

# Prepare the dataset for modeling
X = data[features]

# Check if X contains any non-numeric data
if X.select_dtypes(include='object').shape[1] > 0:
    print("Non-numeric columns detected in features. Converting to numeric.")
    X = X.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN

# Handle any missing values that may appear after conversion
X.fillna(X.mean(), inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = 'log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


KeyError: "['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID'] not found in axis"

In [27]:
data.columns = data.columns.str.strip()  # Remove any leading/trailing spaces


In [28]:
# Drop irrelevant columns if they exist
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], axis=1)


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Drop irrelevant columns
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=columns_to_drop, axis=1)

# Encode the target variable
# Ensure that 'TransactionType' is properly mapped and handle missing values before conversion
data['TransactionType'] = data['TransactionType'].map({'Non-Fraud': 0, 'Fraud': 1})

# Check if any missing values in 'TransactionType'
if data['TransactionType'].isnull().any():
    print("Missing values found in 'TransactionType', filling with 0 (Non-Fraud).")
    data['TransactionType'].fillna(0, inplace=True)  # Fill missing values with 0 for non-fraud

# One-hot encode categorical columns
categorical_columns = ['Location', 'MerchantID', 'Channel', 'CustomerOccupation']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values in features
data = data.fillna(data.mean())  # Fill missing numeric features with the mean

# Ensure 'Fraud' column is numeric and free of issues
y = data['TransactionType'].astype(int)

# Define features for logistic regression (use only numeric columns after encoding)
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']
features.extend([col for col in data.columns if col not in ['TransactionType']])  # Add one-hot encoded columns

# Prepare the dataset for modeling
X = data[features]

# Check if X contains any non-numeric data
if X.select_dtypes(include='object').shape[1] > 0:
    print("Non-numeric columns detected in features. Converting to numeric.")
    X = X.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN

# Handle any missing values that may appear after conversion
X.fillna(X.mean(), inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = 'log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


KeyError: "['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID'] not found in axis"

In [30]:
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
columns_to_drop = [col for col in columns_to_drop if col in data.columns]  # Drop only existing columns
data = data.drop(columns=columns_to_drop, axis=1)


In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Drop irrelevant columns
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=columns_to_drop, axis=1)

# Encode the target variable
# Ensure that 'TransactionType' is properly mapped and handle missing values before conversion
data['TransactionType'] = data['TransactionType'].map({'Non-Fraud': 0, 'Fraud': 1})

# Check if any missing values in 'TransactionType'
if data['TransactionType'].isnull().any():
    print("Missing values found in 'TransactionType', filling with 0 (Non-Fraud).")
    data['TransactionType'].fillna(0, inplace=True)  # Fill missing values with 0 for non-fraud

# One-hot encode categorical columns
categorical_columns = ['Location', 'MerchantID', 'Channel', 'CustomerOccupation']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values in features
data = data.fillna(data.mean())  # Fill missing numeric features with the mean

# Ensure 'Fraud' column is numeric and free of issues
y = data['TransactionType'].astype(int)

# Define features for logistic regression (use only numeric columns after encoding)
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']
features.extend([col for col in data.columns if col not in ['TransactionType']])  # Add one-hot encoded columns

# Prepare the dataset for modeling
X = data[features]

# Check if X contains any non-numeric data
if X.select_dtypes(include='object').shape[1] > 0:
    print("Non-numeric columns detected in features. Converting to numeric.")
    X = X.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN

# Handle any missing values that may appear after conversion
X.fillna(X.mean(), inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = 'log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


KeyError: "['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID'] not found in axis"

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare the dataset
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']  # Update this as needed
target = 'TransactionType'

X = data[features]
y = data[target].map({'Non-Fraud': 0, 'Fraud': 1})  # Encode target variable

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)
data['Fraud'] |= data['LogReg_Fraud']

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


ValueError: Input y contains NaN.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare the dataset
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge', 'TimeSinceLastTransaction']  # Update this as needed
target = 'TransactionType'

X = data[features]
y = data[target].map({'Non-Fraud': 0, 'Fraud': 1})  # Encode target variable

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)
data['Fraud'] |= data['LogReg_Fraud']

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = '/kaggle/working/log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


ValueError: Input y contains NaN.

In [32]:
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
missing_cols = [col for col in columns_to_drop if col not in data.columns]
if missing_cols:
    print(f"Columns not found in DataFrame: {missing_cols}")
data = data.drop(columns=[col for col in columns_to_drop if col in data.columns], axis=1)


Columns not found in DataFrame: ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']


In [33]:
for col in data.columns:
    print(f"'{col}': {len(col)}")


'AccountID': 9
'TransactionAmount': 17
'TransactionType': 15
'CustomerAge': 11
'TransactionDuration': 19
'LoginAttempts': 13
'AccountBalance': 14
'TimeSinceLastTransaction': 24
'Location_Atlanta': 16
'Location_Austin': 15
'Location_Baltimore': 18
'Location_Boston': 15
'Location_Charlotte': 18
'Location_Chicago': 16
'Location_Colorado Springs': 25
'Location_Columbus': 17
'Location_Dallas': 15
'Location_Denver': 15
'Location_Detroit': 16
'Location_El Paso': 16
'Location_Fort Worth': 19
'Location_Fresno': 15
'Location_Houston': 16
'Location_Indianapolis': 21
'Location_Jacksonville': 21
'Location_Kansas City': 20
'Location_Las Vegas': 18
'Location_Los Angeles': 20
'Location_Louisville': 19
'Location_Memphis': 16
'Location_Mesa': 13
'Location_Miami': 14
'Location_Milwaukee': 18
'Location_Nashville': 18
'Location_New York': 17
'Location_Oklahoma City': 22
'Location_Omaha': 14
'Location_Philadelphia': 21
'Location_Phoenix': 16
'Location_Portland': 17
'Location_Raleigh': 16
'Location_Sacrament

In [34]:
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
# Check which columns exist in the DataFrame
columns_to_drop_existing = [col for col in columns_to_drop if col in data.columns]
# Drop only those that exist
data = data.drop(columns=columns_to_drop_existing, axis=1)
print(f"Dropped columns: {columns_to_drop_existing}")


Dropped columns: []


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Drop irrelevant columns
columns_to_drop = ['TransactionID', 'PreviousTransactionDate', 'TransactionDate', 'IP Address', 'DeviceID']
data = data.drop(columns=columns_to_drop, axis=1)

# Encode the target variable
# Ensure that 'TransactionType' is properly mapped and handle missing values before conversion
data['TransactionType'] = data['TransactionType'].map({'Non-Fraud': 0, 'Fraud': 1})

# Check if any missing values in 'TransactionType'
if data['TransactionType'].isnull().any():
    print("Missing values found in 'TransactionType', filling with 0 (Non-Fraud).")
    data['TransactionType'].fillna(0, inplace=True)  # Fill missing values with 0 for non-fraud

# One-hot encode categorical columns
categorical_columns = ['Location', 'MerchantID', 'Channel', 'CustomerOccupation']
data = pd.get_dummies(data, columns=categorical_columns, drop_first=True)

# Fill missing values in features
data = data.fillna(data.mean())  # Fill missing numeric features with the mean

# Ensure 'Fraud' column is numeric and free of issues
y = data['TransactionType'].astype(int)

# Define features for logistic regression (use only numeric columns after encoding)
features = ['TransactionAmount', 'TransactionDuration', 'LoginAttempts', 'AccountBalance', 'CustomerAge']
features.extend([col for col in data.columns if col not in ['TransactionType']])  # Add one-hot encoded columns

# Prepare the dataset for modeling
X = data[features]

# Check if X contains any non-numeric data
if X.select_dtypes(include='object').shape[1] > 0:
    print("Non-numeric columns detected in features. Converting to numeric.")
    X = X.apply(pd.to_numeric, errors='coerce')  # Convert non-numeric values to NaN

# Handle any missing values that may appear after conversion
X.fillna(X.mean(), inplace=True)

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

# Train Logistic Regression model
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X_train, y_train)

# Predict fraud on the test set
y_pred = log_reg.predict(X_test)

# Evaluate model performance
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Non-Fraud', 'Fraud'], yticklabels=['Non-Fraud', 'Fraud'])
plt.title('Confusion Matrix', fontsize=16)
plt.xlabel('Predicted', fontsize=14)
plt.ylabel('Actual', fontsize=14)
plt.show()

# Add predictions to the dataset
data['LogReg_Fraud'] = log_reg.predict(X_scaled)

# Visualize fraud vs. non-fraud transactions
plt.figure(figsize=(12, 8))
sns.scatterplot(
    x=data['TransactionAmount'],
    y=data['AccountBalance'],
    hue=data['LogReg_Fraud'],
    palette={1: 'red', 0: 'blue'},
    alpha=0.7
)
plt.title('Logistic Regression Fraud Detection', fontsize=16)
plt.xlabel('Transaction Amount', fontsize=14)
plt.ylabel('Account Balance', fontsize=14)
plt.legend(title='Fraud', labels=['Non-Fraud', 'Fraud'], fontsize=12)
plt.grid(True)
plt.show()

# Save fraudulent transactions detected by Logistic Regression
log_reg_fraud_output_path = 'log_reg_fraud_transactions.csv'
log_reg_fraud_points = data[data['LogReg_Fraud'] == 1]
log_reg_fraud_points.to_csv(log_reg_fraud_output_path, index=False)

# Summary
total_log_reg_fraud_points = log_reg_fraud_points.shape[0]
print(f"Total Fraudulent Transactions Detected by Logistic Regression: {total_log_reg_fraud_points}")
print(f"Fraudulent transactions saved to: {log_reg_fraud_output_path}")
print(log_reg_fraud_points)


KeyError: "None of [Index(['Location', 'MerchantID', 'Channel', 'CustomerOccupation'], dtype='object')] are in the [columns]"

In [38]:
import pandas as pd
import numpy as np
from random import choice

# Generate a dataset with 800 rows
np.random.seed(42)

# Data for professions
professions = ['Doctor', 'Lawyer', 'Engineer', 'Teacher', 'Artist', 'Nurse']

# Generate random data for each column
data = {
    'Profession': [choice(professions) for _ in range(800)],
    'Income': np.random.randint(30000, 150000, 800),
    'Credit_card_number': [''.join([str(np.random.randint(0, 10)) for _ in range(16)]) for _ in range(800)],
    'Expiry': [f"{np.random.randint(1, 13):02}/{np.random.randint(21, 30)}" for _ in range(800)],
    'Security_Code': [np.random.randint(100, 999) for _ in range(800)],
    'Fraud': np.random.choice([0, 1], 800, p=[0.85, 0.15])  # Imbalanced classes, 15% fraud
}

# Create a DataFrame
df = pd.DataFrame(data)

# Introduce some missing values randomly
for column in ['Income', 'Credit_card_number', 'Expiry', 'Security_Code']:
    missing_indices = np.random.choice(df.index, size=int(0.1 * len(df)), replace=False)
    df.loc[missing_indices, column] = np.nan

# Show the first few rows of the dataset
df.head()


Unnamed: 0,Profession,Income,Credit_card_number,Expiry,Security_Code,Fraud
0,Nurse,45795.0,9144527053068336.0,01/25,410.0,0
1,Nurse,30860.0,,02/29,373.0,0
2,Artist,133694.0,8087054594544322.0,07/28,134.0,0
3,Lawyer,,3818004552689757.0,06/29,316.0,0
4,Lawyer,140268.0,4793979148350804.0,06/27,820.0,0


In [40]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

# Load the dataset
data = pd.read_csv('fraud-detection-dataset.csv')

# Check missing values
print("Missing values before handling:")
print(data.isnull().sum())

# Handle missing values
# Fill missing categorical column 'Profession' with the mode
data['Profession'] = data['Profession'].fillna(data['Profession'].mode()[0])

# Fill missing numerical columns with the mean
data['Income'] = data['Income'].fillna(data['Income'].mean())
data['Security_Code'] = data['Security_Code'].fillna(data['Security_Code'].mean())

# Fill missing target column 'Fraud' with the mode
data['Fraud'] = data['Fraud'].fillna(data['Fraud'].mode()[0])

# Fill missing 'Expiry' with the mode
data['Expiry'] = data['Expiry'].fillna(data['Expiry'].mode()[0])

# Feature Engineering: Create 'Months_to_expiry' column
current_date = datetime.now()

def calculate_months_to_expiry(expiry):
    exp_month, exp_year = map(int, expiry.split("/"))
    expiry_date = datetime(year=2000 + exp_year, month=exp_month, day=1)
    return max(0, (expiry_date.year - current_date.year) * 12 + (expiry_date.month - current_date.month))

data['Months_to_expiry'] = data['Expiry'].apply(calculate_months_to_expiry)

# One-hot encode the 'Profession' column
data = pd.get_dummies(data, columns=['Profession'], drop_first=True)

# Drop unnecessary columns
data = data.drop(columns=['Credit_card_number', 'Expiry'])

# Recheck for any remaining missing values
print("Missing values after handling:")
print(data.isnull().sum())

# Split the data into features (X) and target (y)
X = data.drop(columns=['Fraud'])
y = data['Fraud']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Train a Random Forest model using Gini index
model = RandomForestClassifier(criterion='gini', random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Feature importance visualization
feature_importances = pd.Series(model.feature_importances_, index=X.columns)
feature_importances.sort_values(ascending=False).plot(kind='bar', title='Feature Importance', figsize=(10, 6))
plt.show()


ImportError: cannot import name '_check_n_features' from 'sklearn.utils.validation' (c:\Users\LENOVO\anaconda3\Lib\site-packages\sklearn\utils\validation.py)