In [None]:
# Importing the necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import joblib

In [None]:
# Load the credit-card transaction data set into a DataFrame and display it.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where the CSV exists at exactly this location.
csv_path = r'E:\CSV\CreditCardData.csv'
data = pd.read_csv(csv_path)
data

In [None]:
# Convert 'Amount' from text (e.g. with a leading '£') to float and 'Age' to
# int, then print the resulting column dtypes.
#
# The guard makes the cell safe to re-run: once 'Amount' is numeric the .str
# accessor is no longer available and the conversion must be skipped.
if not pd.api.types.is_numeric_dtype(data['Amount']):
    # Keep only digits and the decimal point; this also removes the '£' sign.
    # Raw string on purpose: '\d' inside a normal string literal is an invalid
    # escape sequence and triggers a warning on recent Python versions.
    data['Amount'] = (
        data['Amount']
        .str.replace(r'[^\d.]', '', regex=True)
        .astype(float)
    )

# NOTE(review): astype(int) raises if 'Age' contains missing values; the null
# check only happens in a later cell — confirm 'Age' is complete.
data['Age'] = data['Age'].astype(int)

data_types = data.dtypes
print(data_types)

In [None]:
# Count the missing entries in each column and show the result.
missing_per_column = data.isna().sum()
missing_per_column

In [None]:
# Check the data type of every column.
# NOTE(review): this repeats the dtype printout at the end of the cleaning
# cell above; the output is the same unless `data` changed in between.
data_types = data.dtypes
print(data_types)

In [None]:
# Print pandas' concise summary of the DataFrame (info() writes to stdout).
data.info()

In [None]:
# Print descriptive statistics as produced by DataFrame.describe().
# Computed once; the previous duplicate assignment did the same work twice.
statistics = data.describe()
print(statistics)

In [None]:
# List the distinct categories of the 'Bank' column and how often each occurs.
categorical_variable = 'Bank'
selected_column = data[categorical_variable]

unique_values = selected_column.unique()
value_counts = selected_column.value_counts()

# One print call, one item per line; output matches four separate prints.
print(
    f"Unique values of '{categorical_variable}':",
    unique_values,
    "\nFrequencies:",
    value_counts,
    sep="\n",
)

In [None]:
# Descriptive statistics for the largest and the smallest transactions.
# Change the two thresholds below to explore other cut-offs; the printed
# headings are built from these values, so they can no longer drift apart.
high_threshold_amount = 300  # rows with Amount strictly above this value
low_threshold_amount = 10    # rows with Amount strictly below this value

# Transactions above the high threshold.
high_amount_subset = data[data['Amount'] > high_threshold_amount]
print(f"Descriptives of Subsets Above {high_threshold_amount} in Amount Variable:")
print(high_amount_subset.describe())

# Transactions below the low threshold (kept in its own variable instead of
# overwriting the high-amount subset).
low_amount_subset = data[data['Amount'] < low_threshold_amount]
print(f"Descriptives of Subsets below {low_threshold_amount} in Amount Variable:")
print(low_amount_subset.describe())

In [None]:
# Summarise transaction amounts separately for each merchant group.
grouped_data = data.groupby('Merchant Group')
summary_stats = grouped_data['Amount'].describe()

# Heading and table on consecutive lines.
print("Summarized Grouped Information by Merchant Group:", summary_stats, sep="\n")

In [None]:
# Compare mean transaction amounts between the United Kingdom and India with a
# two-sample t-test.
from scipy.stats import ttest_ind

uk_subset = data[data['Country of Transaction'] == 'United Kingdom']
india_subset = data[data['Country of Transaction'] == 'India']

# Remove missing amounts before testing: ttest_ind propagates NaN by default,
# which would yield a NaN p-value and silently fall into the
# "no significant difference" branch below.
uk_amounts = uk_subset['Amount'].dropna()
india_amounts = india_subset['Amount'].dropna()

# equal_var=False: do not assume equal population variances (Welch's t-test).
t_statistic, p_value = ttest_ind(uk_amounts, india_amounts, equal_var=False)

print(f"Two-Sample T-test Statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Significance level for the decision.
alpha = 0.05

if p_value < alpha:
    print("There is a significant difference in mean transaction amounts between UK and India.")
else:
    print("There is no significant difference in mean transaction amounts between UK and India.")

In [None]:
# Test whether mean transaction amounts differ between fraudulent (Fraud == 1)
# and genuine (Fraud == 0) transactions.
fraud_subset = data[data['Fraud'] == 1]
no_fraud_subset = data[data['Fraud'] == 0]

# Drop missing amounts first; otherwise a single NaN makes the statistic and
# p-value NaN and the comparison below always reports "no significant difference".
fraud_amounts = fraud_subset['Amount'].dropna()
no_fraud_amounts = no_fraud_subset['Amount'].dropna()

# Use the `stats` module imported at the top of the notebook so this cell does
# not depend on an import made inside another cell.
# equal_var=False: do not assume equal population variances (Welch's t-test).
t_statistic, p_value = stats.ttest_ind(fraud_amounts, no_fraud_amounts, equal_var=False)

print(f"Two-Sample T-test Statistic: {t_statistic}")
print(f"P-value: {p_value}")

# Significance level for the decision.
alpha = 0.05

if p_value < alpha:
    print("There is a significant difference in mean transaction amounts between 'Fraud' and 'No Fraud' transactions.")
else:
    print("There is no significant difference in mean transaction amounts between 'Fraud' and 'No Fraud' transactions.")

In [None]:
# Bar chart: number of transactions per entry mode, split by fraud flag.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=data, x='Entry Mode', hue='Fraud', ax=ax)

ax.set_title('Count of Fraud Transactions by Entry Mode')
ax.set_xlabel('Entry Mode')
ax.set_ylabel('Count')

# Replace the raw hue values in the legend with readable labels.
ax.legend(title='Fraud', labels=['No', 'Yes'])
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Box plot: distribution of transaction amounts for each fraud status.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='Fraud', y='Amount', data=data, ax=ax)

ax.set_title('Transaction Amount Distribution by Fraud Status')
ax.set_xlabel('Fraud')
ax.set_ylabel('Amount')

# Label the two box positions with readable names.
ax.set_xticks([0, 1])
ax.set_xticklabels(['No Fraud', 'Fraud'])
plt.show()

In [None]:
# Stacked bar chart of the share of fraudulent vs. genuine transactions in
# each age decade.
age_groups = [(20, 29), (30, 39), (40, 49), (50, 59), (60, 69), (70, 79)]  # inclusive bounds
age_labels = ['20s', '30s', '40s', '50s', '60s', '70s']
fraud_probabilities = []
not_fraud_probabilities = []

for lower_age, upper_age in age_groups:
    in_group = (data['Age'] >= lower_age) & (data['Age'] <= upper_age)
    subset = data[in_group]
    total_transactions = len(subset)

    # An age bucket without any rows would otherwise raise ZeroDivisionError;
    # record zero-height bars for it instead.
    if total_transactions == 0:
        fraud_probabilities.append(0.0)
        not_fraud_probabilities.append(0.0)
        continue

    fraud_transactions = (subset['Fraud'] == 1).sum()
    not_fraud_transactions = (subset['Fraud'] == 0).sum()
    fraud_probabilities.append(fraud_transactions / total_transactions)
    not_fraud_probabilities.append(not_fraud_transactions / total_transactions)

x = np.arange(len(age_labels))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(x, fraud_probabilities, width, label='Fraud', alpha=0.7)
# Stack the "not fraud" share on top of the "fraud" share.
ax.bar(x, not_fraud_probabilities, width, label='Not Fraud', bottom=fraud_probabilities, alpha=0.7)

ax.set_xlabel('Age Group')
ax.set_ylabel('Probability')
ax.set_title('Fraud and Not Fraud Probability by Age Group')
ax.set_xticks(x)
ax.set_xticklabels(age_labels)
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Mean of the 'Fraud' flag for every distinct 'Time' value, shown as a bar chart.
fraud_probabilities = (
    data
    .groupby('Time')['Fraud']
    .mean()
    .reset_index()
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=fraud_probabilities, x='Time', y='Fraud', palette='Blues', ax=ax)

ax.set_title('Probability of Fraud by Time')
ax.set_xlabel('Time')
ax.set_ylabel('Probability of Fraud')

# Relabel the bars with the grouped 'Time' values, slanted for readability.
ax.set_xticklabels(fraud_probabilities['Time'], rotation=45, ha='right')
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Stacked bar chart of transaction counts per merchant group, split by fraud flag.

# Cross-tabulate merchant group (rows) against the fraud flag (columns).
ct = pd.crosstab(data['Merchant Group'], data['Fraud'])

# Create the figure and axes explicitly and hand the axes to pandas.
# Without `ax=`, DataFrame.plot opens its own new figure, so a preceding
# plt.figure(figsize=...) only produces an extra empty figure and the
# requested size is never applied to the chart.
fig, ax = plt.subplots(figsize=(12, 6))
ct.plot(kind='bar', stacked=True, colormap='Set3', ax=ax)

ax.set_title('Transaction Frequency by Merchant Group')
ax.set_xlabel('Merchant Group')
ax.set_ylabel('Count')
ax.tick_params(axis='x', rotation=90)
ax.legend(title='Fraud', labels=['Not Fraud', 'Fraud'])

plt.tight_layout()
plt.show()

In [None]:
# Fraud rate (mean of the 'Fraud' flag) by country of transaction, shipping
# address and country of residence, drawn as three stacked panels.
fraud_by_transaction_country = data.groupby('Country of Transaction')['Fraud'].mean().reset_index()
fraud_by_shipping_address = data.groupby('Shipping Address')['Fraud'].mean().reset_index()
fraud_by_residence_country = data.groupby('Country of Residence')['Fraud'].mean().reset_index()

# One entry per panel: (aggregated frame, grouping column, rotate x labels?).
panels = [
    (fraud_by_transaction_country, 'Country of Transaction', False),
    (fraud_by_shipping_address, 'Shipping Address', True),
    (fraud_by_residence_country, 'Country of Residence', True),
]

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(10, 12))
fig.subplots_adjust(hspace=0.5)

for ax, (rates, column, rotate_labels) in zip(axes, panels):
    sns.barplot(data=rates, x=column, y='Fraud', ax=ax)
    ax.set_title(f'Fraud Rate by {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Fraud Rate')
    if rotate_labels:
        ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

In [None]:
# Same three location-based fraud-rate panels as a more compact figure with an
# overall title.
fraud_by_transaction_country = data.groupby('Country of Transaction')['Fraud'].mean().reset_index()
fraud_by_shipping_address = data.groupby('Shipping Address')['Fraud'].mean().reset_index()
fraud_by_residence_country = data.groupby('Country of Residence')['Fraud'].mean().reset_index()


def _draw_rate_panel(ax, rates, column, rotate_labels):
    """Draw one fraud-rate bar chart for `column` on `ax`."""
    sns.barplot(data=rates, x=column, y='Fraud', ax=ax)
    ax.set_title('Fraud Rate by ' + column)
    ax.set_xlabel(column)
    ax.set_ylabel('Fraud Rate')
    if rotate_labels:
        ax.tick_params(axis='x', rotation=45)


fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(8, 10))
fig.subplots_adjust(hspace=0.5)

# The first panel keeps horizontal tick labels; the other two are slanted.
_draw_rate_panel(axes[0], fraud_by_transaction_country, 'Country of Transaction', False)
_draw_rate_panel(axes[1], fraud_by_shipping_address, 'Shipping Address', True)
_draw_rate_panel(axes[2], fraud_by_residence_country, 'Country of Residence', True)

fig.suptitle('Fraud Rates by Location', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Fraud rate (in percent) for each value of the 'Gender' column.

# Rows: gender, columns: fraud flag (0 / 1), values: transaction counts.
gender_fraud_counts = data.groupby(['Gender', 'Fraud']).size().unstack(fill_value=0)
fraud_rate = (gender_fraud_counts[1] / (gender_fraud_counts[0] + gender_fraud_counts[1])) * 100

plt.figure(figsize=(8, 6))
sns.set(style="whitegrid")
sns.barplot(x=fraud_rate.index, y=fraud_rate, palette="Set2")
plt.xlabel("Gender")
plt.ylabel("Fraud Rate (%)")
plt.title("Fraud Rate Comparison by Gender")

# Derive the tick labels from the categories actually present instead of
# assuming exactly two bars in the order Female, Male. A positional list
# mislabels the bars if the order differs, and does not fit if the column
# has a different number of categories.
# NOTE(review): assumes the codes are 'F' / 'M' — confirm against the data;
# any other value is shown unchanged.
gender_names = {'F': 'Female', 'M': 'Male'}
tick_labels = [gender_names.get(code, str(code)) for code in fraud_rate.index]
plt.xticks(range(len(tick_labels)), tick_labels)
plt.show()

In [None]:
# Fraud rate (in percent) per bank, drawn as horizontal bars from highest to lowest.

# Rows: bank, columns: fraud flag (0 / 1), values: transaction counts.
bank_fraud_counts = data.groupby(['Bank', 'Fraud']).size().unstack(fill_value=0)
genuine_count = bank_fraud_counts[0]
fraud_count = bank_fraud_counts[1]

fraud_rate = (fraud_count / (genuine_count + fraud_count)) * 100
fraud_rate_sorted = fraud_rate.sort_values(ascending=False)

plt.figure(figsize=(12, 8))
sns.set(style="whitegrid")
ax = sns.barplot(
    x=fraud_rate_sorted.values,
    y=fraud_rate_sorted.index,
    palette="Set2",
)
ax.set_xlabel("Fraud Rate (%)")
ax.set_ylabel("Bank")
ax.set_title("Fraud Rate Comparison by Bank")
plt.show()

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each column from the generated indicator columns.
categorical_columns = [
    'Type of Card',
    'Entry Mode',
    'Type of Transaction',
    'Merchant Group',
    'Country of Transaction',
    'Shipping Address',
    'Country of Residence',
    'Gender',
    'Bank',
]
data_encoded = pd.get_dummies(
    data,
    columns=categorical_columns,
    drop_first=True,
)
data_encoded

In [None]:
# Train and evaluate a logistic regression fraud classifier.

# Remove columns that are not used as model features.
# Rebinding the name with errors='ignore' (instead of an in-place drop) keeps
# `data_encoded` updated for later cells while making this cell re-runnable:
# a second run no longer raises KeyError for already-removed columns.
data_encoded = data_encoded.drop(
    columns=['Transaction ID', 'Date', 'Day of Week'],
    errors='ignore',
)

X = data_encoded.drop('Fraud', axis=1)
y = data_encoded['Fraud']

# 80/20 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the scaler on the training data only, then apply it to both splits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train and predict on the *scaled* features. Previously the scaled arrays
# were computed but never used, so the model was fitted on unscaled data.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 report.
print(classification_report(y_test, y_pred))

In [None]:
# Train and evaluate a random forest fraud classifier on the encoded data.

# Features are every column except the target 'Fraud'.
y = data_encoded['Fraud']
X = data_encoded.drop(columns='Fraud')

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 100 trees, seeded so repeated runs build the same forest.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Score the held-out rows.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1 report.
print(classification_report(y_test, y_pred))