In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session. This keeps cell
# output tidy, but it also hides deprecation notices raised by libraries.
warnings.filterwarnings("ignore")

# Never truncate columns when a DataFrame is rendered.
pd.set_option("display.max_columns", None)

# Loading Data

In [None]:
# Read the transactions CSV from the Kaggle input directory into the working frame.
csv_path = "/kaggle/input/comprehensive-credit-card-transactions-dataset/credit_card_transaction_flow.csv"
df = pd.read_csv(csv_path)

# Report (rows, columns), then preview the first three records.
print(df.shape)
df.head(3)

# Data Observation

In [None]:
# Print the frame's column names, dtypes, and non-null counts to spot missing data.
df.info()

In [None]:
# Summary statistics for every column; include="all" covers non-numeric columns too.
df.describe(include="all")

In [None]:
# Replace missing genders with the literal string "None" so they form their own group.
# Assign the result back instead of calling fillna(inplace=True) on df["Gender"]:
# the chained in-place form is deprecated in pandas 2.x and does not modify df
# once Copy-on-Write is enabled.
df["Gender"] = df["Gender"].fillna("None")

In [None]:
# Count rows per gender once and reuse the result for both the printout and the chart.
gender_counts = df["Gender"].value_counts()
print(gender_counts)

# Take the wedge labels from the counts' own index. value_counts() orders by
# frequency, so a hard-coded label list can attach the wrong name to a wedge
# (or fail outright if the number of groups differs).
label = gender_counts.index.tolist()
plt.pie(x=gender_counts, labels=label, startangle=90, autopct="%1.1f%%")

In [None]:
# Distribution of transaction amounts for each gender group.
sns.boxplot(data=df, x="Gender", y="Transaction Amount")

In [None]:
# Min / max / mean / standard deviation of the transaction amount per gender.
amount_stats = ["min", "max", "mean", "std"]
df.groupby("Gender").agg({"Transaction Amount": amount_stats})

In [None]:
# Row counts per (gender, category), pivoted so genders are rows and categories are columns.
category_counts = df.groupby("Gender")["Category"].value_counts()
category_counts.unstack()

In [None]:
# Distribution of transaction amounts within each purchase category.
sns.boxplot(data=df, x="Category", y="Transaction Amount")

## The range of costs varies by category.
## Travel and electronics have large cost ranges, while restaurant and cosmetic have small ranges.

In [None]:
# Parse the day-month-year strings into datetimes.
df['Birthdate'] = pd.to_datetime(df['Birthdate'], format='%d-%m-%Y')
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Age in years at the time of the transaction. Dividing by 365.25 rather than
# 365 accounts for leap days, which would otherwise overstate every age slightly.
df['Age'] = (df['Date'] - df['Birthdate']).dt.days / 365.25
df.head(3)

In [None]:
# Transaction amount against customer age, one point per row.
sns.scatterplot(data=df, x="Age", y="Transaction Amount")

In [None]:
# Age distribution of the rows in each purchase category.
sns.boxplot(data=df, x="Category", y="Age")

In [None]:
# Min / max / mean / standard deviation of age per purchase category.
age_stats = ["min", "max", "mean", "std"]
df.groupby("Category").agg({"Age": age_stats})

## I would have expected both the transaction amounts and the purchase categories to change with age, but that does not seem to be the case.

In [None]:
# Split the transaction date into calendar parts used for time-based grouping.
txn_date = df['Date'].dt
df['year'] = txn_date.year
df['month'] = txn_date.month
df['day'] = txn_date.day
df['day_of_week'] = txn_date.day_name()

In [None]:
# List the distinct years present in the transaction dates.
df["year"].unique()

In [None]:
# Number of rows per month (value_counts orders the result by frequency, not by month).
df["month"].value_counts()

In [None]:
# Distribution of transaction amounts for each calendar month.
sns.boxplot(data=df, x="month", y="Transaction Amount")

In [None]:
# Mean transaction amount per calendar month.
tmp_df = df.groupby("month").agg({"Transaction Amount":"mean"})

# Pass x and y explicitly, as the day and weekday line plots do. This keeps month
# on the x-axis no matter how seaborn interprets a lone positional argument.
sns.lineplot(x=tmp_df.index, y=tmp_df["Transaction Amount"])

In [None]:
# Number of rows per day of the month (ordered by frequency, not by day).
df["day"].value_counts()

In [None]:
# Distribution of transaction amounts for each day of the month.
sns.boxplot(data=df, x="day", y="Transaction Amount")

In [None]:
# Mean transaction amount for each day of the month, indexed by day.
tmp_df = df.groupby("day")[["Transaction Amount"]].mean()

# Plot the means with the day on the x-axis.
sns.lineplot(data=tmp_df.reset_index(), x="day", y="Transaction Amount")

## Does the variation in average transaction amount across the days of the month have anything to do with credit card balance renewal dates, payday, etc.?

In [None]:
# Number of rows per weekday name (ordered by frequency, not by calendar order).
df["day_of_week"].value_counts()

In [None]:
# Distribution of transaction amounts per weekday.
# Pass an explicit order so the boxes run Monday through Sunday; without it
# seaborn infers the category order from the data, which need not be calendar order.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
sns.boxplot(x=df["day_of_week"], y=df["Transaction Amount"], order=weekday_order)

In [None]:
# Calendar order for the weekday names produced by .dt.day_name().
days_in_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

# Mean transaction amount per weekday, with rows rearranged into calendar order.
weekday_means = df.groupby("day_of_week")[["Transaction Amount"]].mean()
tmp_df = weekday_means.reindex(days_in_order)

# Weekday on the x-axis, mean amount on the y-axis.
sns.lineplot(x=tmp_df.index, y=tmp_df["Transaction Amount"])

## Monday seems to have a notably high average transaction amount.