In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import scipy
from scipy.stats import zscore


In [19]:
df1 = pd.read_csv("e:/2025/ExtensoData_Internship_Work/Projects/Task_3_EDA/data/cleaned_transactions.csv")
print(df1.info())

# changing datetime columun to datetime format
df1['transactionDateTime'] = pd.to_datetime(df1['transactionDateTime'])
df1["accountOpenDate"] = pd.to_datetime(df1["accountOpenDate"])
df1['currentExpDate'] = pd.to_datetime(df1['currentExpDate'])
df1['dateOfLastAddressChange'] = pd.to_datetime(df1['dateOfLastAddressChange'])
df1['transactionDateTime'].dtype
print(df1.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641914 entries, 0 to 641913
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             641914 non-null  int64  
 1   customerId                641914 non-null  int64  
 2   creditLimit               641914 non-null  int64  
 3   availableMoney            641914 non-null  float64
 4   transactionDateTime       641914 non-null  object 
 5   transactionAmount         641914 non-null  float64
 6   merchantName              641914 non-null  object 
 7   acqCountry                641914 non-null  object 
 8   merchantCountryCode       641914 non-null  object 
 9   posEntryMode              641914 non-null  float64
 10  posConditionCode          641914 non-null  float64
 11  merchantCategoryCode      641914 non-null  object 
 12  currentExpDate            641914 non-null  object 
 13  accountOpenDate           641914 non-null  o

## Summary Statistics for Numeric Columns

1. What are the min, max, mean, median, and standard deviation of:
    - transactionAmount
    - availableMoney
    - creditLimit
    - currentBalance

2. Are there outliers or unusually high/low values

In [8]:
# Question 1 
df1[['creditLimit', 'availableMoney', 'transactionAmount', 'currentBalance']].describe()



Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance
count,641914.0,641914.0,641914.0,641914.0
mean,10697.210608,6652.828573,135.162497,4044.382035
std,11460.359133,9227.132275,147.053302,5945.510224
min,250.0,-1244.93,0.0,0.0
25%,5000.0,1114.97,32.32,502.4425
50%,7500.0,3578.165,85.8,2151.86
75%,15000.0,8169.185,189.03,5005.89
max,50000.0,50000.0,1825.25,47496.5


Insights gained
1. Based on Std
    - creditLimit : High variation. Some customers have very small limits (250) while others go up to 50000.
    - availableMoney : Wide spread, including negative balances. Suggest different usuage patterns- some maxing out cards, others preserving credit.
    - transactionAmount: Most people spend small amounts , but a few spend a lot in on ego (1825). So the spending pattern is not the same for everyone.
    - currentBalance : Again, wide variablility. Some accounts carry very low(0) while some have very high balances (47496.5)

2. Based on other statistics
    - Most customers have credit limits below 7500, with 75% under 15000. While few premium accounts with 50,000 credit limits.
    - Some accounts appear to be overdrawn (negative values), while 75% have available money below 8000
    - 75% of transactions are below 189 and likely to be everyday purchases. Some with high value like 1825 in one go.
    - Some have large current balance (47,497) while some have zero balance, average is 4,044.

## Exploring Categorical Columns

3. What are the unique values in:
    - merchantCategoryCode
    - transactionType
    - acqCountry
    - merchantCountryCode
    - merchantName
2. What are the most frequet values in those columns?

In [14]:
# unique method for seeing the list of unique values
print("merchantCategoryCode: ", df1['merchantCategoryCode'].unique())
print()
print("transactionType", df1['transactionType'].unique())
print()
print("acqCountry", df1['acqCountry'].unique())
print()
print("merchantName", df1['merchantName'].unique())
print()
print("merchantCountryCode", df1['merchantCountryCode'].unique())
print()
print("posEntryMode",df1['posEntryMode'].unique())
print()
print("posConditionCode", df1['posConditionCode'].unique()) 


# nunique method for count value of unique values
# Print number of unique values for each
print("merchantCategoryCode:", df1['merchantCategoryCode'].nunique())
print("transactionType:", df1['transactionType'].nunique())
print("acqCountry:", df1['acqCountry'].nunique())
print("merchantCountryCode:", df1['merchantCountryCode'].nunique())
print("merchantName:", df1['merchantName'].nunique())
print('posEntryMode',df1['posEntryMode'].nunique())
print('posConditionCode',df1['posConditionCode'].nunique())

      


merchantCategoryCode:  ['rideshare' 'online_gifts' 'personal care' 'fastfood' 'entertainment'
 'online_subscriptions' 'mobileapps' 'fuel' 'food' 'online_retail'
 'airline' 'hotels' 'food_delivery' 'cable/phone' 'subscriptions' 'auto'
 'gym' 'furniture' 'health']

transactionType ['PURCHASE' 'ADDRESS_VERIFICATION' 'REVERSAL']

acqCountry ['US' 'PR' 'MEX' 'CAN']

merchantName ['Lyft' 'Uber' 'Fresh eCards' ... 'Runners #332755' 'Curves #440052'
 'Virgin #218063']

merchantCountryCode ['US' 'PR' 'MEX' 'CAN']

posEntryMode [ 5.  9.  2. 90. 80.]

posConditionCode [ 1.  8. 99.]
merchantCategoryCode: 19
transactionType: 3
acqCountry: 4
merchantCountryCode: 4
merchantName: 2493
posEntryMode 5
posConditionCode 3


## Understanding variable (isFraud)

5. How many transactions are marked as fraud vs not fraud?
6. What is the percentage of fradulent transactions?

In [25]:
#sorting by trasnaction time
sorted_df = df1.sort_values(by='transactionDateTime', ascending=False)

# print(sorted_df.head())

#fraud_transactions = sorted_df[sorted_df['isFraud']].value_counts() # this gives dataframe rows
fraud_transactions = sorted_df['isFraud'].value_counts()
print(fraud_transactions)

#percentage of fradulent transactions:
percentage_of_fraud = (fraud_transactions[1]/(fraud_transactions[0]+fraud_transactions[1]))*100
print(percentage_of_fraud)


isFraud
False    630612
True      11302
Name: count, dtype: int64
1.7606719903289225


  percentage_of_fraud = (fraud_transactions[1]/(fraud_transactions[0]+fraud_transactions[1]))*100


## Customer ID and Account Information

7. How many unique customers are there, and how many accounts per customer?
8. Are there customers with multiple accounts showing unusual activity?
9. Which customers have the highest number of transactions?
10. Do certain customers/accounts have a higher rate of fradulent transactions?

In [36]:
#Unique customer and accounts count
# unique_customers= df1['customerId'].unique()
# print(f"Unique customers count: {len(unique_customers)}")
#output: 5000

unique_accounts = df1['accountNumber'].unique()
print(f"unique_accounts count: {len(unique_accounts)}")
#output: 5000

#(df1['customerId'] == df1['accountNumber']).all()
#checked row by row if both are same and found true

#df1 = df1.drop('customerId', axis=1)
#dropped the customerID column
df1.info()


unique_accounts count: 5000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641914 entries, 0 to 641913
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   accountNumber             641914 non-null  int64         
 1   creditLimit               641914 non-null  int64         
 2   availableMoney            641914 non-null  float64       
 3   transactionDateTime       641914 non-null  datetime64[ns]
 4   transactionAmount         641914 non-null  float64       
 5   merchantName              641914 non-null  object        
 6   acqCountry                641914 non-null  object        
 7   merchantCountryCode       641914 non-null  object        
 8   posEntryMode              641914 non-null  float64       
 9   posConditionCode          641914 non-null  float64       
 10  merchantCategoryCode      641914 non-null  object        
 11  currentExpDate            641914 non-

**Account/Customer with highest number of transactions**

In [40]:
# Group by 'accountNumber' and count the number of transactions (using 'transactionAmount' as a proxy)
transaction_frequency = sorted_df.groupby('accountNumber')['transactionAmount'].count().reset_index()

# Rename the count column for clarity
transaction_frequency = transaction_frequency.rename(columns={'transactionAmount': 'transaction_count'})

# Sort by transaction count in descending order
transaction_frequency_sorted = transaction_frequency.sort_values(by='transaction_count', ascending=False)

# Get the maximum frequency
max_frequency = transaction_frequency_sorted['transaction_count'].max()

# Print the sorted transaction frequency and the maximum frequency
print(transaction_frequency_sorted)
print("Maximum frequency of transactions:", max_frequency)


      accountNumber  transaction_count
1205      318001076              10034
1978      456044564               8382
3945      812328116               5494
4101      838085703               5129
766       239875038               4705
...             ...                ...
1703      406722896                  1
4600      924628692                  1
4903      981518869                  1
4901      981286839                  1
4500      908337167                  1

[5000 rows x 2 columns]
Maximum frequency of transactions: 10034


AccountNumber : 318001076 does highest number of transaction in year 2016 i.e. 10,034

**Do certain customers/accounts have a higher rate of fradulent transactions?**

Steps
1. Find all transactions that a particular account has done
2. Among them find how many are fradulent(isFraud = True)
3. Then calculate percentage of fradulent transactions for that account

In [48]:
total_transactions_frequency= sorted_df.groupby('accountNumber').transactionAmount.count().reset_index()
print(total_transactions_frequency)

fraud_transactions_frequency = sorted_df.groupby("accountNumber")['isFraud'].sum().reset_index().rename(columns ={'isFraud':'fraud_count'})
print(fraud_transactions_frequency)



      accountNumber  transactionAmount
0         100547107                 85
1         100634414                 24
2         100973869                 46
3         101192712                 20
4         101548993                 29
...             ...                ...
4995      999273501                  8
4996      999275549                230
4997      999789077                 72
4998      999984515                 32
4999      999985343                104

[5000 rows x 2 columns]
      accountNumber  fraud_count
0         100547107            1
1         100634414            0
2         100973869            0
3         101192712            0
4         101548993            0
...             ...          ...
4995      999273501            0
4996      999275549            4
4997      999789077            1
4998      999984515            0
4999      999985343            0

[5000 rows x 2 columns]


## posEntryMode and posConditionCode Information

11. What are the most frequent POS entry modes?
12. Is fraud more common in any particular entry mode?
13. Are online (keyed) vs card-present( swiped/tapped) entry modes associated with different transaction amounts?
14. What are the most common POS condition codes?
15. How to different condition codes relate to fraud?
16. Are some condition codes tried to specific merchant categories or regions?

## Date and time features

17. Are there time periods with more fraud?
18. Is fraud more common during weekends or weekday?
19. What is the age of accounts? (opendate to first transaction date)/ (current date - open date)
20. Do newer accounts have more fraud compared to older ones?


**date of Last Address Change**
<br>
21. How many days since the last address change?


**currentExpDate** <br>
22. Are frauds more likely near or after the expiration date?<br>
23. How many cards are close to expiration? Is that related to fraud?