In [95]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
import plotly.express as px 
import scipy
from scipy.stats import zscore


In [96]:
df1 = pd.read_csv("e:/2025/ExtensoData_Internship_Work/Projects/Task_3_EDA/data/cleaned_transactions.csv")
print(df1.info())

# changing datetime columun to datetime format
df1['transactionDateTime'] = pd.to_datetime(df1['transactionDateTime'])
df1["accountOpenDate"] = pd.to_datetime(df1["accountOpenDate"])
df1['currentExpDate'] = pd.to_datetime(df1['currentExpDate'])
df1['dateOfLastAddressChange'] = pd.to_datetime(df1['dateOfLastAddressChange'])
df1['transactionDateTime'].dtype
print(df1.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641914 entries, 0 to 641913
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   accountNumber             641914 non-null  int64  
 1   customerId                641914 non-null  int64  
 2   creditLimit               641914 non-null  int64  
 3   availableMoney            641914 non-null  float64
 4   transactionDateTime       641914 non-null  object 
 5   transactionAmount         641914 non-null  float64
 6   merchantName              641914 non-null  object 
 7   acqCountry                641914 non-null  object 
 8   merchantCountryCode       641914 non-null  object 
 9   posEntryMode              641914 non-null  float64
 10  posConditionCode          641914 non-null  float64
 11  merchantCategoryCode      641914 non-null  object 
 12  currentExpDate            641914 non-null  object 
 13  accountOpenDate           641914 non-null  o

## Summary Statistics for Numeric Columns

1. What are the min, max, mean, median, and standard deviation of:
    - transactionAmount
    - availableMoney
    - creditLimit
    - currentBalance

2. Are there outliers or unusually high/low values

In [97]:
# Question 1 
df1[['creditLimit', 'availableMoney', 'transactionAmount', 'currentBalance']].describe()



Unnamed: 0,creditLimit,availableMoney,transactionAmount,currentBalance
count,641914.0,641914.0,641914.0,641914.0
mean,10697.210608,6652.828573,135.162497,4044.382035
std,11460.359133,9227.132275,147.053302,5945.510224
min,250.0,-1244.93,0.0,0.0
25%,5000.0,1114.97,32.32,502.4425
50%,7500.0,3578.165,85.8,2151.86
75%,15000.0,8169.185,189.03,5005.89
max,50000.0,50000.0,1825.25,47496.5


Insights gained
1. Based on Std
    - creditLimit : High variation. Some customers have very small limits (250) while others go up to 50000.
    - availableMoney : Wide spread, including negative balances. Suggest different usuage patterns- some maxing out cards, others preserving credit.
    - transactionAmount: Most people spend small amounts , but a few spend a lot in on ego (1825). So the spending pattern is not the same for everyone.
    - currentBalance : Again, wide variablility. Some accounts carry very low(0) while some have very high balances (47496.5)

2. Based on other statistics
    - Most customers have credit limits below 7500, with 75% under 15000. While few premium accounts with 50,000 credit limits.
    - Some accounts appear to be overdrawn (negative values), while 75% have available money below 8000
    - 75% of transactions are below 189 and likely to be everyday purchases. Some with high value like 1825 in one go.
    - Some have large current balance (47,497) while some have zero balance, average is 4,044.

## Exploring Categorical Columns

3. What are the unique values in:
    - merchantCategoryCode
    - transactionType
    - acqCountry
    - merchantCountryCode
    - merchantName
2. What are the most frequet values in those columns?

In [98]:
# unique method for seeing the list of unique values
print("merchantCategoryCode: ", df1['merchantCategoryCode'].unique())
print()
print("transactionType", df1['transactionType'].unique())
print()
print("acqCountry", df1['acqCountry'].unique())
print()
print("merchantName", df1['merchantName'].unique())
print()
print("merchantCountryCode", df1['merchantCountryCode'].unique())
print()
print("posEntryMode",df1['posEntryMode'].unique())
print()
print("posConditionCode", df1['posConditionCode'].unique()) 


# nunique method for count value of unique values
# Print number of unique values for each
print("merchantCategoryCode:", df1['merchantCategoryCode'].nunique())
print("transactionType:", df1['transactionType'].nunique())
print("acqCountry:", df1['acqCountry'].nunique())
print("merchantCountryCode:", df1['merchantCountryCode'].nunique())
print("merchantName:", df1['merchantName'].nunique())
print('posEntryMode',df1['posEntryMode'].nunique())
print('posConditionCode',df1['posConditionCode'].nunique())

      


merchantCategoryCode:  ['rideshare' 'online_gifts' 'personal care' 'fastfood' 'entertainment'
 'online_subscriptions' 'mobileapps' 'fuel' 'food' 'online_retail'
 'airline' 'hotels' 'food_delivery' 'cable/phone' 'subscriptions' 'auto'
 'gym' 'furniture' 'health']

transactionType ['PURCHASE' 'ADDRESS_VERIFICATION' 'REVERSAL']

acqCountry ['US' 'PR' 'MEX' 'CAN']

merchantName ['Lyft' 'Uber' 'Fresh eCards' ... 'Runners #332755' 'Curves #440052'
 'Virgin #218063']

merchantCountryCode ['US' 'PR' 'MEX' 'CAN']

posEntryMode [ 5.  9.  2. 90. 80.]

posConditionCode [ 1.  8. 99.]
merchantCategoryCode: 19
transactionType: 3
acqCountry: 4
merchantCountryCode: 4
merchantName: 2493
posEntryMode 5
posConditionCode 3


## Understanding variable (isFraud)

5. How many transactions are marked as fraud vs not fraud?
6. What is the percentage of fradulent transactions?

In [99]:
#sorting by trasnaction time
sorted_df = df1.sort_values(by='transactionDateTime', ascending=False)

# print(sorted_df.head())

#fraud_transactions = sorted_df[sorted_df['isFraud']].value_counts() # this gives dataframe rows
fraud_transactions = sorted_df['isFraud'].value_counts()
print(fraud_transactions)

#percentage of fradulent transactions:
percentage_of_fraud = (fraud_transactions[1]/(fraud_transactions[0]+fraud_transactions[1]))*100
print(percentage_of_fraud)


isFraud
False    630612
True      11302
Name: count, dtype: int64
1.7606719903289225


  percentage_of_fraud = (fraud_transactions[1]/(fraud_transactions[0]+fraud_transactions[1]))*100


## Customer ID and Account Information

7. How many unique customers are there, and how many accounts per customer?
8. Which customers have the highest number of transactions?
9. What is total number of transactions?
10. Calculate fraudlent transaction frequency for each account
11. What is total number of fraudulent transactions?
12. Do certain customers/accounts have a higher rate of fradulent transactions?
13. What is overall fraud rate ?

7. How many unique customers are there, and how many accounts per customer?


In [100]:
#Unique customer and accounts count
# unique_customers= df1['customerId'].unique()
# print(f"Unique customers count: {len(unique_customers)}")
#output: 5000

unique_accounts = df1['accountNumber'].unique()
print(f"unique_accounts count: {len(unique_accounts)}")
#output: 5000

#(df1['customerId'] == df1['accountNumber']).all()
#checked row by row if both are same and found true

#df1 = df1.drop('customerId', axis=1)
#dropped the customerID column
df1.info()


unique_accounts count: 5000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641914 entries, 0 to 641913
Data columns (total 26 columns):
 #   Column                    Non-Null Count   Dtype         
---  ------                    --------------   -----         
 0   accountNumber             641914 non-null  int64         
 1   customerId                641914 non-null  int64         
 2   creditLimit               641914 non-null  int64         
 3   availableMoney            641914 non-null  float64       
 4   transactionDateTime       641914 non-null  datetime64[ns]
 5   transactionAmount         641914 non-null  float64       
 6   merchantName              641914 non-null  object        
 7   acqCountry                641914 non-null  object        
 8   merchantCountryCode       641914 non-null  object        
 9   posEntryMode              641914 non-null  float64       
 10  posConditionCode          641914 non-null  float64       
 11  merchantCategoryCode      641914 non-

8. Which customers have the highest number of transactions?
9. What is total number of transactions?

In [101]:


# This counts all rows per account, including those with transactionAmount == 0
total_transactions_frequency = sorted_df.groupby('accountNumber').size().reset_index(name='transaction_count')
print("Total Transactions Frequency:")
print(total_transactions_frequency)

print("Top 10 Highest transaction count and account number")
print(total_transactions_frequency.sort_values(by='transaction_count', ascending=False).head(10))

print(f"\n🔢 Total number of transactions across all accounts: {total_transactions_frequency['transaction_count'].sum()}")



Total Transactions Frequency:
      accountNumber  transaction_count
0         100547107                 85
1         100634414                 24
2         100973869                 46
3         101192712                 20
4         101548993                 29
...             ...                ...
4995      999273501                  8
4996      999275549                230
4997      999789077                 72
4998      999984515                 32
4999      999985343                104

[5000 rows x 2 columns]
Top 10 Highest transaction count and account number
      accountNumber  transaction_count
1205      318001076              10034
1978      456044564               8382
3945      812328116               5494
4101      838085703               5129
766       239875038               4705
4325      877017103               4435
994       278064853               4227
1406      353215513               3756
1185      314506271               3410
4555      917216469               3

AccountNumber : 318001076 does highest number of transaction in year 2016 i.e. 10,034

9. Calculate fraudlent transaction frequency for each account
10. What is total number of fraudulent transactions?



In [102]:

# Fraud transactions frequency
fraud_transactions_frequency = (
    sorted_df.groupby("accountNumber")['isFraud']
    .sum()
    .reset_index()
    .rename(columns={'isFraud': 'fraud_count'})
)
print(fraud_transactions_frequency)

# Now sort the DataFrame by the newly named column
print("Top 10 Highest fraudulent transaction count and account number")
print(fraud_transactions_frequency.sort_values(by='fraud_count', ascending=False).head(10))

print(f"🚨 Total number of fraudulent transactions across all accounts: {fraud_transactions_frequency['fraud_count'].sum()}")




      accountNumber  fraud_count
0         100547107            1
1         100634414            0
2         100973869            0
3         101192712            0
4         101548993            0
...             ...          ...
4995      999273501            0
4996      999275549            4
4997      999789077            1
4998      999984515            0
4999      999985343            0

[5000 rows x 2 columns]
Top 10 Highest fraudulent transaction count and account number
      accountNumber  fraud_count
1165      311710839          302
1205      318001076          291
1406      353215513          205
840       251625315          142
4014      822203001          118
1978      456044564           95
3405      717128593           81
4325      877017103           78
512       200778999           76
4629      931013168           74
🚨 Total number of fraudulent transactions across all accounts: 11302


11. What is fraud rate per account?
12. Do certain customers/accounts have a higher rate of fradulent transactions?
13. What is overall fraud rate ?

In [103]:
# Merge the two DataFrames on 'accountNumber'
merged_data = pd.merge(total_transactions_frequency, fraud_transactions_frequency, on='accountNumber', how='left')

# Calculate the percentage of fraudulent transactions per account
merged_data['fraud_percentage'] = (merged_data['fraud_count'] / merged_data['transaction_count']) * 100

# Handling missing fraud counts: Replace NaN with 0 (if an account has no fraud)
merged_data['fraud_percentage'] = merged_data['fraud_percentage'].fillna(0)

# Print the percentage of fraudulent transactions for the top 10 accounts, along with accountNumber
print("Top 10 Accounts with highest fraud percentage and their fraud percentage")
print(merged_data[['accountNumber', 'fraud_percentage']].sort_values(by='fraud_percentage', ascending=False).head(10))

# ✅ Total number of transactions in the entire dataset
total_transactions = total_transactions_frequency['transaction_count'].sum()

# ✅ Total number of fraudulent transactions
fraud_transactions_frequency['fraud_count'] = fraud_transactions_frequency['fraud_count'].fillna(0)
total_fraud_transactions = fraud_transactions_frequency['fraud_count'].sum()
print(f"🚨 Total number of fraudulent transactions across all accounts: {int(total_fraud_transactions)}")

overall_fraud_rate = (total_fraud_transactions / total_transactions) * 100
print(f"⚠️ Overall fraud rate in the dataset: {overall_fraud_rate:.2f}%")

print(merged_data.info())



Top 10 Accounts with highest fraud percentage and their fraud percentage
      accountNumber  fraud_percentage
4901      981286839        100.000000
2969      638423733         50.000000
3333      704819779         42.682927
2262      509442666         33.333333
2912      625998940         33.333333
4670      938310147         31.606218
310       163457367         31.578947
1340      340949807         30.769231
721       234537883         26.595745
4626      930431098         25.352113
🚨 Total number of fraudulent transactions across all accounts: 11302
⚠️ Overall fraud rate in the dataset: 1.76%
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accountNumber      5000 non-null   int64  
 1   transaction_count  5000 non-null   int64  
 2   fraud_count        5000 non-null   int64  
 3   fraud_percentage   5000 non-null   float64
dt

Insights
1. Highest number of transaction is 10,034 by account number 318001076.
2. Highest number of fraudulent transaction is 302 by account number 311710839.
3. Fraud_percentage is 100% for account 981286839 , and 50% for account 638423733.
4. Total number of transactions acroll all accounts: 641914
5. Total number of fraudulent transactions across all accounts: 11302
6. Overall fraud rate is 1.76%.

14. Normally after how long time of opening account the address is changed?
15. From today's date it we see which are the oldest and newest accounts
16. In which day maximum accounts are opened?
17. If enteredCVV and cardCVV mismatches, keep those accounts separate (F1)
if expirationDateKeyInMatch is also False, keep those accounts separate (F2)

So, if card present is false, also falls in F1 and F2 then they can be fraudulent 

18. Whose customers card expiration date is near?


In [104]:
# List of columns related to customer information
customer_columns = [
    'accountNumber',
    'customerId',
    'creditLimit',
    'availableMoney',
    'cardCVV',
    'enteredCVV',
    'cardLast4Digits',
    'currentExpDate',
    'accountOpenDate',
    'dateOfLastAddressChange',
    'currentBalance',
    'expirationDateKeyInMatch',
    'cardPresent'
]

# # Create the customer DataFrame
customer_df = df1[customer_columns].drop_duplicates(subset='accountNumber').reset_index(drop=True)


# Preview the DataFrame
print("Customer DataFrame:")
print(customer_df.head())

print(df1.info())


Customer DataFrame:
   accountNumber  customerId  creditLimit  availableMoney  cardCVV  \
0      733493772   733493772         5000          5000.0      492   
1      578754168   578754168         5000          5000.0      413   
2      664577296   664577296         7500          7500.0      741   
3      550655231   550655231          250           250.0      631   
4      965952336   965952336         7500          7500.0      258   

   enteredCVV  cardLast4Digits currentExpDate accountOpenDate  \
0         492             9184     2020-01-04      2014-08-03   
1         413             5279     2026-01-10      2015-10-21   
2         741             4807     2026-01-08      2015-09-23   
3         631             9626     2028-01-07      2013-03-15   
4         258             1562     2025-01-02      2013-01-11   

  dateOfLastAddressChange  currentBalance  expirationDateKeyInMatch  \
0              2014-08-03             0.0                     False   
1              2015-10-21 

14. Normally after how long time of opening account the address is changed?

In [105]:
customer_df['add_change_gap'] = (customer_df['dateOfLastAddressChange']- customer_df['accountOpenDate']).dt.days
print(customer_df[['accountNumber','add_change_gap']].sort_values(by='add_change_gap', ascending=False).reset_index(drop=True).head(10))

average_gap = customer_df['add_change_gap'].mean()
print(f"\nOverall average address change gap (in days): {average_gap: .2f}")

print("After opening account how many customer have changed the address")
changed_after_opening = customer_df[customer_df['add_change_gap']>0]

print(f"\nNumber of customers who changed the address after opening account: {changed_after_opening.shape[0]} ")
print(changed_after_opening[['accountNumber','add_change_gap']].sort_values(by='add_change_gap',ascending=True).reset_index(drop=True))

   accountNumber  add_change_gap
0      424347107            2297
1      876565936            2170
2      550842881            1654
3      936339359             902
4      502168963             561
5      832864654             550
6      212076667             545
7      205263030             529
8      237591967             473
9      646461105             465

Overall average address change gap (in days):  2.47
After opening account how many customer have changed the address

Number of customers who changed the address after opening account: 21 
    accountNumber  add_change_gap
0       280983864              55
1       617179522              59
2       247722846              80
3       353484467             189
4       781053864             212
5       917661867             225
6       656091817             249
7       727407779             260
8       975854164             274
9       626399878             274
10      607003709             310
11      646461105             465
12   

15. From today's date it we see which are the oldest and newest accounts

In [106]:
#customer_df = customer_df.drop(columns=['accountAge'])
today = pd.Timestamp.today()
customer_df['accountAge'] = (today - customer_df['accountOpenDate']).dt.days

print("\nTop 5 oldest accounts:")
print(customer_df[['accountNumber','accountOpenDate','accountAge']]
      .sort_values(by='accountAge', ascending=False)
      .reset_index(drop=True)
      .head())

print("\nTop 5 new accounts")
print(customer_df[['accountNumber','accountOpenDate','accountAge']]
      .sort_values(by='accountAge',ascending=True)
      .reset_index(drop=True)
      .head())

#Oldest account
oldest_account = customer_df.loc[customer_df['accountAge'].idxmax()]
print(f"\n The oldest account is:\nAccount Number: {oldest_account['accountNumber']},"
      f"Opened on: {oldest_account['accountOpenDate'].date()} ,"
      f" Age: {oldest_account['accountAge']} days ")

#finding the newest account

newest_account = customer_df.loc[customer_df['accountAge'].idxmin()]

print(f"\n The newest account is:\nAccount Number: {newest_account['accountNumber']},"
      f"Opened on: {newest_account['accountOpenDate'].date()} ,"
      f" Age: {newest_account['accountAge']} days ")


Top 5 oldest accounts:
   accountNumber accountOpenDate  accountAge
0      789836144      1985-12-25       14373
1      367184291      2000-09-29        8981
2      414475732      2000-11-16        8933
3      347596074      2002-08-15        8296
4      719425571      2002-10-20        8230

Top 5 new accounts
   accountNumber accountOpenDate  accountAge
0      890796581      2015-12-31        3410
1      605704714      2015-12-31        3410
2      731926026      2015-12-31        3410
3      477081008      2015-12-31        3410
4      783343390      2015-12-31        3410

 The oldest account is:
Account Number: 789836144,Opened on: 1985-12-25 , Age: 14373 days 

 The newest account is:
Account Number: 731926026,Opened on: 2015-12-31 , Age: 3410 days 


16. In which day maximum accounts are opened?

In [107]:
account_open_day_counts = customer_df.groupby('accountOpenDate').size()

max_open_day = account_open_day_counts.idxmax()
max_open_count = account_open_day_counts.max()

print("\n Top 5 days with most account openings:")
print(account_open_day_counts.sort_values(ascending=False).head(50))

print(f"The maximum number of accounts were opened on {max_open_day.date()},with {max_open_count} accounts.")




 Top 5 days with most account openings:
accountOpenDate
2015-11-21    13
2015-12-26    13
2015-11-01    13
2015-09-22    12
2015-05-08    12
2015-07-26    12
2015-11-05    12
2015-01-25    11
2015-12-09    11
2015-11-03    11
2015-05-25    11
2015-02-11    11
2015-11-22    11
2015-03-02    11
2015-11-27    10
2015-06-21    10
2014-04-12    10
2015-12-22    10
2015-09-16    10
2015-10-01    10
2015-10-17    10
2015-08-19     9
2015-09-14     9
2015-08-12     9
2015-11-25     9
2015-07-15     9
2015-07-21     9
2015-07-18     9
2015-07-08     9
2015-12-18     9
2015-12-21     9
2015-06-03     9
2015-06-10     9
2015-08-15     9
2015-08-03     9
2015-07-23     9
2015-08-23     9
2015-11-19     9
2013-09-25     9
2015-07-24     9
2015-05-03     9
2015-10-04     9
2015-10-15     9
2014-11-26     8
2014-11-30     8
2015-09-24     8
2015-01-15     8
2015-08-26     8
2015-08-27     8
2015-01-14     8
dtype: int64
The maximum number of accounts were opened on 2015-11-01,with 13 accounts.


17. Check mismatches between entered CVV and actual CVV and calculate mismatch_percentage



In [108]:
# Step 1: Create a new column to mark mismatches between entered and actual CVV
sorted_df['cvv_mismatch'] = sorted_df['enteredCVV'] != sorted_df['cardCVV']

print(sorted_df.groupby('accountNumber')['cvv_mismatch'].sum().sort_values(ascending=False))

# Step 2: Group by account and calculate total transactions and mismatches
fraudulent_df = sorted_df.groupby('accountNumber').agg(
    total_transactions=('cvv_mismatch', 'count'),
    mismatch_count=('cvv_mismatch', 'sum')
).reset_index()

# Step 3: Calculate percentage of mismatches
fraudulent_df['mismatch_percentage'] = (fraudulent_df['mismatch_count'] / fraudulent_df['total_transactions']) * 100



#fradulent_df only contains those accounts whose cvv mismatch_percentage is greater than 0

fraudulent_df = fraudulent_df[fraudulent_df['mismatch_percentage']>0]

print(fraudulent_df.sort_values(by='mismatch_percentage',ascending=False).head(10))

# Filter accounts with more than 10% mismatch, then sort by mismatch percentage
print(fraudulent_df[fraudulent_df['mismatch_percentage'] > 10].sort_values(by='mismatch_percentage', ascending=False).head(40).reset_index(drop=True))








accountNumber
318001076    83
456044564    72
812328116    59
838085703    54
877017103    54
             ..
101970909     0
102085969     0
102307135     0
102537526     0
102635965     0
Name: cvv_mismatch, Length: 5000, dtype: int64
      accountNumber  total_transactions  mismatch_count  mismatch_percentage
4387      890964842                  23              22            95.652174
752       238223440                  10               9            90.000000
3281      697174743                  16              13            81.250000
4344      880748396                  14              11            78.571429
66        116649028                   4               3            75.000000
1244      324884038                  13               8            61.538462
3400      716448276                  37              22            59.459459
2928      628545009                  23              13            56.521739
4045      827592181                  50              27            54.

We checked for enteredCVV and  cardCVV mismatch, first we counted the total transactions number and then mismatch count for each account number and then grouped those accounts whose mismatch_percentage is greater than 0 in a dataframe fradulent_df.

In fradulent_df one account with 95.65% mismatch rate, similary 39 accounts with mismatch rate above 10%.



CVV Mismatch and isFraud comparison

Checking expirationDateKeyInMatch

In [109]:

print(sorted_df['expirationDateKeyInMatch'].unique())
#all values have True Value


[False  True]


18. Now check cardpresent = False case

In [110]:
# Step 1: Check distinct values in 'cardPresent' column
print(sorted_df['cardPresent'].unique())
print(sorted_df['cardPresent'].value_counts(dropna=False))

# Step 2: Filter data for card-absent transactions
card_absent_df = sorted_df[sorted_df['cardPresent'] == False]

# Step 3: Group and compute mismatch stats for card-absent transactions
fraudulent_card_absent_df = card_absent_df.groupby('accountNumber').agg(
    total_transactions_card_absent=('cvv_mismatch', 'count'),
    cvv_mismatch_count_card_absent=('cvv_mismatch', 'sum')
).reset_index()

# Step 4: Calculate mismatch percentage
fraudulent_card_absent_df['cvv_mismatch_percentage_card_absent'] = (
    fraudulent_card_absent_df['cvv_mismatch_count_card_absent'] / fraudulent_card_absent_df['total_transactions_card_absent']
) * 100


total_mismatches_while_card_absent = card_absent_df['cvv_mismatch'].sum()
print("Total CVV mismatches (card absent):", total_mismatches_while_card_absent)

# Step 6: Calculate totals
total_transactions = fraudulent_card_absent_df['total_transactions_card_absent'].sum()
total_mismatches = fraudulent_card_absent_df['cvv_mismatch_count_card_absent'].sum()

# Step 7: Calculate overall percentage
overall_mismatch_percentage = (total_mismatches / total_transactions) * 100

# Step 8: Create a summary row as a DataFrame
summary_row = pd.DataFrame({
    'accountNumber': ['TOTAL'],
    'total_transactions_card_absent': [total_transactions],
    'cvv_mismatch_count_card_absent': [total_mismatches],
    'cvv_mismatch_percentage_card_absent': [overall_mismatch_percentage]
})

# Step 9: Append the summary row to the original DataFrame
fraudulent_card_absent_df = pd.concat([fraudulent_card_absent_df, summary_row], ignore_index=True)

# # Step 10: Print the final DataFrame
# print(fraudulent_df1)

# print("Top 20 accounts with card absent and high cvv mismatch rate")
# print(fraudulent_df1.sort_values(by='cvv_mismatch_percentage_card_absent', ascending=False).head(20).reset_index(drop=True))

print("Creating fraudulent_df2 to store data for cvv_mismatch_percentage_card_absent greater than 10%")

fraudulent_card_absent_df1 = fraudulent_card_absent_df[fraudulent_card_absent_df['cvv_mismatch_percentage_card_absent']>10]
print(fraudulent_card_absent_df1.sort_values(by='cvv_mismatch_percentage_card_absent',ascending=False).reset_index(drop=True))


[ True False]
cardPresent
False    340453
True     301461
Name: count, dtype: int64
Total CVV mismatches (card absent): 3120
Creating fraudulent_df2 to store data for cvv_mismatch_percentage_card_absent greater than 10%
   accountNumber  total_transactions_card_absent  \
0      238223440                               9   
1      443926651                               1   
2      697174743                              14   
3      628545009                              14   
4      773192558                              10   
5      880748396                              13   
6      966761447                               4   
7      116649028                               4   
8      626702583                               7   
9      380988777                               9   
10     369024894                              11   
11     897359425                               2   
12     519440473                              17   
13     560627602                              22   


Total transaction with card not present( card absent) are: 340453
Total trasnaction with card absent and cvvMismatch are: 3120
For this case cvv_mismatch_percentage_card_present > 10 are kept in dataframe: fraudulent_card_absent_df1
There are accounts with 100% mismatch percentage (238223440 - 9/9), (443926651 - 1/1)

19. cardPresent = True case

In [111]:
# Step 1: Filter data for card-present transactions
card_present_df = sorted_df[sorted_df['cardPresent'] == True]

# Step 2: Group and compute CVV mismatch stats for card-present transactions
fraudulent_card_present_df = card_present_df.groupby('accountNumber').agg(
    total_transactions_card_present=('cvv_mismatch', 'count'),
    cvv_mismatch_count_card_present=('cvv_mismatch', 'sum')
).reset_index()

# Step 3: Calculate mismatch percentage
fraudulent_card_present_df['cvv_mismatch_percentage_card_present'] = (
    fraudulent_card_present_df['cvv_mismatch_count_card_present'] / fraudulent_card_present_df['total_transactions_card_present']
) * 100

# Step 4: Add summary row (totals)
summary_row_present = pd.DataFrame({
    'accountNumber': ['TOTAL'],
    'total_transactions_card_present': [fraudulent_card_present_df['total_transactions_card_present'].sum()],
    'cvv_mismatch_count_card_present': [fraudulent_card_present_df['cvv_mismatch_count_card_present'].sum()],
    'cvv_mismatch_percentage_card_present': [
        (fraudulent_card_present_df['cvv_mismatch_count_card_present'].sum() / 
         fraudulent_card_present_df['total_transactions_card_present'].sum()) * 100
    ]
})

fraudulent_card_present_df = pd.concat([fraudulent_card_present_df, summary_row_present], ignore_index=True)
print(fraudulent_card_present_df)

# Step 5: Filter accounts with >10% mismatch
fraudulent_card_present_df1 = fraudulent_card_present_df[
    fraudulent_card_present_df['cvv_mismatch_percentage_card_present'] > 10
]

# Display results
print(fraudulent_card_present_df1.sort_values(by='cvv_mismatch_percentage_card_present', ascending=False).reset_index(drop=True).head(10))


     accountNumber  total_transactions_card_present  \
0        100547107                               74   
1        100634414                                6   
2        100973869                                2   
3        101548993                               22   
4        101660233                               32   
...            ...                              ...   
4513     999246377                               29   
4514     999789077                               46   
4515     999984515                               25   
4516     999985343                               58   
4517         TOTAL                           301461   

      cvv_mismatch_count_card_present  cvv_mismatch_percentage_card_present  
0                                   1                              1.351351  
1                                   0                              0.000000  
2                                   0                              0.000000  
3                          

Total transaction with card present are: 301461
Total trasnaction with card present and cvvMismatch are: 2817
For this case cvv_mismatch_percentage_card_present > 10 are kept in dataframe: fraudulent_card_present_df1
There are accounts with 100% mismatch percentage (140105230 - 11/11), (380948187 - 1/1), (386190390 - 1/1)




20. CVV Mismatch and isFraud Comparison

In [112]:
sorted_df.shape
sorted_df.columns
fraud_crosstab = pd.crosstab(
    sorted_df['cvv_mismatch'],
    sorted_df['isFraud'],
    rownames=['CVV Mismatch'],
    colnames=['Is Fraud'],
    margins=True  # adds totals for rows and columns
)

print(fraud_crosstab)





Is Fraud       False   True     All
CVV Mismatch                       
False         624870  11107  635977
True            5742    195    5937
All           630612  11302  641914


Most transactions (635,977 out of 641,914) did not have a CVV mismatch.

CVV mismatch occurred in only ~0.92% of transactions (5937 / 641914).

Fraud rate when CVV matched:

11,107 / 635,977 ≈ 1.75%

Fraud rate when CVV mismatched:

195 / 5,937 ≈ 3.28%



## posEntryMode and posConditionCode Information

11. What are the most frequent POS entry modes?
12. Is fraud more common in any particular entry mode?
13. Are online (keyed) vs card-present( swiped/tapped) entry modes associated with different transaction amounts?
14. What are the most common POS condition codes?
15. How to different condition codes relate to fraud?
16. Are some condition codes tried to specific merchant categories or regions?

## Date and time features

17. Are there time periods with more fraud?
18. Is fraud more common during weekends or weekday?
19. What is the age of accounts? (opendate to first transaction date)/ (current date - open date)
20. Do newer accounts have more fraud compared to older ones?


**date of Last Address Change**
<br>
21. How many days since the last address change?


**currentExpDate** <br>
22. Are frauds more likely near or after the expiration date?<br>
23. How many cards are close to expiration? Is that related to fraud?