Import the needed libraries

In [31]:

import pandas as pd
import numpy as np
from datetime import datetime

#### Load the data

In [32]:

file_path = r"C:\Users\ASUS\OneDrive\Desktop\Greedy Game\Task 1\1_txn_data.csv"
df = pd.read_csv(file_path)

### Basic data exploration
I will be leveraging basic data exploration functions like columns.tolist(), head(),info(), describe()

In [33]:

print("Dataset shape:", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

Dataset shape: (65936, 13)

Column names:
['adv_id', 'value_in_paise', 'payment_gateway', 'payment_method', 'payment_status', 'created_at', 'app_id', 'from_currency_amount', 'from_currency', 'to_currency_amount', 'to_currency', 'transaction_fee_amount', 'transaction_fee_currency']


In [34]:
# Display first few rows
df.head()

Unnamed: 0,adv_id,value_in_paise,payment_gateway,payment_method,payment_status,created_at,app_id,from_currency_amount,from_currency,to_currency_amount,to_currency,transaction_fee_amount,transaction_fee_currency
0,,3000,RAZORPAY,UPI,PROCESSED,2024-10-18 09:34:14.247708+00:00,sikka,300.0,sikka,30.0,INR,130.0,paise
1,2d3ad81b-a068-460a-9f35-60d7df3c2b78,3000,RAZORPAY,UPI,PROCESSED,2024-10-18 09:34:14.247708+00:00,sikka,300.0,sikka,30.0,INR,130.0,paise
2,3052eb57-f48d-4bb5-9469-bf4e1c71c8a9,3000,RAZORPAY,UPI,PROCESSED,2024-10-18 09:34:14.247708+00:00,sikka,300.0,sikka,30.0,INR,130.0,paise
3,802684aa-0870-4258-9561-aa84f6aa88d8,3000,RAZORPAY,UPI,PROCESSED,2024-10-18 09:34:14.247708+00:00,sikka,300.0,sikka,30.0,INR,130.0,paise
4,92edbd60-5179-47ca-bb6d-122a157a27e0,3000,RAZORPAY,UPI,PROCESSED,2024-10-18 09:34:14.247708+00:00,sikka,300.0,sikka,30.0,INR,130.0,paise


In [35]:
# Check data types and basic info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 65936 entries, 0 to 65935
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   adv_id                    65914 non-null  object 
 1   value_in_paise            65936 non-null  int64  
 2   payment_gateway           65936 non-null  object 
 3   payment_method            65936 non-null  object 
 4   payment_status            65936 non-null  object 
 5   created_at                65936 non-null  object 
 6   app_id                    65936 non-null  object 
 7   from_currency_amount      65936 non-null  float64
 8   from_currency             65936 non-null  object 
 9   to_currency_amount        65936 non-null  float64
 10  to_currency               65936 non-null  object 
 11  transaction_fee_amount    65936 non-null  float64
 12  transaction_fee_currency  58450 non-null  object 
dtypes: float64(3), int64(1), object(9)
memory usage: 6.5+ MB


In [36]:
# Basic statistics for numeric columns
df.describe()

Unnamed: 0,value_in_paise,from_currency_amount,to_currency_amount,transaction_fee_amount
count,65936.0,65936.0,65936.0,65936.0
mean,3616.422728,1777.996668,185.79121,94.414408
std,4188.799537,3266.990251,1345.300481,57.95785
min,500.0,5.0,0.1,0.0
25%,1000.0,50.0,10.0,0.0
50%,3000.0,300.0,30.0,130.0
75%,5000.0,1000.0,50.0,130.0
max,50000.0,50000.0,50000.0,130.0


whilst exploration I understood that, the created_at column does not have datetime datatype, hence to facilitate further EDA, I am converting it to, datetime datatype

In [37]:
# Convert created_at to datetime
df['created_at'] = pd.to_datetime(df['created_at'])

It also makes sense to sort on the basis of date and the specific campaign or client for whom we are facilitating these transactions

In [38]:
# Sort data by adv_id and created_at
df = df.sort_values(['adv_id', 'created_at'])

In [39]:
# Add transaction sequence number for each user
df['transaction_number'] = df.groupby('adv_id').cumcount() + 1

In [40]:
# Check transaction sequence distribution
df['transaction_number'].value_counts().sort_index().head(10)

transaction_number
1.0     28727
2.0      8569
3.0      4033
4.0      2312
5.0      1688
6.0       896
7.0       731
8.0       633
9.0       492
10.0      455
Name: count, dtype: int64

This code finds and counts the first transaction of every user from the full list of transactions. I will be leveraging .copy() function to make a separate copy of this filtered data. The ==1 filters only those rows from the full data where the transaction number is 1.
So, I am keeping only the first transaction of every user

In [41]:
# Get first transactions for each user
first_txns = df[df['transaction_number'] == 1].copy()
print(f"Total users who made first transaction: {len(first_txns)}")

Total users who made first transaction: 28727


I will be repeating the same for the second section whilst changing the logic to ==2

In [42]:
# Get second transactions for each user
second_txns = df[df['transaction_number'] == 2].copy()
print(f"Total users who made second transaction: {len(second_txns)}")

Total users who made second transaction: 8569


In [43]:
# Get second transactions for each user
second_txns = df[df['transaction_number'] == 3].copy()
print(f"Total users who made third transaction: {len(second_txns)}")

Total users who made third transaction: 4033


In [44]:
# Merge first and second transactions
first_second_data = pd.merge(
    first_txns[['adv_id', 'created_at']], 
    second_txns[['adv_id', 'created_at']], 
    on='adv_id', 
    suffixes=('_first', '_second')
)

Once the transactions are merged we can now calculate the days between first and second transactions

In [45]:
# Calculate days between first and second transaction
first_second_data['days_diff'] = (first_second_data['created_at_second'] - 
                                 first_second_data['created_at_first']).dt.days

### Answer Question 1 - Average time to second transaction

In [46]:

avg_days_to_second = first_second_data['days_diff'].mean()
print(f"Average days to second transaction: {avg_days_to_second:.2f}")

Average days to second transaction: 1.50


### Answer Question 2 - Percentage who made second transaction

In [47]:
total_first_users = len(first_txns)
users_with_second = len(second_txns)
second_txn_rate = (users_with_second / total_first_users) * 100
print(f"Percentage of users who made second transaction: {second_txn_rate:.2f}%")

Percentage of users who made second transaction: 14.04%


In [48]:
# To find all transactions after the first one
after_first_txns = df[df['transaction_number'] > 1].copy()
print(f"Total transactions after first: {len(after_first_txns)}")

Total transactions after first: 37187


In [49]:
# Convert value_in_paise to INR for better understanding
after_first_txns['amount_inr'] = after_first_txns['value_in_paise'] / 100

### Answer Question 3 - Average amount after first transaction

In [50]:
avg_amount_after_first = after_first_txns['amount_inr'].mean()
print(f"Average transaction amount after first: {avg_amount_after_first:.2f} INR")

Average transaction amount after first: 45.17 INR


In [51]:
# Cell 20: Summary of all answers
print("-"*50)
print("FINAL ANSWERS:")
print("-"*50)
print(f"1. Average time to second transaction: {avg_days_to_second:.2f} days")
print(f"2. Percentage who made second transaction: {second_txn_rate:.2f}%")
print(f"3. Average amount after first transaction: {avg_amount_after_first:.2f} INR")

--------------------------------------------------
FINAL ANSWERS:
--------------------------------------------------
1. Average time to second transaction: 1.50 days
2. Percentage who made second transaction: 14.04%
3. Average amount after first transaction: 45.17 INR


In [52]:
# The supplementary stats for context
print("\nAdditional Statistics:")
print(f"Total unique users: {df['adv_id'].nunique()}")
print(f"Total transactions: {len(df)}")
print(f"Date range: {df['created_at'].min()} to {df['created_at'].max()}")


Additional Statistics:
Total unique users: 28727
Total transactions: 65936
Date range: 2024-10-16 00:01:41.904262+00:00 to 2024-10-23 23:58:52.890163+00:00


In [53]:
# Transaction frequency by user
user_txn_counts = df['adv_id'].value_counts()
print(f"Users with 1 transaction: {sum(user_txn_counts == 1)}")
print(f"Users with 2 transactions: {sum(user_txn_counts == 2)}")
print(f"Users with 3+ transactions: {sum(user_txn_counts >= 3)}")

Users with 1 transaction: 20158
Users with 2 transactions: 4536
Users with 3+ transactions: 4033


In [54]:
# Average first transaction amount
first_txns['amount_inr'] = first_txns['value_in_paise'] / 100
avg_first_amount = first_txns['amount_inr'].mean()
print(f"Average first transaction amount: {avg_first_amount:.2f} INR")

Average first transaction amount: 24.51 INR


In [55]:
# Distribution of subsequent transaction amounts
print("Distribution of amounts after first transaction:")
print(after_first_txns['amount_inr'].describe())

Distribution of amounts after first transaction:
count    37187.000000
mean        45.166037
std         43.205085
min          5.000000
25%         10.000000
50%         30.000000
75%         50.000000
max        500.000000
Name: amount_inr, dtype: float64


In [56]:
# Payment method analysis
print("\nPayment method distribution:")
print(df['payment_method'].value_counts())


Payment method distribution:
payment_method
UPI                  48173
GIFT_CARD            14149
PAYPAL_ID             2617
TALKTIME               949
LIGHTNING_ADDRESS       44
VENMO_ID                 4
Name: count, dtype: int64


In [57]:
# Payment status analysis
print("\nPayment status distribution:")
print(df['payment_status'].value_counts())


Payment status distribution:
payment_status
PROCESSED     55381
FAILED         8099
PROCESSING     2456
Name: count, dtype: int64


In [58]:
# I am Creating a results dataframe for a wholistic view
results_df = pd.DataFrame({
    'Metric': [
        'Average Days to Second Transaction',
        'Second Transaction Rate (%)',
        'Average Amount After First Transaction (INR)'
    ],
    'Value': [
        f"{avg_days_to_second:.2f}",
        f"{second_txn_rate:.2f}%",
        f"{avg_amount_after_first:.2f}"
    ]
})

results_df

Unnamed: 0,Metric,Value
0,Average Days to Second Transaction,1.50
1,Second Transaction Rate (%),14.04%
2,Average Amount After First Transaction (INR),45.17
