Importing dependencies

In [9]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")
%matplotlib inline

plt.style.use("seaborn-v0_8")
sns.set_palette("husl")

# Make pandas display numbers nicely
pd.options.display.float_format = '{:,.2f}'.format

Loading the data and initial checks

In [10]:
# Location of the raw transaction data, relative to this notebook.
DATA_PATH = "../data/raw/dataset.csv"

# Load the main transaction data file.
# A missing file is re-raised with a clearer message instead of calling
# exit(): inside a Jupyter kernel exit() stops the kernel (discarding all
# state) rather than reporting a readable error in the failing cell.
try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"Could not find '{DATA_PATH}'. "
        "Please make sure 'dataset.csv' is in the data/raw directory."
    ) from err

# 1. Inspect the first few rows
print("--- 1. Head of the Data ---")
print(df.head())
n_rows, n_cols = df.shape
print(f"Dataset shape: {n_rows:,} rows x {n_cols} columns")

# 2. Check the overall structure (rows, columns, data types, missing values)
print("\n--- 2. Data Info ---")
df.info()

# 3. Check for the number of unique customers
# This is a key metric to understand the scale of the customer base.
num_customers = df['CustomerId'].nunique()
print(f"\nTotal number of unique customers: {num_customers}")

--- 1. Head of the Data ---
         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory    ChannelId    Am

Converting `TransactionStartTime` to a timezone-aware (UTC) datetime and creating a local-time (Africa/Kampala) version for easier analysis

In [11]:
# Parse the raw timestamp column as timezone-aware UTC datetimes and store it
# back under the same column name.
start_time_utc = pd.to_datetime(df["TransactionStartTime"], utc=True)
df["TransactionStartTime"] = start_time_utc

# Keep a second copy shifted to the Africa/Kampala timezone so time-of-day
# analysis can use local wall-clock time.
df["TransactionStartTime_local"] = start_time_utc.dt.tz_convert("Africa/Kampala")

# Preview both columns side by side (last expression -> rich cell output).
df[["TransactionStartTime", "TransactionStartTime_local"]].head()

Unnamed: 0,TransactionStartTime,TransactionStartTime_local
0,2018-11-15 02:18:49+00:00,2018-11-15 05:18:49+03:00
1,2018-11-15 02:19:08+00:00,2018-11-15 05:19:08+03:00
2,2018-11-15 02:44:21+00:00,2018-11-15 05:44:21+03:00
3,2018-11-15 03:32:55+00:00,2018-11-15 06:32:55+03:00
4,2018-11-15 03:34:21+00:00,2018-11-15 06:34:21+03:00
