### Data Science


In [1]:
pip install pandas numpy matplotlib seaborn scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [7]:
# Read the simulated gift-card transactions from disk into a DataFrame.
data_path = 'simulated_giftcard_data.csv.txt'
df = pd.read_csv(data_path)

In [9]:
# Take a peek at the first 5 rows of the data.
df.head(5)

Unnamed: 0,transaction_id,user_id,card_type,amount_usd,rate_ngn,timestamp,is_fraud
0,1,39,Amazon,322.34,531.38,2025-01-01 00:00:00,0
1,2,29,Steam,,518.47,2025-01-01 01:00:00,0
2,3,15,Amazon,488.17,466.26,2025-01-01 02:00:00,1
3,4,43,Amazon,262.99,541.09,2025-01-01 03:00:00,0
4,5,8,iTunes,168.25,532.25,2025-01-01 04:00:00,0


In [12]:
# Report the (rows, columns) dimensions of the dataset.
print(f"Shape: {df.shape}")

Shape: (200, 7)


In [14]:
# List the column names present in the dataset.
list(df.columns)

['transaction_id',
 'user_id',
 'card_type',
 'amount_usd',
 'rate_ngn',
 'timestamp',
 'is_fraud']

In [15]:
# Show each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  200 non-null    int64  
 1   user_id         200 non-null    int64  
 2   card_type       200 non-null    object 
 3   amount_usd      190 non-null    float64
 4   rate_ngn        195 non-null    float64
 5   timestamp       200 non-null    object 
 6   is_fraud        200 non-null    int64  
dtypes: float64(2), int64(3), object(2)
memory usage: 11.1+ KB


In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,transaction_id,user_id,amount_usd,rate_ngn,is_fraud
count,200.0,200.0,190.0,195.0,200.0
mean,100.5,24.965,252.104737,497.478154,0.025
std,57.879185,13.999777,149.899295,28.700555,0.156517
min,1.0,1.0,15.31,451.45,0.0
25%,50.75,13.0,122.5175,472.08,0.0
50%,100.5,25.5,254.405,496.08,0.0
75%,150.25,37.0,378.4775,520.87,0.0
max,200.0,49.0,496.55,549.97,1.0


In [19]:
# Count how many transactions were made with each card type.
# NOTE: despite its name, cardtype_dict is a pandas Series
# (index = card type, value = number of rows), not a dict.
cardtype_dict = df['card_type'].value_counts()
# A bare assignment renders nothing, so end the cell with the variable
# to actually display the counts.
cardtype_dict

Amazon     49
Steam      46
Google     41
iTunes     35
Netflix    29
Name: card_type, dtype: int64

In [24]:
# Transactions worth more than 200 USD.
# At this point in the notebook amount_usd still contains NaN values; NaN > 200
# evaluates to False, so rows with a missing amount are not included here.
high_value = df[df['amount_usd'] > 200]
# sep="\n" places the preview on its own line. With the default separator a
# space is printed right after the newline, which shifts the header row of the
# table one character to the right of the data rows.
print("High-Value transactions:", high_value.head(), sep="\n")

High-Value transactions:
    transaction_id  user_id card_type  amount_usd  rate_ngn  \
0               1       39    Amazon      322.34    531.38   
2               3       15    Amazon      488.17    466.26   
3               4       43    Amazon      262.99    541.09   
5               6       21     Steam      399.64    544.98   
7               8       19    iTunes      225.10    511.34   

             timestamp  is_fraud  
0  2025-01-01 00:00:00         0  
2  2025-01-01 02:00:00         1  
3  2025-01-01 03:00:00         0  
5  2025-01-01 05:00:00         0  
7  2025-01-01 07:00:00         0  


In [25]:
# Keep only the rows flagged as fraudulent (is_fraud equal to 1).
is_fraud_row = df['is_fraud'].eq(1)
fraud = df.loc[is_fraud_row]

In [26]:
# Print every row of the fraud subset as plain text.
print(fraud)

     transaction_id  user_id card_type  amount_usd  rate_ngn  \
2                 3       15    Amazon      488.17    466.26   
14               15       24    Amazon       94.91    531.06   
107             108       35    Google      166.46    453.93   
140             141       14   Netflix      191.61    482.75   
144             145       15    iTunes      379.16    517.91   

               timestamp  is_fraud  
2    2025-01-01 02:00:00         1  
14   2025-01-01 14:00:00         1  
107  2025-01-05 11:00:00         1  
140  2025-01-06 20:00:00         1  
144  2025-01-07 00:00:00         1  


# Findings from the data so far

According to the data, the maximum `amount_usd` is 496.55.
The card types and the number of times each appears in the dataset are:

| Card type | Count |
|-----------|-------|
| Amazon    | 49    |
| Steam     | 46    |
| Google    | 41    |
| iTunes    | 35    |
| Netflix   | 29    |

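The fraud cases in the dataset (5 of 200 transactions, i.e. 2.5%) are:

| Row | transaction_id | user_id | card_type | amount_usd | timestamp           | is_fraud |
|-----|----------------|---------|-----------|------------|---------------------|----------|
| 2   | 3              | 15      | Amazon    | 488.17     | 2025-01-01 02:00:00 | 1        |
| 14  | 15             | 24      | Amazon    | 94.91      | 2025-01-01 14:00:00 | 1        |
| 107 | 108            | 35      | Google    | 166.46     | 2025-01-05 11:00:00 | 1        |
| 140 | 141            | 14      | Netflix   | 191.61     | 2025-01-06 20:00:00 | 1        |
| 144 | 145            | 15      | iTunes    | 379.16     | 2025-01-07 00:00:00 | 1        |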


In [29]:
# Count the missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

transaction_id     0
user_id            0
card_type          0
amount_usd        10
rate_ngn           5
timestamp          0
is_fraud           0
dtype: int64


In [31]:
# Replace missing transaction amounts with the column median.
amount_median = df['amount_usd'].median()
df['amount_usd'] = df['amount_usd'].fillna(amount_median)

In [32]:
# Replace missing exchange rates with the column mean.
rate_mean = df['rate_ngn'].mean()
df['rate_ngn'] = df['rate_ngn'].fillna(rate_mean)

In [33]:
# Convert the timestamp column from text to pandas datetime values.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_timestamps

In [34]:
# Drop rows that repeat an earlier row in every column, keeping the first occurrence.
is_repeat = df.duplicated()
df = df[~is_repeat]

In [36]:
# Re-count missing values per column to confirm that none remain after cleaning.
# sep="\n" puts the counts on their own line. With the default separator a space
# is printed right after the newline, which indents the first row of the output
# relative to the rest.
print("After cleaning:", df.isnull().sum(), sep="\n")

After cleaning:
 transaction_id    0
user_id           0
card_type         0
amount_usd        0
rate_ngn          0
timestamp         0
is_fraud          0
dtype: int64
