Initial data exploration to get a sense of the customer shopping-behavior dataset and the crypto market data returned by the CoinGecko API.

In [2]:
import requests
import dotenv
import json
import pandas as pd
import os

# Load settings from a .env file via python-dotenv.
dotenv.load_dotenv()
# Look up COINGECKO_API_KEY in the .env file located by find_dotenv(); the
# value is consumed by fetch_data() further down.
# NOTE(review): presumably this is None when the key or the .env file is
# missing -- confirm against the python-dotenv documentation.
coingecko_api_key = dotenv.get_key(dotenv.find_dotenv(), "COINGECKO_API_KEY")

# Every column of the shopping-behavior CSV, in file order.
shopping_behavior_attributes = [
    "Customer ID", "Age", "Gender", "Item Purchased", "Category",
    "Purchase Amount (USD)", "Location", "Size", "Color", "Season",
    "Review Rating", "Subscription Status", "Shipping Type",
    "Discount Applied", "Promo Code Used", "Previous Purchases",
    "Payment Method", "Frequency of Purchases",
]

# Columns holding numbers. "Customer ID" is listed here as well, although it
# looks like an identifier rather than a measurement.
shopping_behavior_numerical_attributes = [
    "Customer ID", "Age", "Purchase Amount (USD)",
    "Review Rating", "Previous Purchases",
]

# Columns holding labels; these are the ones later cast to the pandas
# "category" dtype.
shopping_behavior_categorical_attributes = [
    "Gender", "Item Purchased", "Category", "Location", "Size", "Color",
    "Season", "Subscription Status", "Shipping Type", "Discount Applied",
    "Promo Code Used", "Payment Method", "Frequency of Purchases",
]

In [3]:
def fetch_data(route, params=None, timeout=10):
    """Fetch one endpoint of the CoinGecko v3 REST API and return its JSON.

    Args:
        route: Path below ``/api/v3/``, e.g. ``"global"`` or
            ``"coins/categories"``.
        params: Optional mapping of extra query-string parameters; requests
            handles the URL encoding.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    url = f"https://api.coingecko.com/api/v3/{route}"

    # CoinGecko's public/demo API expects the key in the "x-cg-demo-api-key"
    # header. Sending it as a header also keeps the secret out of URLs, which
    # tend to end up in logs and tracebacks. Skip it entirely when no key is
    # configured instead of sending the literal string "None".
    headers = {"accept": "application/json"}
    if coingecko_api_key:
        headers["x-cg-demo-api-key"] = coingecko_api_key

    response = requests.get(url, params=params, headers=headers, timeout=timeout)
    response.raise_for_status()
    return response.json()

In [4]:
# Load the shopping-behavior CSV; the path is relative to the notebook's
# working directory.
shopping_behavior_df = pd.read_csv("../data/shopping_behavior_updated.csv")
# Summary statistics as the cell's displayed result. Only the numeric columns
# appear in the output below.
shopping_behavior_df.describe()

Unnamed: 0,Customer ID,Age,Purchase Amount (USD),Review Rating,Previous Purchases
count,3900.0,3900.0,3900.0,3900.0,3900.0
mean,1950.5,44.068462,59.764359,3.749949,25.351538
std,1125.977353,15.207589,23.685392,0.716223,14.447125
min,1.0,18.0,20.0,2.5,1.0
25%,975.75,31.0,39.0,3.1,13.0
50%,1950.5,44.0,60.0,3.7,25.0
75%,2925.25,57.0,81.0,4.4,38.0
max,3900.0,70.0,100.0,5.0,50.0


In [5]:
# Preview the first rows. print() emits the plain-text table (wrapped across
# several column groups) rather than the notebook's rich table display.
print(shopping_behavior_df.head())

   Customer ID  Age Gender Item Purchased  Category  Purchase Amount (USD)  \
0            1   55   Male         Blouse  Clothing                     53   
1            2   19   Male        Sweater  Clothing                     64   
2            3   50   Male          Jeans  Clothing                     73   
3            4   21   Male        Sandals  Footwear                     90   
4            5   45   Male         Blouse  Clothing                     49   

        Location Size      Color  Season  Review Rating Subscription Status  \
0       Kentucky    L       Gray  Winter            3.1                 Yes   
1          Maine    L     Maroon  Winter            3.1                 Yes   
2  Massachusetts    S     Maroon  Spring            3.1                 Yes   
3   Rhode Island    M     Maroon  Spring            3.5                 Yes   
4         Oregon    M  Turquoise  Spring            2.7                 Yes   

   Shipping Type Discount Applied Promo Code Used  Previ

In [6]:
# Query CoinGecko: the global endpoint, the category listing, and three
# individual category endpoints (requested in the order listed).
global_data = fetch_data("global")
categories_data = fetch_data("coins/categories")
trump_data, rwa_data, solana_meme_data = [
    fetch_data(category_route)
    for category_route in (
        "coins/categories/trump-affiliated-tokens",
        "coins/categories/real-world-assets-rwa",
        "coins/categories/solana-meme-coins",
    )
]

# The global payload nests its fields under "data"; the category payloads
# expose their 24h market-cap change at the top level.
global_snapshot = global_data["data"]
timestamp = global_snapshot["updated_at"]
global_market_cap_24h_change = global_snapshot["market_cap_change_percentage_24h_usd"]
(
    trump_market_cap_24h_change,
    rwa_market_cap_24h_change,
    solana_meme_market_cap_24h_change,
) = [
    category_payload["market_cap_change_24h"]
    for category_payload in (trump_data, rwa_data, solana_meme_data)
]

# Report each figure on its own line.
for label, value in (
    ("Data Timestamp:", timestamp),
    ("Global Market Cap 24h Change:", global_market_cap_24h_change),
    ("Trump-Affiliated Tokens Market Cap 24h Change:", trump_market_cap_24h_change),
    ("Real-World Assets (RWA) Market Cap 24h Change:", rwa_market_cap_24h_change),
    ("Solana Meme Coins Market Cap 24h Change:", solana_meme_market_cap_24h_change),
):
    print(label, value)

Data Timestamp: 1761472199
Global Market Cap 24h Change: 1.3331326638116858
Trump-Affiliated Tokens Market Cap 24h Change: 2.3648020441711566
Real-World Assets (RWA) Market Cap 24h Change: 13.24633535362804
Solana Meme Coins Market Cap 24h Change: 1.9776822601907058


In [7]:
# Convert every label column to the pandas "category" dtype, replacing each
# column on the existing DataFrame one at a time.
for col in shopping_behavior_categorical_attributes:
    shopping_behavior_df[col] = shopping_behavior_df[col].astype("category")

def create_social_generations(age):
    """Map an age to a generation label.

    Args:
        age: Numeric age to classify.

    Returns:
        'Gen Z' for ages up to 28, 'Millennial' up to 44, 'Gen X' up to 60,
        'Baby Boomer' up to 79, and 'Silent Generation' for anything else.
    """
    # Inclusive upper bounds, checked from youngest to oldest.
    age_brackets = (
        (28, 'Gen Z'),
        (44, 'Millennial'),
        (60, 'Gen X'),
        (79, 'Baby Boomer'),
    )
    for upper_bound, generation in age_brackets:
        if age <= upper_bound:
            return generation
    return 'Silent Generation'

# Derive a generation label from each customer's age and keep it as a
# categorical column.
shopping_behavior_df['Social Generation'] = (
    shopping_behavior_df['Age']
    .apply(create_social_generations)
    .astype('category')
)

def purchase_amount_category(amount):
    """Map a purchase amount to a spend-tier label.

    Args:
        amount: Numeric purchase amount to classify.

    Returns:
        'Low ($0-30)' for amounts up to 30, 'Medium ($31-50)' up to 50,
        'High ($51-80)' up to 80, and 'Very High ($81+)' for anything else.
    """
    # Inclusive ceilings, checked from cheapest to most expensive.
    spend_tiers = (
        (30, 'Low ($0-30)'),
        (50, 'Medium ($31-50)'),
        (80, 'High ($51-80)'),
    )
    return next(
        (tier for ceiling, tier in spend_tiers if amount <= ceiling),
        'Very High ($81+)',
    )

# Bucket each purchase amount into a spend tier and keep it as a categorical
# column.
shopping_behavior_df['Purchase Category'] = (
    shopping_behavior_df['Purchase Amount (USD)']
    .apply(purchase_amount_category)
    .astype('category')
)

def state_is_blue(state):
    """Classify a state name as 'Blue', 'Red', or 'Other'.

    Despite the name, this does not return a boolean: it returns one of three
    string labels. Matching is exact and case-sensitive.

    Args:
        state: State name to look up, e.g. 'Maine'.

    Returns:
        'Blue' or 'Red' when the name is in the corresponding list below,
        otherwise 'Other'.
    """
    states_by_color = (
        ('Blue', (
            'California', 'Colorado', 'Connecticut', 'Delaware', 'Hawaii',
            'Illinois', 'Maine', 'Maryland', 'Massachusetts', 'Minnesota',
            'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'Oregon',
            'Rhode Island', 'Vermont', 'Virginia', 'Washington',
        )),
        ('Red', (
            'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'Florida', 'Georgia',
            'Idaho', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana',
            'Michigan', 'Mississippi', 'Missouri', 'Montana', 'Nebraska',
            'Nevada', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma',
            'Pennsylvania', 'South Carolina', 'South Dakota', 'Tennessee',
            'Texas', 'Utah', 'West Virginia', 'Wisconsin', 'Wyoming',
        )),
    )

    # Blue is checked before Red, mirroring the order of the lookup table.
    for color, member_states in states_by_color:
        if state in member_states:
            return color
    return 'Other'

# Tag each customer's location as Blue / Red / Other and keep it as a
# categorical column.
shopping_behavior_df['State Color'] = (
    shopping_behavior_df['Location']
    .apply(state_is_blue)
    .astype('category')
)

# Check the resulting dtypes and preview the frame with its new columns.
print(shopping_behavior_df.dtypes)
print(shopping_behavior_df.head())

# Show how the rows are distributed across each derived column.
for derived_column in ('Social Generation', 'Purchase Category', 'State Color'):
    print(shopping_behavior_df[derived_column].value_counts())

Customer ID                  int64
Age                          int64
Gender                    category
Item Purchased            category
Category                  category
Purchase Amount (USD)        int64
Location                  category
Size                      category
Color                     category
Season                    category
Review Rating              float64
Subscription Status       category
Shipping Type             category
Discount Applied          category
Promo Code Used           category
Previous Purchases           int64
Payment Method            category
Frequency of Purchases    category
Social Generation         category
Purchase Category         category
State Color               category
dtype: object
   Customer ID  Age Gender Item Purchased  Category  Purchase Amount (USD)  \
0            1   55   Male         Blouse  Clothing                     53   
1            2   19   Male        Sweater  Clothing                     64   
2            3   

Feature testing:

In [8]:
# Total spend per product category, divided by the number of distinct
# customers in the whole dataset (not just those who bought in the category).
# "Category" is a categorical column, so `observed` is passed explicitly:
# observed=False keeps the current grouping behavior and avoids relying on a
# deprecated implicit default (the source of the warning this cell emitted).
cat = shopping_behavior_df.groupby("Category", observed=False)["Purchase Amount (USD)"].sum()
cat = cat.loc[["Clothing", "Accessories"]]
# n_customers is reused by the next cell.
n_customers = shopping_behavior_df["Customer ID"].nunique()
# Vectorised division gives the same values as applying a lambda per element.
cat = cat / n_customers
print(cat)

Category
Clothing       26.734359
Accessories    19.025641
Name: Purchase Amount (USD), dtype: float64


  cat = shopping_behavior_df.groupby("Category")["Purchase Amount (USD)"].sum()


In [9]:
# Total spend per purchase tier, divided by the number of distinct customers
# (n_customers comes from the previous cell).
# "Purchase Category" is a categorical column, so `observed` is passed
# explicitly: observed=False keeps the current grouping behavior and avoids
# relying on a deprecated implicit default (the source of the warning this
# cell emitted).
cat = shopping_behavior_df.groupby("Purchase Category", observed=False)["Purchase Amount (USD)"].sum()
# Vectorised division gives the same values as applying a lambda per element.
cat = cat / n_customers
print(cat)

Purchase Category
High ($51-80)       23.627179
Low ($0-30)          3.534103
Medium ($31-50)      9.894872
Very High ($81+)    22.708205
Name: Purchase Amount (USD), dtype: float64


  cat = shopping_behavior_df.groupby("Purchase Category")["Purchase Amount (USD)"].sum()


In [22]:
# List every column name, including the three derived columns added above.
list(shopping_behavior_df.columns)

['Customer ID',
 'Age',
 'Gender',
 'Item Purchased',
 'Category',
 'Purchase Amount (USD)',
 'Location',
 'Size',
 'Color',
 'Season',
 'Review Rating',
 'Subscription Status',
 'Shipping Type',
 'Discount Applied',
 'Promo Code Used',
 'Previous Purchases',
 'Payment Method',
 'Frequency of Purchases',
 'Social Generation',
 'Purchase Category',
 'State Color']