<a href="https://colab.research.google.com/github/Aljawharah004/Bank-Analysis/blob/main/datadrip.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
df_transaction = pd.read_csv('transactions_data.csv')
import matplotlib.pyplot as plt
import seaborn as sns

## MCC API

In [6]:
!pip install requests pandas mastercard-oauth1-signer



### Making MCC fetch resilient

In [14]:
import time, json, os, pandas as pd, requests
from tenacity import retry, wait_exponential_jitter, stop_after_attempt
import oauth1.authenticationutils as authenticationutils
from oauth1.oauth_ext import OAuth1RSA

# ==== CONFIG ====
CONSUMER_KEY = "OOPKSjljmCim6R4XEytMtGH8AYRu-wh3GMl1mmnh4285d91f!4cf35e5c1c404a1aaf934d99fb0bdf1f0000000000000000"  # from Mastercard Developer portal
P12_PATH     = "444200750-sandbox.p12"   # the file you uploaded
P12_PASSWORD = "9v3ThM5M4ggmF!RPfGSfHG6v7y!JC"                # set when you downloaded the key
BASE_URL     = "https://sandbox.api.mastercard.com/places"
OUT_PATH     = "mcc_lookup_mastercard.csv"
CACHE_PATH   = "mcc_pages_cache.jsonl"  # page-by-page cache

# OAuth
signing_key = authenticationutils.load_signing_key(P12_PATH, P12_PASSWORD)
oauth = OAuth1RSA(CONSUMER_KEY, signing_key)

# Retry for transient 5xx (503 etc.)
@retry(wait=wait_exponential_jitter(initial=1, max=20), stop=stop_after_attempt(6))
def fetch_page(limit: int, offset: int):
    url = f"{BASE_URL}/merchant-category-codes"
    params = {
        "limit": limit,
        "offset": offset,
        # if backend is touchy, try removing sort entirely:
        # "sort": "+merchantCategoryName"
    }
    r = requests.get(url, params=params, auth=oauth, headers={"Accept": "application/json"}, timeout=30)
    # Raise on HTTP errors (503 included) so tenacity retries
    r.raise_for_status()
    return r.json().get("merchantCategoryCodes", [])

def save_page_cache(items):
    with open(CACHE_PATH, "a", encoding="utf-8") as f:
        for it in items:
            f.write(json.dumps(it, ensure_ascii=False) + "\n")

def load_cache_df():
    if not os.path.exists(CACHE_PATH):
        return pd.DataFrame()
    rows = [json.loads(line) for line in open(CACHE_PATH, "r", encoding="utf-8")]
    return pd.DataFrame(rows)

# Try to resume from cache if any
df_cached = load_cache_df()
seen_count = len(df_cached)
print(f"Cached rows found: {seen_count}")

rows_total = []
if seen_count:
    rows_total.extend(df_cached.to_dict("records"))

# Page through with conservative settings (smaller pages help under load)
limit = 100
offset = 0 if not seen_count else (seen_count // limit) * limit

while True:
    try:
        items = fetch_page(limit, offset)
    except Exception as e:
        # last-ditch: small sleep and one manual retry for this offset
        time.sleep(3)
        try:
            items = fetch_page(limit, offset)
        except Exception as e2:
            print(f"Stopping at offset={offset} due to repeated errors: {e2}")
            break

    if not items:
        break

    save_page_cache(items)
    rows_total.extend(items)

    print(f"Fetched {len(items)} at offset {offset} (total so far: {len(rows_total)})")
    if len(items) < limit:
        break
    offset += limit

# Build DataFrame from all rows (cache + new)
mcc_df = pd.DataFrame(rows_total)
if mcc_df.empty:
    raise RuntimeError("No MCC rows fetched; try again in a few minutes.")

# Normalize column names (adjust if your JSON uses different keys)
mcc_df = mcc_df.rename(columns={
    "merchantCategoryCode": "mcc",
    "merchantCategoryName": "mcc_name",
    "industryCode": "industry_code",
    "industryName": "industry_name"
})
mcc_df["mcc"] = mcc_df["mcc"].astype(str).str.zfill(4)
mcc_df = mcc_df.drop_duplicates(subset=["mcc"])

mcc_df.to_csv(OUT_PATH, index=False)
print(f"Saved {len(mcc_df)} MCC rows to {OUT_PATH}")

Cached rows found: 0
Stopping at offset=0 due to repeated errors: RetryError[<Future at 0x7acd1789de90 state=finished raised HTTPError>]


RuntimeError: No MCC rows fetched; try again in a few minutes.

### Mapping MCC names

In [None]:
# 1) Load transactions & MCC lookup
tx = pd.read_csv("transactions.csv")  # or read_excel(...)
mcc_lookup = pd.read_csv("mcc_lookup_mastercard.csv", dtype={"mcc": str})

# 2) Normalize MCC in tx and merge
tx["mcc_str"] = tx["mcc"].astype(str).str.extract(r"(\d{4})")[0].str.zfill(4)
tx_enriched = tx.merge(mcc_lookup, left_on="mcc_str", right_on="mcc", how="left")

# 3) Bucket for cleaner visuals
def bucketize(name):
    n = (name or "").lower()
    if any(k in n for k in ["grocery", "supermarket", "convenience"]): return "Groceries"
    if any(k in n for k in ["restaurant", "dining", "fast food", "cafe"]): return "Food & Dining"
    if any(k in n for k in ["hotel", "motel", "lodging"]): return "Travel - Lodging"
    if any(k in n for k in ["airline", "airport", "taxi", "transport"]): return "Travel - Transport"
    if any(k in n for k in ["fuel", "gas", "petrol"]): return "Fuel"
    if any(k in n for k in ["clothing", "apparel"]): return "Apparel"
    return "Other"

tx_enriched["mcc_bucket"] = tx_enriched["mcc_name"].apply(bucketize)

tx_enriched.to_csv("transactions_with_mcc_names.csv", index=False)
print("Saved transactions_with_mcc_names.csv")


#### Emergency fallback

In [None]:
# Build a quick local dictionary for the most common MCCs you see (extend as needed)
minimal_map = {
    "5411": "Grocery Stores, Supermarkets",
    "5812": "Eating Places, Restaurants",
    "5814": "Fast Food Restaurants",
    "5541": "Service Stations",
    "5542": "Automated Fuel Dispensers",
    "4111": "Local/Suburban Commuter Transportation",
    "4511": "Airlines, Air Carriers",
    "5311": "Department Stores",
    "5699": "Misc. Apparel & Accessory Stores",
    "5732": "Electronics Stores",
    "5999": "Misc. & Specialty Retail",
    "6011": "ATM Financial Institutions",
    "6012": "Financial Institutions–Merchandise/Services",
    "4814": "Telecommunication Services",
    "4829": "Money Orders—Wire Transfer",
    "4900": "Utilities",
    "7011": "Lodging—Hotels, Motels, Resorts",
    "4112": "Passenger Railways",
    "4121": "Taxis/Limousines",
    "4789": "Transportation Services (Not Elsewhere Classified)"
}

tx = pd.read_csv("transactions.csv")
tx["mcc_str"] = tx["mcc"].astype(str).str.extract(r"(\d{4})")[0].str.zfill(4)
tx["mcc_name"] = tx["mcc_str"].map(minimal_map).fillna("Unknown")
tx.to_csv("transactions_with_mcc_names_partial.csv", index=False)
print("Saved transactions_with_mcc_names_partial.csv (temporary mapping)")
