In [1]:
# Mount Google Drive at /content/drive; the data paths used later in this
# notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
import pyarrow.dataset as ds

# ===== Paths =====
path_tx = "/content/drive/MyDrive/ИТМО/transaction_fraud_data.parquet"
path_fx = "/content/drive/MyDrive/ИТМО/historical-currency-exchange.csv"

# ===== Exchange rates =====
# The CSV is wide (a "date" column plus one column per currency). It is
# reshaped to long (date, currency, rate) rows, rows without a rate are
# dropped, and the result is turned into a {(date, CURRENCY): rate} lookup.
fx = pd.read_csv(path_fx)
fx["date"] = pd.to_datetime(fx["date"], errors="coerce").dt.date
fx_long = fx.melt(id_vars="date", var_name="currency", value_name="rate")
fx_long = fx_long.dropna(subset=["rate"])
fx_long["currency"] = fx_long["currency"].str.upper()
fx_map = {
    (day, code): float(rate)
    for day, code, rate in zip(fx_long["date"], fx_long["currency"], fx_long["rate"])
}

# ===== Accumulators (filled batch by batch in the read loop) =====
# 1: fraud rows / all rows
fraud_cnt = 0
rows_cnt = 0
# 2: fraud rows per country
fraud_by_country = Counter()

# 5: amount sum / row count per city
sum_by_city = defaultdict(float)
cnt_by_city = defaultdict(int)
# 6: same, restricted to fast_food vendors
sum_ff_by_city = defaultdict(float)
cnt_ff_by_city = defaultdict(int)

# 4: fraud rows / all rows among high-risk vendors
fr_high_num = 0
fr_high_den = 0

# 7-10: USD amounts split by fraud flag
usd_non_fraud = []
usd_fraud = []

# 3: total rows and number of (customer, hour) groups
sum_group_sizes = 0
n_groups = 0

# 11: unique_merchants values collected per customer
um_values_by_client = defaultdict(list)

# ===== Columns requested from the parquet file =====
usecolumns = [
    "is_fraud",
    "country",
    "customer_id",
    "timestamp",
    "is_high_risk_vendor",
    "city",
    "amount",
    "vendor_type",
    "currency",
    "last_hour_activity",
    "device",
]

# ===== Chunked read via pyarrow.dataset =====
dataset = ds.dataset(path_tx, format="parquet")


def _unique_merchants(lha):
    """Extract ``unique_merchants`` from one ``last_hour_activity`` value.

    ``lha`` may be a JSON string or an already-decoded object. Returns the
    value as a float (so it can sit in a NaN-capable column), or NaN when the
    value cannot be decoded, is not a dict, lacks the key, or is not integral.
    """
    try:
        obj = json.loads(lha) if isinstance(lha, str) else lha
        if isinstance(obj, dict) and "unique_merchants" in obj:
            return float(int(obj["unique_merchants"]))
    except (ValueError, TypeError, OverflowError):
        # Best effort: malformed entries are skipped. Only data errors are
        # caught, so Ctrl-C / kernel interrupt still stops the loop.
        pass
    return np.nan


def _add_city_totals(frame, sums, counts):
    """Add per-city amount sums and non-null amount counts of ``frame`` to the accumulators."""
    # "count" (not "size") so rows with a missing amount do not inflate the
    # denominator of the mean while contributing nothing to the sum.
    per_city = frame.groupby("city")["amount"].agg(total="sum", n="count")
    for city, total, n in zip(per_city.index, per_city["total"], per_city["n"]):
        sums[city] += float(total)
        counts[city] += int(n)


# Per-batch partial results that must be merged across batches: a
# (customer, hour) group can straddle a batch boundary, so counting groups or
# de-duplicating inside each batch separately would double count it.
group_size_parts = []
um_parts = []

# Iterate over record batches
for batch in dataset.to_batches(columns=usecolumns, batch_size=1_000_000):
    chunk = batch.to_pandas()

    # Time: parsed timestamp and its hour bucket
    chunk["ts"] = pd.to_datetime(chunk["timestamp"], errors="coerce")
    chunk["h"] = chunk["ts"].dt.floor("h")

    # 1: overall fraud share
    fraud_cnt += int(chunk["is_fraud"].sum())
    rows_cnt += len(chunk)

    # 2: fraud rows per country
    fraud_by_country.update(chunk.loc[chunk["is_fraud"] == 1, "country"].dropna().astype(str))

    # 3: rows per (customer, hour); merged after the loop
    sizes = chunk.groupby(["customer_id", "h"]).size()
    if len(sizes):
        group_size_parts.append(sizes)

    # 4: fraud share among high-risk vendors
    mask_hr = chunk["is_high_risk_vendor"].eq(True)
    fr_high_num += int(chunk.loc[mask_hr, "is_fraud"].sum())
    fr_high_den += int(mask_hr.sum())

    # 5: amount totals per city
    _add_city_totals(chunk, sum_by_city, cnt_by_city)

    # 6: amount totals per city, fast_food vendors only
    _add_city_totals(chunk.loc[chunk["vendor_type"] == "fast_food"], sum_ff_by_city, cnt_ff_by_city)

    # 7-10: convert to USD by dividing by the rate for (transaction date, currency)
    d = chunk["ts"].dt.date
    cur = chunk["currency"].str.upper()
    rates = np.array([fx_map.get(key, np.nan) for key in zip(d, cur)], dtype=float)
    # rows without a known, non-zero rate are left out of the USD statistics
    valid = ~np.isnan(rates) & (rates != 0.0)
    if valid.any():
        amt_usd = chunk.loc[valid, "amount"].to_numpy(dtype=float) / rates[valid]
        fraud_flag = chunk.loc[valid, "is_fraud"]
        usd_non_fraud.extend(amt_usd[(fraud_flag == 0).to_numpy()])
        usd_fraud.extend(amt_usd[(fraud_flag == 1).to_numpy()])

    # 11: first row with non-null last_hour_activity per (customer, hour) in this batch;
    # the cross-batch de-duplication happens after the loop
    sub = (
        chunk[["customer_id", "h", "last_hour_activity"]]
        .dropna()
        .drop_duplicates(["customer_id", "h"])
    )
    if len(sub):
        part = sub[["customer_id", "h"]].reset_index(drop=True)
        part["um"] = np.array([_unique_merchants(v) for v in sub["last_hour_activity"]], dtype=float)
        um_parts.append(part)

# 3: merge per-batch group sizes so every (customer, hour) group is counted once
if group_size_parts:
    group_sizes = pd.concat(group_size_parts).groupby(level=[0, 1]).sum()
    sum_group_sizes = int(group_sizes.sum())
    n_groups = int(group_sizes.shape[0])

# 11: keep the first row per (customer, hour) over the whole file, then drop
# rows whose unique_merchants could not be read (same order of operations as
# inside a batch: de-duplicate first, parse second)
if um_parts:
    um_first = (
        pd.concat(um_parts, ignore_index=True)
        .drop_duplicates(["customer_id", "h"])
        .dropna(subset=["um"])
    )
    for cid, values in um_first.groupby("customer_id", sort=False)["um"]:
        um_values_by_client[cid].extend(values.astype(int).tolist())

# ===== Finalisation: turn the accumulators into the 11 answers =====
# 1: fraud share, rounded UP to one decimal
fraud_ratio_rounded = round(np.ceil((fraud_cnt / max(rows_cnt, 1)) * 10) / 10, 1)

# 2: five countries with the most fraud rows. Empty names are filtered BEFORE
# taking the top five, so a blank entry cannot shorten the list.
top_keys = [c for c, _ in fraud_by_country.most_common() if isinstance(c, str) and c][:5]
top5_fraud_countries = ",".join(top_keys)

# 3: mean rows per (customer, hour) group, rounded DOWN to one decimal
avg_tx_per_client_hour = round(np.floor((sum_group_sizes / max(n_groups, 1)) * 10) / 10, 1)

# 4: fraud share among high-risk vendors, rounded UP to one decimal
fraud_ratio_high_risk = round(np.ceil((fr_high_num / max(fr_high_den, 1)) * 10) / 10, 1)

# 5 / 6: city with the highest mean amount (all vendors / fast_food only)
city_highest_avg = max(sum_by_city, key=lambda k: sum_by_city[k] / max(cnt_by_city[k], 1)) if sum_by_city else None
city_fast_food_highest = max(sum_ff_by_city, key=lambda k: sum_ff_by_city[k] / max(cnt_ff_by_city[k], 1)) if sum_ff_by_city else None

# 7-10: mean and population std (ddof=0) of USD amounts, rounded up to an integer
usd_nf = pd.Series(usd_non_fraud, dtype=float).dropna()
usd_fr = pd.Series(usd_fraud, dtype=float).dropna()
mean_usd_non_fraud = int(np.ceil(usd_nf.mean())) if not usd_nf.empty else None
std_usd_non_fraud  = int(np.ceil(usd_nf.std(ddof=0))) if not usd_nf.empty else None
mean_usd_fraud     = int(np.ceil(usd_fr.mean())) if not usd_fr.empty else None
std_usd_fraud      = int(np.ceil(usd_fr.std(ddof=0))) if not usd_fr.empty else None

# 11: customers whose median unique_merchants is strictly above the 95th
# percentile of all customer medians. The threshold is computed once; it used
# to be recomputed for every customer inside the comparison.
med_by_client = [np.median(v) for v in um_values_by_client.values() if v]
if med_by_client:
    danger_threshold = np.quantile(med_by_client, 0.95)
    dangerous_clients_count = int(sum(m > danger_threshold for m in med_by_client))
else:
    dangerous_clients_count = 0

# ===== Output: one "<n>. <answer>" line per question =====
answers = (
    fraud_ratio_rounded,
    top5_fraud_countries,
    avg_tx_per_client_hour,
    fraud_ratio_high_risk,
    city_highest_avg,
    city_fast_food_highest,
    mean_usd_non_fraud,
    std_usd_non_fraud,
    mean_usd_fraud,
    std_usd_fraud,
    dangerous_clients_count,
)
for number, answer in enumerate(answers, start=1):
    print(f"{number}.", answer)

1. 0.2
2. Russia,Mexico,Brazil,Nigeria,Australia
3. 2.4
4. 0.2
5. Unknown City
6. Unknown City
7. 460
8. 418
9. 875
10. 1350
11. 158


In [None]:
# === settings ===
GH_USER = "DANIELVSHVL"
GH_REPO = "ITMO_EXAM"
BRANCH  = "feature/data-analysis-answers"   # name of the new branch

# path to YOUR notebook with answers 1–11
NB_SRC = "/content/drive/MyDrive/Colab Notebooks/01_answers_1-11.ipynb"

# The token is prompted for at run time instead of being written into the notebook.
from getpass import getpass
GH_TOKEN = getpass("GitHub token (ввод скрыт): ")

# === код ===
import os, sys, shutil, pathlib, textwrap, subprocess

repo_dir = "/content/ITMO_EXAM"
# NOTE(security): the token is embedded in the remote URL, so it is also
# written to .git/config inside the clone.
remote   = f"https://{GH_USER}:{GH_TOKEN}@github.com/{GH_USER}/{GH_REPO}.git"

# start from a clean clone
if os.path.exists(repo_dir):
    shutil.rmtree(repo_dir)
try:
    subprocess.run(["git", "clone", remote, repo_dir], check=True)
except subprocess.CalledProcessError as exc:
    # The default error message echoes the full argv, i.e. the URL with the
    # token, and would be stored in the notebook output. Re-raise the same
    # exception type with the URL redacted and without the chained original.
    raise subprocess.CalledProcessError(
        exc.returncode, ["git", "clone", "<redacted-remote>", repo_dir]
    ) from None

# directory layout of the repository (created if missing)
for rel_dir in (
    "data",
    "project1_data_analysis/notebooks",
    "project1_data_analysis/src",
    "project2_chatbot",
    "reports/figures",
):
    pathlib.Path(repo_dir, rel_dir).mkdir(parents=True, exist_ok=True)

# base files
# Path.write_text opens, writes and closes the file in one call; the previous
# bare open(...).write(...) never closed its handles explicitly. UTF-8 is set
# on every file so the result does not depend on the locale.
repo_root = pathlib.Path(repo_dir)
(repo_root / ".gitignore").write_text(
    "data/*\n*.parquet\n*.csv\n.ipynb_checkpoints/\n.DS_Store\n", encoding="utf-8"
)
(repo_root / "requirements.txt").write_text("pandas\nnumpy\npyarrow\n", encoding="utf-8")
(repo_root / "README.md").write_text(
    f"# ITMO_EXAM\n\nДва проекта:\n\n- project1_data_analysis — анализ транзакций.\n- project2_chatbot — чат‑бот (WIP).\n\nДанные не коммитятся. Python {sys.version.split()[0]}.\n",
    encoding="utf-8",
)
readme_p1 = """# Project 1 – Data Analysis

- notebooks/01_answers_1-11.ipynb — вычисление метрик и ответов 1–11.
- Данные читаются из Parquet, большие файлы не коммитим.

Как запустить локально:
1) pip install -r ../../requirements.txt
2) jupyter notebook
"""
(repo_root / "project1_data_analysis" / "README.md").write_text(readme_p1, encoding="utf-8")

# copy the notebook into the repository
# Explicit check instead of `assert`: assertions are removed under `python -O`,
# and a directory at NB_SRC must be rejected as well.
if not os.path.isfile(NB_SRC):
    raise FileNotFoundError(f"Не найден ноутбук: {NB_SRC}")
dst_nb = f"{repo_dir}/project1_data_analysis/notebooks/01_answers_1-11.ipynb"
shutil.copy2(NB_SRC, dst_nb)

# git identity, make sure main exists, then commit and push the new branch
def sh(cmd):
    """Run ``cmd`` in a bash login shell; a non-zero exit status raises CalledProcessError."""
    subprocess.run(["bash", "-lc", cmd], check=True)

git_steps = (
    # commit identity for this clone
    f'cd "{repo_dir}" && git config user.name "colab-commit" && git config user.email "colab-commit@example.com"',
    # if there is no main yet, create it with an empty commit and push it
    f'cd "{repo_dir}" && (git rev-parse --verify main >/dev/null 2>&1 || (git checkout -b main && git commit --allow-empty -m "Init main" && git push -u origin main))',
    # new branch + commit + push
    f'cd "{repo_dir}" && git checkout -b "{BRANCH}" && git add . && git commit -m "Add structure + 01_answers_1-11.ipynb + READMEs + requirements" && git push -u origin "{BRANCH}"',
)
for step in git_steps:
    sh(step)

print(
    "OK — пуш выполнен.",
    f"Создай PR: https://github.com/{GH_USER}/{GH_REPO}/compare/{BRANCH}?expand=1",
    sep="\n",
)
