**가설**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import shap

# ===== 1) Load the data =====
# Column roles shared by the hypothesis tests and the explanatory model below.
target = "Revenue"
features = [
    "ProductRelated",
    "ProductRelated_Duration",
    "PageValues",
    "ExitRates",
]

# Absolute local path; replace it with the real location when running elsewhere.
file_path = "C:/Users/Monte/Desktop/데이터톤/online_shoppers_intention.csv"
df = pd.read_csv(file_path)

# ===== 2) Hypothesis tests (Welch's t-test + Cohen's d) =====
def cohens_d(a, b):
    """Return Cohen's d for samples ``a`` and ``b`` using the pooled SD.

    Each sample variance (``ddof=1``) is weighted by its degrees of freedom
    to form the pooled standard deviation. ``np.nan`` is returned when the
    pooled standard deviation is exactly zero.
    """
    size_a = len(a)
    size_b = len(b)
    var_a = np.var(a, ddof=1)
    var_b = np.var(b, ddof=1)

    weighted_var = (size_a - 1) * var_a + (size_b - 1) * var_b
    pooled_sd = np.sqrt(weighted_var / (size_a + size_b - 2))

    if pooled_sd == 0:
        return np.nan
    return (np.mean(a) - np.mean(b)) / pooled_sd

# Split the sessions by purchase outcome.
buyers = df.loc[df[target].eq(True)]
non_buyers = df.loc[df[target].eq(False)]


def _welch_row(feat):
    """Build one result row (means, Welch t, p-value, Cohen's d) for ``feat``."""
    buy_vals = buyers[feat].to_numpy(dtype=float)
    rest_vals = non_buyers[feat].to_numpy(dtype=float)
    # equal_var=False selects Welch's unequal-variance t-test.
    t_stat, p_val = stats.ttest_ind(buy_vals, rest_vals, equal_var=False)
    mean_buy = np.mean(buy_vals)
    mean_rest = np.mean(rest_vals)
    return {
        "Variable": feat,
        "Buyers_Mean": mean_buy,
        "NonBuyers_Mean": mean_rest,
        "Mean_Diff": mean_buy - mean_rest,
        "Welch_t": t_stat,
        "p_value": p_val,
        "Cohen_d": cohens_d(buy_vals, rest_vals),
    }


rows = [_welch_row(feat) for feat in features]

# Most significant variables first.
hypo_df = pd.DataFrame(rows).sort_values("p_value")
print("=== Welch’s t-test + Cohen’s d (분류 리포트 없이) ===")
print(hypo_df.round(4))

# ===== 3) Minimal model for the SHAP computation =====
# SHAP needs a fitted model, so a logistic regression is fitted purely for
# explanation; no performance metrics are printed.
X = df[features]
y = df[target].astype(int)

# Split BEFORE scaling so the held-out rows do not contribute to the
# scaler's mean/variance (fitting on all rows leaks test information).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full-sample matrix on the training-derived scale; kept because the
# statsmodels goodness-of-fit cell reads X_scaled.
X_scaled = scaler.transform(X)

model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# ===== 4) Variable contribution summary (coefficient table) =====
coefficients = model.coef_[0]
coef_df = pd.DataFrame({
    "Feature": features,
    "Coefficient": coefficients,
    "AbsImportance": np.abs(coefficients),
}).sort_values("AbsImportance", ascending=False)

print("\nLogistic Coefficients")
print(coef_df)

# ===== 5) SHAP summary plot =====
# Compatibility patch for some shap versions: restore the ``np.bool`` alias
# only when NumPy no longer provides it.
if not hasattr(np, "bool"):
    np.bool = np.bool_

explainer = shap.Explainer(model, X_train, feature_names=features)
shap_values = explainer(X_test)

# Wrap the test matrix in a DataFrame so the plot receives named columns,
# then draw the summary with a green colormap.
X_test_frame = pd.DataFrame(X_test, columns=features)
shap.summary_plot(
    shap_values,
    X_test_frame,
    feature_names=features,
    cmap="Greens",
    show=True,
)




In [None]:
import statsmodels.api as sm

# ===== Goodness of fit for the logistic regression =====
# Fit a statsmodels Logit on the scaled features plus an intercept column.
X_const = sm.add_constant(X_scaled)
logit_model = sm.Logit(y, X_const)
result = logit_model.fit(disp=False)

# McFadden's pseudo R^2 = 1 - (model log-likelihood / null log-likelihood).
llf_model, llf_null = result.llf, result.llnull
pseudo_r2 = 1 - (llf_model / llf_null)

print(
    "\n=== McFadden's pseudo R² ===",
    f"R² = {pseudo_r2:.4f}",
    sep="\n",
)


**모델링**


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# === Load the data ===
df = pd.read_csv("C:/Users/Monte/Desktop/데이터톤/online_shoppers_intention.csv")
# Normalise headers: trimmed, lower-case, spaces replaced by underscores.
df = df.rename(columns=lambda name: name.strip().lower().replace(" ", "_"))

# === Keep only the clustering variables ===
cols = ["productrelated", "productrelated_duration", "exitrates", "pagevalues"]
data = df[cols].dropna().copy()

# === 1. log1p transform ===
log_sources = {
    "prd_log": "productrelated",
    "dur_log": "productrelated_duration",
    "pv_log": "pagevalues",
}
for log_col, raw_col in log_sources.items():
    data[log_col] = np.log1p(data[raw_col])

# === 2. Scaling ===
sc1 = StandardScaler()
sc2 = RobustScaler()

# (output column, input column, scaler). fit_transform refits the shared
# scaler object on every column it is applied to.
scaling_plan = [
    ("prd_scaled", "prd_log", sc1),
    ("dur_scaled", "dur_log", sc1),
    ("exit_scaled", "exitrates", sc2),
    ("pv_scaled", "pv_log", sc2),
]
for scaled_col, source_col, scaler_obj in scaling_plan:
    data[scaled_col] = scaler_obj.fit_transform(data[[source_col]])

# === 3. Final feature set ===
X = data[[scaled_col for scaled_col, _, _ in scaling_plan]]

# === 4. KMeans silhouette sweep over k = 2..10 ===
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=42, n_init=100)
    labels = km.fit_predict(X)
    sil = silhouette_score(X, labels)
    print(f"k={k}, silhouette={sil:.3f}")


In [None]:
from sklearn.manifold import TSNE
import seaborn as sns
import matplotlib.pyplot as plt

# Fit the final clustering with the chosen K and store the labels on `data`.
k_opt = 5
km = KMeans(n_clusters=k_opt, random_state=42, n_init=50)
labels = km.fit_predict(X)
data["cluster"] = labels

# Visualisation
custom_colors = ["#25C486", "#919191", "#2FA241", "#005E39", "#022020"]
# Give seaborn exactly one colour per cluster: slice the hand-picked list
# when it is long enough, otherwise fall back to a generated palette so a
# different k_opt does not produce a mismatched palette.
if k_opt <= len(custom_colors):
    cluster_palette = custom_colors[:k_opt]
else:
    cluster_palette = sns.color_palette("husl", k_opt)

tsne = TSNE(n_components=2, random_state=42, perplexity=50)
X_tsne = tsne.fit_transform(X)

plt.figure(figsize=(8, 6))
sns.scatterplot(
    x=X_tsne[:, 0],
    y=X_tsne[:, 1],
    hue=labels,
    palette=cluster_palette,
    s=20,
    alpha=0.7,
)
# The title follows k_opt instead of a hard-coded "K=5".
plt.title(f"t-SNE Visualization of Clusters (K={k_opt})")
plt.show()

In [None]:
# === 군집별 빠른 프로파일 ===
import numpy as np
import pandas as pd

# Cluster sizes and shares
print("\n[Cluster size/share]")
cnt = data["cluster"].value_counts().sort_index()
share = (cnt / len(data)).round(3)
print(pd.DataFrame({"count": cnt, "share": share}))

# Summary of the original variables per cluster (mean / median)
cols_orig = ["productrelated", "productrelated_duration", "pagevalues", "exitrates"]
print("\n[Cluster profile: mean / median]")
by_cluster = data.groupby("cluster")[cols_orig]
summary = by_cluster.agg(["mean", "median"]).round(3)
print(summary)

# Simple derived metrics: dwell time per page and a PageValues > 0 indicator.
# eps keeps the division defined when productrelated is 0.
eps = 1e-9
tmp = data.assign(
    dur_per_page=data["productrelated_duration"] / (data["productrelated"] + eps),
    pv_pos=(data["pagevalues"] > 0).astype(int),
)

print("\n[Derived metrics]")
derived_spec = {
    "dur_per_page_median": ("dur_per_page", "median"),
    "pv_pos_rate": ("pv_pos", "mean"),  # mean of a 0/1 flag = rate
    "exit_median": ("exitrates", "median"),
}
derived = tmp.groupby("cluster").agg(**derived_spec).round(3)
print(derived)

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest

# === 1) Load the data ===
file_path = "C:/Users/Monte/Desktop/데이터톤/online_shoppers_intention.csv"
df = pd.read_csv(file_path)

# === 2) VisitorType -> new-visitor indicator (1 = new visitor) ===
df["is_new"] = (df["VisitorType"].str.strip().str.lower() == "new_visitor").astype(int)

# === 3) Attach the cluster labels from the earlier KMeans run ===
# The KMeans input went through dropna(), so keep only the rows whose index
# survived; `labels` is then assigned positionally to those rows.
df_clustered = df.loc[df.index.isin(data.index)].copy()
df_clustered["cluster"] = labels

# === 4) Share of new visitors per cluster ===
# Use a dedicated name so the mean/median profile table stored in `summary`
# by the profiling cell is not overwritten.
new_visitor_summary = df_clustered.groupby("cluster")["is_new"].agg(["mean", "count", "sum"])
new_visitor_summary["new_ratio(%)"] = (new_visitor_summary["mean"] * 100).round(2)
print("=== 클러스터별 신규 방문자 비율 ===")
print(new_visitor_summary)

# === 5) Chi-square test of independence across all clusters ===
contingency = pd.crosstab(df_clustered["cluster"], df_clustered["is_new"])
chi2, p, dof, exp = stats.chi2_contingency(contingency)
print(f"\n[카이제곱 검정] χ²={chi2:.3f}, df={dof}, p-value={p:.5f}")
if p < 0.05:
    print("→ 유의함: 클러스터 간 신규 방문자 비율에 유의한 차이가 있습니다.")
else:
    print("→ 유의하지 않음: 클러스터 간 신규 방문자 비율 차이는 통계적으로 동일합니다.")

# === 6) Two-proportion z-test between cluster 3 and cluster 4 (optional) ===
cluster_a, cluster_b = 3, 4
is_new_a = df_clustered.loc[df_clustered["cluster"] == cluster_a, "is_new"]
is_new_b = df_clustered.loc[df_clustered["cluster"] == cluster_b, "is_new"]
# successes = number of new visitors, n = number of sessions in the cluster
a_success, a_n = is_new_a.sum(), is_new_a.count()
b_success, b_n = is_new_b.sum(), is_new_b.count()

z, p_z = proportions_ztest([a_success, b_success], [a_n, b_n])
print(f"\n[두 군집 비율검정] Cluster {cluster_a} vs {cluster_b}: z={z:.3f}, p-value={p_z:.5f}")
if p_z < 0.05:
    print(f"→ Cluster {cluster_a}와 {cluster_b}의 신규 방문자 비율은 유의하게 다릅니다.")
else:
    print(f"→ Cluster {cluster_a}와 {cluster_b}의 신규 방문자 비율은 통계적으로 유의하지 않습니다.")


In [None]:
from scipy import stats

# === Kruskal–Wallis nonparametric test ===
# df_clustered is built from the re-read CSV, whose headers keep their
# original casing, so the column is looked up case-insensitively instead of
# assuming the lower-cased name "specialday".
special_col = next(
    (c for c in df_clustered.columns if str(c).lower() == "specialday"),
    None,
)
if special_col is None:
    raise KeyError("df_clustered has no 'SpecialDay' column (case-insensitive lookup)")

# One sample of SpecialDay values per cluster, in ascending cluster order.
groups = [
    df_clustered.loc[df_clustered["cluster"] == c, special_col]
    for c in sorted(df_clustered["cluster"].unique())
]
H, p = stats.kruskal(*groups)

print(f"Kruskal–Wallis H = {H:.3f}, p = {p:.5f}")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# === (1) Cluster sizes ===
plt.figure(figsize=(6, 4))
sns.barplot(x=cnt.index, y=cnt.values, palette="tab10")
plt.title("Cluster Size Distribution", fontsize=14)
plt.xlabel("Cluster")
plt.ylabel("Count")
plt.show()

# === (2) Original-variable profile (mean vs. median heatmaps) ===
# Build the mean/median table here rather than reading the global `summary`,
# which other cells may rebind to a table without the (variable, statistic)
# column MultiIndex that .xs(level=1) requires.
profile_cols = ["productrelated", "productrelated_duration", "pagevalues", "exitrates"]
cluster_profile = data.groupby("cluster")[profile_cols].agg(["mean", "median"]).round(3)
mean_df = cluster_profile.xs("mean", level=1, axis=1)
median_df = cluster_profile.xs("median", level=1, axis=1)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
sns.heatmap(mean_df, annot=True, fmt=".2f", cmap="Greens", ax=axes[0])
axes[0].set_title("Cluster Mean Profile", fontsize=14)

sns.heatmap(median_df, annot=True, fmt=".2f", cmap="Blues", ax=axes[1])
axes[1].set_title("Cluster Median Profile", fontsize=14)

plt.tight_layout()
plt.show()

# === (3) Derived metrics (median-based row vs. mean-based row) ===
# The median-based table already exists as `derived`; add the mean-based one.
derived_mean = tmp.groupby("cluster").agg(
    dur_per_page_mean=("dur_per_page", "mean"),
    pv_pos_rate=("pv_pos", "mean"),
    exit_mean=("exitrates", "mean"),
).round(3)

fig, axes = plt.subplots(2, 3, figsize=(15, 8))

# One entry per panel, in row-major order: (table, column, title, palette).
# Note: pv_pos_rate is the mean of a 0/1 flag in both tables.
panels = [
    (derived, "dur_per_page_median", "Median Duration per Page", "Blues"),
    (derived, "pv_pos_rate", "PageValues>0 Rate (Median Base)", "Greens"),
    (derived, "exit_median", "Median Exit Rate", "Reds"),
    (derived_mean, "dur_per_page_mean", "Mean Duration per Page", "Blues"),
    (derived_mean, "pv_pos_rate", "PageValues>0 Rate (Mean Base)", "Greens"),
    (derived_mean, "exit_mean", "Mean Exit Rate", "Reds"),
]
for ax, (table, column, title, palette) in zip(axes.flat, panels):
    sns.barplot(x=table.index, y=table[column], ax=ax, palette=palette)
    ax.set_title(title)
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Value")

plt.tight_layout()
plt.show()



In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# === Select the variables that were NOT used for clustering ===
excluded_cols = ["ProductRelated", "ProductRelated_Duration", "PageValues", "ExitRates", "cluster"]
cols_to_check = [c for c in df.columns if c not in excluded_cols]

# Attach cluster labels. Only rows present in `data` (i.e. rows that survived
# dropna() before clustering) have a label, so select them by index first;
# a positional assignment onto the full df would fail or misalign otherwise.
data_vis = df.loc[data.index].copy()
data_vis["cluster"] = data["cluster"].to_numpy()

# === 1) Numeric / boolean variables (bar plot of the per-cluster mean) ===
# Restrict to cols_to_check so the clustering features are actually excluded.
num_cols = (
    data_vis[cols_to_check]
    .select_dtypes(include=[np.number, "bool"])
    .columns.difference(["cluster"])
)
n_plots = len(num_cols)
# Ceiling division: no spare empty row when n_plots is a multiple of 3.
n_rows = max(1, (n_plots + 2) // 3)
fig, axes = plt.subplots(n_rows, 3, figsize=(15, 4 * n_rows))
axes = np.atleast_1d(axes).ravel()

for ax, col in zip(axes, num_cols):
    sns.barplot(data=data_vis, x="cluster", y=col, estimator=np.mean, palette="tab10", ax=ax)
    ax.set_title(f"{col} (mean)", fontsize=11)
    ax.set_xlabel("Cluster")
    ax.set_ylabel("Mean Value")

# Hide the unused panels of the grid (also safe when num_cols is empty).
for ax in axes[n_plots:]:
    ax.set_visible(False)
plt.tight_layout()
plt.show()

# === 2) Categorical variables (stacked bars of within-cluster proportions) ===
cat_cols = data_vis.select_dtypes(include=["object"]).columns

for col in cat_cols:
    plt.figure(figsize=(7, 4))
    # normalize="index" makes every cluster's row sum to 1
    prop = pd.crosstab(data_vis["cluster"], data_vis[col], normalize="index")
    prop.plot(kind="bar", stacked=True, colormap="tab20", ax=plt.gca())
    plt.title(f"{col} Distribution by Cluster")
    plt.ylabel("Proportion")
    plt.xlabel("Cluster")
    plt.legend(title=col, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# 0) Assumes `df` has already been loaded by an earlier cell.
#    (if needed) df = pd.read_csv("/mnt/data/online_shoppers_intention.csv")

df_work = df.copy()

# 1) Resolve cluster labels automatically.
# A separate name is used while resolving so that a pre-existing global
# `labels` is still available as a candidate in step 1-b.
cluster_labels = None

# 1-a) Reuse data['cluster'] when `data` has one row per df_work row.
prior_data = globals().get("data")
if (
    isinstance(prior_data, pd.DataFrame)
    and "cluster" in prior_data.columns
    and len(prior_data) == len(df_work)
):
    cluster_labels = prior_data["cluster"].to_numpy()
    print("[INFO] Using cluster labels from data['cluster']")

# 1-b) Reuse a global label array under one of the usual names.
if cluster_labels is None:
    for name in ["labels", "y_cluster", "clusters", "km_labels"]:
        cand = globals().get(name)
        if cand is None:
            continue
        try:
            arr = np.asarray(cand).reshape(-1)
        except (TypeError, ValueError):
            # Not convertible to a flat array; try the next candidate.
            continue
        if len(arr) == len(df_work):
            cluster_labels = arr
            print(f"[INFO] Using cluster labels from global '{name}'")
            break

# 1-c) Otherwise create labels with an on-the-fly KMeans on available features.
if cluster_labels is None:
    base_feats_pref = ["ProductRelated", "ProductRelated_Duration", "PageValues", "ExitRates"]
    feats = [c for c in base_feats_pref if c in df_work.columns]
    if len(feats) < 2:
        # Extend with other numeric columns (at most 6 features in total).
        extra = [c for c in df_work.select_dtypes(include=[np.number]).columns if c not in feats]
        feats = list(dict.fromkeys(feats + extra))[:6]
    # Replace infinities with NaN, then impute every gap with the column median.
    fallback_X = df_work[feats].replace([np.inf, -np.inf], np.nan).fillna(df_work[feats].median())
    fallback_Xs = StandardScaler().fit_transform(fallback_X)
    fallback_km = KMeans(n_clusters=5, random_state=42, n_init=50)
    cluster_labels = fallback_km.fit_predict(fallback_Xs)
    print(f"[INFO] Created on-the-fly KMeans labels (k=5) using features: {feats}")

# Attach the labels to df_work (the original df is left untouched).
df_work["cluster"] = cluster_labels
# Leave the resolved labels in the global `labels`, as this cell always did.
labels = cluster_labels

# 2) Select the "original" variables, i.e. drop derived ones.
# Match whole underscore-separated tokens rather than substrings: as a
# substring, "ratio" matches "duration" and "z" matches any name with a z.
exclude_keywords = {"per", "ratio", "flag", "log", "score", "scaled", "z", "pos"}
base_cols = [
    c for c in df_work.select_dtypes(include=["number", "bool"]).columns
    if c.lower() != "cluster" and exclude_keywords.isdisjoint(c.lower().split("_"))
]

if not base_cols:
    raise ValueError("파생변수를 제외하고 그릴 수 있는 수치/불리언 기본 변수가 없습니다.")

# 3) Per-cluster mean of every base variable.
profile = df_work.groupby("cluster")[base_cols].mean()

# 4) Bar-chart helper (one cluster per call)
def plot_cluster_bar(profile_df, cluster_id, palette="Greens"):
    """Plot one cluster's row of ``profile_df`` as bars sorted descending.

    Prints a warning and returns without plotting when ``cluster_id`` is not
    in ``profile_df.index``. Each bar is annotated with its value to three
    decimals.
    """
    available = profile_df.index
    if cluster_id not in available:
        print(f"[WARN] Cluster {cluster_id} not found. Available: {list(available)}")
        return

    ranked = profile_df.loc[cluster_id].sort_values(ascending=False)

    plt.figure(figsize=(12, 6))
    sns.barplot(x=ranked.index, y=ranked.values, palette=palette)
    plt.title(f"Cluster {cluster_id} — Mean of Base Variables", fontsize=14)
    plt.xticks(rotation=75)
    plt.ylabel("Mean Value")
    plt.grid(axis='y', linestyle='--', alpha=0.4)

    # Value labels: above the bar for non-negative values, below otherwise.
    for pos, value in enumerate(ranked.values):
        if value >= 0:
            offset, anchor = 0.01, 'bottom'
        else:
            offset, anchor = -0.02, 'top'
        plt.text(pos, value + offset, f"{value:.3f}",
                 ha='center', va=anchor, fontsize=9)
    plt.tight_layout()
    plt.show()

# 5) Draw the profile chart for cluster 3, then cluster 4
for cluster_id in (3, 4):
    plot_cluster_bar(profile, cluster_id, palette="Greens_r")
