<a href="https://colab.research.google.com/github/Airi-Miura/Fricton-signage_app/blob/master/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# opened through ordinary file paths in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
# Path of the Wi-Fi log workbook on the mounted Drive (read with read_excel
# below, despite the ".csv" in its name).
url = "/content/drive/MyDrive/上岡研ミニ研究/wifi_log1.csv.xlsx"
# Module-level DataFrame holding the raw log; main() reads and modifies it.
df = pd.read_excel(url)

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans


# ================================================
# 0. Known SSID -> place label (fixed dictionary)
# ================================================
# main() maps the SSID column through this dict; an SSID that is not listed
# here keeps its raw SSID string as its place name.
ssid_to_place = {
    "E42686D092CF-5G_EXT": "home_A",
    "SRAS5G": "school",
    "tokyokobetu_wifi": "part_time_job"
    # Add more entries as needed
}


# ================================================
# 朝 / 昼 / 夜 の区分
# ================================================
def assign_time_slot(dt):
    """Return the time-of-day slot name for a timestamp.

    Args:
        dt: Any object exposing an ``hour`` attribute (e.g. ``datetime`` or
            ``pandas.Timestamp``).

    Returns:
        ``"morning"`` for hours 5-11, ``"day"`` for 12-17, ``"night"`` for
        18-23, and ``"midnight"`` for every other value.
    """
    hour = dt.hour

    # Everything outside the 05:00-23:59 window falls through to "midnight".
    if 5 <= hour < 24:
        if hour < 12:
            return "morning"
        if hour < 18:
            return "day"
        return "night"

    return "midnight"





# メイン処理
# ================================================
def main():
    """Build per-day stay features from the Wi-Fi log in ``df`` and cluster the days.

    Reads the module-level DataFrame ``df`` (columns used: ``SSID``, ``Date``,
    ``time``, ``event``) and the module-level ``ssid_to_place`` mapping, then:

    1. pairs ``start`` / ``end`` events into stay intervals,
    2. maps SSIDs to place labels and assigns a time slot to each interval,
    3. aggregates stay ratios per day and per day x time slot,
    4. one-hot encodes the main place, standardizes, and runs K-means.

    Side effects:
        * ``df`` is modified in place (``SSID`` is forward-filled and a
          ``datetime`` column is added).
        * ``daily_features.csv`` and ``cluster_result.csv`` are written using
          relative paths.
        * The head of the cluster table and a completion message are printed.

    Raises:
        ValueError: If no start/end pair with a positive duration is found.
    """
    # SSID can be missing on some rows; carry the previous row's SSID forward.
    df["SSID"] = df["SSID"].ffill()

    # Join Date and time as strings and parse the result into one timestamp.
    df["datetime"] = pd.to_datetime(
        df["Date"].astype(str) + " " + df["time"].astype(str)
    )

    # 2. Pair start / end events into stay intervals
    # ================================================
    key_cols = ["Date", "SSID"]
    events = df.loc[
        df["event"].isin(["start", "end"]),
        key_cols + ["event", "datetime"],
    ].copy()

    # Number the start events and the end events separately within each
    # Date x SSID, in log order.  Merging on Date x SSID alone would combine
    # every start with every end of that day (a many-to-many join), which
    # creates bogus intervals whenever the same SSID is used more than once a
    # day.  With the counter, the k-th start is paired only with the k-th end.
    # NOTE: this relies on the log rows being in chronological order.
    events["session_no"] = events.groupby(key_cols + ["event"]).cumcount()

    pair_cols = key_cols + ["session_no", "datetime"]
    # Rows for start events only
    start_df = events.loc[events["event"] == "start", pair_cols]
    # Rows for end events only
    end_df = events.loc[events["event"] == "end", pair_cols]

    # Put each start next to its matching end (inner join: a start or end
    # without a counterpart is dropped).
    merged = pd.merge(
        start_df,
        end_df,
        on=key_cols + ["session_no"],
        suffixes=("_start", "_end")
    )

    # Friendlier column names
    merged["start_dt"] = merged["datetime_start"]
    merged["end_dt"] = merged["datetime_end"]

    # An end time earlier than its start time means the stay crossed
    # midnight, so move the end to the following day.
    mask = merged["end_dt"] < merged["start_dt"]
    merged.loc[mask, "end_dt"] += pd.Timedelta(days=1)

    # Stay duration in minutes
    merged["duration_min"] = (merged["end_dt"] - merged["start_dt"]).dt.total_seconds() / 60

    # Drop non-positive durations (noise).  Copy so the column assignments
    # below operate on an independent frame rather than a slice.
    merged = merged[merged["duration_min"] > 0].copy()
    if merged.empty:
        raise ValueError(
            "no start/end pair with a positive duration was found in df"
        )

    # ================================================
    # 3. SSID -> place (fixed dictionary)
    # ================================================
    # Translate the SSIDs that appear in the dictionary
    merged["place"] = merged["SSID"].map(ssid_to_place)

    # SSIDs missing from the dictionary use the SSID string itself as the place
    merged["place"] = merged["place"].fillna(merged["SSID"])

    # 4. Time slot of each stay (by its start time) and calendar date
    # ================================================
    merged["time_slot"] = merged["start_dt"].apply(assign_time_slot)

    merged["date"] = pd.to_datetime(merged["Date"]).dt.date  # yyyy-mm-dd

    # 5. Share of stay time per place within each day x time slot
    # ================================================
    slot_place = (
        merged.groupby(["date", "time_slot", "place"], as_index=False)["duration_min"]
        .sum()
        .rename(columns={"duration_min": "slot_place_duration_min"})
    )

    slot_total = (
        slot_place.groupby(["date", "time_slot"], as_index=False)["slot_place_duration_min"]
        .sum()
        .rename(columns={"slot_place_duration_min": "slot_total_duration_min"})
    )

    slot_merged = slot_place.merge(slot_total, on=["date", "time_slot"])
    slot_merged["slot_place_ratio"] = (
        slot_merged["slot_place_duration_min"] / slot_merged["slot_total_duration_min"]
    )

    # 6. Per-day features
    # ================================================
    # Stay time per day x place
    day_place = (
        merged.groupby(["date", "place"], as_index=False)["duration_min"]
        .sum()
        .rename(columns={"duration_min": "day_place_duration_min"})
    )

    # Total stay time per day
    day_total = (
        day_place.groupby("date", as_index=False)["day_place_duration_min"]
        .sum()
        .rename(columns={"day_place_duration_min": "day_total_duration_min"})
    )

    # Share of the day's total spent at each place
    day_place = day_place.merge(day_total, on="date")
    day_place["day_place_ratio"] = (
        day_place["day_place_duration_min"] / day_place["day_total_duration_min"]
    )

    # Place with the longest stay on each day
    day_label = (
        day_place.sort_values(["date", "day_place_duration_min"], ascending=[True, False])
        .groupby("date")
        .first()[["place"]]
        .rename(columns={"place": "main_place_label"})
    ).reset_index()

    # ---------- per-day switch counts etc. ----------
    def calc_daily_features(group):
        """Summarize one day's stays: place count, switches, mean and total minutes."""
        g = group.sort_values("start_dt")
        # Every row whose place differs from the previous row counts as a
        # change; the first row always differs from the shifted NaN, hence -1.
        switches = (g["place"] != g["place"].shift()).sum() - 1
        switches = max(switches, 0)

        return pd.Series({
            "unique_places": group["place"].nunique(),
            "num_switches": switches,
            "mean_duration_min": group["duration_min"].mean(),
            "total_duration_min": group["duration_min"].sum(),
        })

    day_features = merged.groupby("date").apply(calc_daily_features).reset_index()

    # Combine the numeric features with the main-place label
    day_summary = day_features.merge(day_label, on="date", how="left")

    # 7. Vectorization (input for clustering)
    # ================================================
    daily = day_summary.set_index("date")

    # Day x place stay ratios, one column per place
    pivot_day_ratio = day_place.pivot_table(
        index="date",
        columns="place",
        values="day_place_ratio",
        fill_value=0
    ).add_prefix("place_")

    # Time slot x place ratios, one column per (slot, place) pair
    pivot_slot_ratio = slot_merged.pivot_table(
        index="date",
        columns=["time_slot", "place"],
        values="slot_place_ratio",
        fill_value=0
    )
    # Flatten the two-level column index into "<slot>_<place>_ratio" names
    pivot_slot_ratio.columns = [
        f"{slot}_{place}_ratio" for slot, place in pivot_slot_ratio.columns
    ]

    # Join everything on date; combinations that never occurred become 0
    full = (
        daily
        .join(pivot_day_ratio, how="left")
        .join(pivot_slot_ratio, how="left")
        .fillna(0)
    )

    # 8. OneHotEncoder
    # ================================================
    cat_cols = ["main_place_label"]

    enc = OneHotEncoder(sparse_output=False)

    cat_encoded = enc.fit_transform(full[cat_cols])
    df_cat = pd.DataFrame(cat_encoded, index=full.index, columns=enc.get_feature_names_out(cat_cols))

    # Numeric columns + one-hot columns
    X = pd.concat([full.drop(columns=cat_cols), df_cat], axis=1)

    # 9. Standardization
    # ================================================
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # 10. K-means
    # ================================================
    # KMeans rejects n_clusters larger than the number of samples, so cap the
    # usual 4 clusters at the number of days when the log is very short.
    n_clusters = min(4, len(X))
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)
    labels = kmeans.fit_predict(X_scaled)

    result = pd.DataFrame({"date": X.index, "cluster": labels})
    print(result.head())

    # 11. Save
    # ================================================
    full.to_csv("daily_features.csv")
    result.to_csv("cluster_result.csv")
    print("処理完了！")


# Run the whole pipeline when this file is executed as a script.
if __name__ == "__main__":
    main()


import matplotlib.pyplot as plt
# Imported here because this plot needs PCA and no earlier cell imports it.
from sklearn.decomposition import PCA

# Build the 2-D PCA projection from the files main() wrote.  The scatter below
# needs ``df_pca``, which is otherwise only created by a later cell, so a
# top-to-bottom run would fail with NameError at this point.
pca_features = pd.read_csv("daily_features.csv", index_col=0)
pca_features.index = pd.to_datetime(pca_features.index)

pca_clusters = pd.read_csv("cluster_result.csv")
pca_clusters["date"] = pd.to_datetime(pca_clusters["date"])
# Put the cluster rows in the same date order as the feature rows.
pca_clusters = pca_clusters.set_index("date").loc[pca_features.index]

# The categorical label column cannot be standardized; drop it if present.
pca_input = pca_features.drop(columns=["main_place_label"], errors="ignore")

df_pca = pd.DataFrame(
    PCA(n_components=2).fit_transform(StandardScaler().fit_transform(pca_input)),
    index=pca_features.index,
    columns=["pc1", "pc2"],
)
df_pca["cluster"] = pca_clusters["cluster"].values

# Fixed colour per cluster id
cluster_colors = {
    0: "blue",
    1: "orange",
    2: "green",
    3: "red"
}

plt.figure(figsize=(8, 6))

# One scatter call per cluster so that each cluster gets its own legend entry.
for cl in sorted(df_pca["cluster"].unique()):
    subset = df_pca[df_pca["cluster"] == cl]
    plt.scatter(
        subset["pc1"],
        subset["pc2"],
        color=cluster_colors[cl],
        label=f"cluster {cl}",
        alpha=0.8,
        s=80
    )

plt.xlabel("PC1", fontsize=12)
plt.ylabel("PC2", fontsize=12)
plt.title("Daily pattern clusters (PCA 2D)", fontsize=14)
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# ============================
# 1. Load data
# ============================
# Per-day feature table saved by main()
full = pd.read_csv("daily_features.csv", index_col=0)
full.index = pd.to_datetime(full.index)

# Cluster id per date, re-indexed by datetime and put in the same row order
# as the feature table
result = pd.read_csv("cluster_result.csv")
result = (
    result.assign(date=pd.to_datetime(result["date"]))
    .set_index("date")
    .loc[full.index]
)


# 2. Build the feature matrix for PCA
# ============================
# Categorical columns (e.g. main_place_label) are removed when present
drop_cols = [name for name in ["main_place_label"] if name in full.columns]
X = full.drop(columns=drop_cols)

# Standardize
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# 3. Reduce to two dimensions with PCA
# ============================
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

df_pca = pd.DataFrame(X_pca, index=full.index, columns=["pc1", "pc2"])

# Attach the cluster id and a weekday / weekend flag
df_pca["cluster"] = result["cluster"].to_numpy()
df_pca["is_weekend"] = df_pca.index.dayofweek >= 5  # 5 and 6 are Sat / Sun

print("PCA での分散説明率:", pca.explained_variance_ratio_)

# ============================
# 4. Plot: colour = cluster, marker shape = weekday / weekend
# ============================
colors = {0: "tab:blue", 1: "tab:orange", 2: "tab:green", 3: "tab:red"}
markers = {False: "o", True: "^"}  # weekday: circle, weekend: triangle

plt.figure(figsize=(8, 6))

for cl in sorted(df_pca["cluster"].unique()):
    in_cluster = df_pca["cluster"] == cl
    for is_weekend, marker in markers.items():
        subset = df_pca[in_cluster & (df_pca["is_weekend"] == is_weekend)]
        # Only combinations that actually occur get a scatter and legend entry
        if not subset.empty:
            day_kind = "weekend" if is_weekend else "weekday"
            plt.scatter(
                subset["pc1"],
                subset["pc2"],
                c=colors[cl],
                marker=marker,
                alpha=0.85,
                edgecolors="k",
                linewidths=0.5,
                label=f"cluster {cl} - {day_kind}",
            )

plt.xlabel("PC1")
plt.ylabel("PC2")
plt.title("Daily pattern clusters (PCA 2D, weekday vs weekend)")
plt.grid(True, linestyle="--", alpha=0.4)
plt.legend(bbox_to_anchor=(1.05, 1), loc="upper left")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Cluster id per date, aligned row-for-row with ``full``.  ``result_plot`` was
# used below without ever being defined; derive it from the date-indexed
# ``result`` table built in the PCA cell.
result_plot = result.loc[full.index]

# Columns named "<slot>_<place>_ratio".  All four slot names are listed
# explicitly; previously "midnight" columns were only picked up because the
# name happens to contain "night".
slot_prefixes = ("morning_", "day_", "night_", "midnight_")
slot_cols = [
    c for c in full.columns
    if c.endswith("_ratio") and c.startswith(slot_prefixes)
]

clusters = sorted(result_plot["cluster"].unique())

fig = plt.figure(figsize=(18, 25))

# One spoke per slot column; repeat the first angle at the end so the polygon
# closes.  The angles are the same for every cluster, so compute them once.
base_angles = np.linspace(0, 2*np.pi, len(slot_cols), endpoint=False)
angles = np.concatenate([base_angles, base_angles[:1]])

# Two charts per row.  The subplot position comes from the enumeration order
# rather than from the cluster id, so ids need not be exactly 0..3.
n_rows = max(1, (len(clusters) + 1) // 2)

for position, cl in enumerate(clusters, start=1):
    # Mean slot ratios over the days assigned to this cluster
    avg = full.loc[result_plot["cluster"] == cl, slot_cols].mean()
    stats = np.concatenate([avg.values, avg.values[:1]])

    ax = fig.add_subplot(n_rows, 2, position, polar=True)
    ax.plot(angles, stats, marker='o')
    ax.fill(angles, stats, alpha=0.2)
    ax.set_title(f"Cluster {cl}")
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(slot_cols)

plt.tight_layout()
plt.show()


In [None]:
# ``result_plot`` was used here without being defined anywhere; take it from
# the date-indexed ``result`` table built in the PCA cell, so the x axis is the
# date and the y axis / colour is the cluster id.
result_plot = result

plt.figure(figsize=(14,3))
plt.scatter(result_plot.index, result_plot["cluster"], c=result_plot["cluster"], cmap="tab10")
plt.title("Cluster over time")
plt.xlabel("Date")
plt.ylabel("Cluster")
plt.show()
