In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import BallTree
from sklearn.neighbors import NearestNeighbors

EARTH_RADIUS_KM = 6371.0088

# 데이터 불러오기

In [2]:
def get_data(file_path):
    """
    Reads a CSV file and returns a pandas DataFrame.

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    pd.DataFrame: The DataFrame containing the CSV data.
    """
    try:
        df = pd.read_csv(file_path)
        return df
    except Exception as e:
        print(f"Error reading the CSV file: {e}")
        return None

In [3]:
nyc_path = '../data/nyc/raw/NYC_df.csv'
tky_path = '../data/tky/raw/TKY_df.csv'

nyc_df = get_data(nyc_path)
tky_df = get_data(tky_path)

# Trajectory flow Map 생성

In [4]:
def make_POI_traj_graph(df,
                    split_col="SplitTag",
                    split_value="train",
                    user_col="UserId",
                    traj_col="TrajectoryId",
                    poi_col="PoiId",
                    time_col="LocalTime",   # 또는 "UTCTime"
                    drop_self_loops=True,
                    min_weight=1):
    
    # 1) train 필터 + 필요한 컬럼만
    use_cols = [user_col, poi_col, traj_col, time_col, split_col]
    gdf = df.loc[df[split_col] == split_value, use_cols].copy()

    # 2) 시간 정렬 (같은 TrajectoryId 안에서 발생 순서 보장)
    gdf[time_col] = pd.to_datetime(gdf[time_col], errors="coerce")
    gdf = gdf.dropna(subset=[time_col]).sort_values([traj_col, time_col, poi_col])

    # 3) 같은 Trajectory 내 연속 전이 생성 (p_i -> p_{i+1})
    gdf["next_poi"] = gdf.groupby(traj_col)[poi_col].shift(-1)
    edges = gdf.dropna(subset=["next_poi"])[[poi_col, "next_poi"]]   # 마지막 체크인 이후 제외
    edges.columns = ["src", "dst"]

    # 4) self-loop 제거(옵션)
    if drop_self_loops:
        edges = edges[edges["src"] != edges["dst"]]

    # 5) (src,dst)별 횟수 집계 = weight
    edges_df = (
        edges.groupby(["src", "dst"], as_index=False)
             .size()
             .rename(columns={"size": "weight"})
             .sort_values(["src", "dst"])
             .reset_index(drop=True)
    )
    if min_weight > 1:
        edges_df = edges_df[edges_df["weight"] >= min_weight].reset_index(drop=True)

    return edges_df

In [5]:
nyc_traj_graph = make_POI_traj_graph(nyc_df)

nyc_traj_graph.sort_values("weight", ascending=False)

Unnamed: 0,src,dst,weight
25440,4cf1a69a1d18a143dcf754ec,4df53b7bd4c01ff6b2f262ec,71
8926,4a4821f5f964a52095aa1fe3,4840fe6bf964a52030501fe3,66
30002,4e7e6b125503f21e714e6906,4ec1e417f5b9abc8bcde8598,66
7673,4a0c2e7bf964a52014751fe3,43a52546f964a520532c1fe3,60
26119,4d4b5ceb8e948cfa35fcef48,4d2b7e7ab818a35df652b08a,58
...,...,...,...
12028,4ab7e859f964a5206f7b20e3,4b342f9df964a520e52525e3,1
12027,4ab7e859f964a5206f7b20e3,4a43bf53f964a520c1a61fe3,1
12026,4ab7e859f964a5206f7b20e3,4a0ce46ff964a52068751fe3,1
12025,4ab7e859f964a5206f7b20e3,49ecd458f964a52099671fe3,1


# Space Graph 생성

In [6]:
def make_POI_space_graph(df,
                          split_col="SplitTag",
                          split_value="train",
                          poi_col="PoiId",
                          lat_col="Latitude",
                          lon_col="Longitude",
                          radius_km=1.0):
    """
    POI들의 위경도를 받아, 직선거리(대원거리) 1km 이내를 잇는 space graph의 엣지 리스트를 만든다.
    - undirected=True면 (u<v)만 남겨 중복 제거
    - include_distance=True면 distance_km 컬럼을 추가
    """
    # 0) 필요한 컬럼만, 좌표 정리
    use_cols = [poi_col, lat_col, lon_col, split_col]
    gdf = df.loc[df[split_col] == split_value, use_cols].copy()
    gdf = gdf.drop_duplicates(subset=[poi_col])

    # 1) 라디안 변환
    lat = np.deg2rad(gdf[lat_col].to_numpy())
    lon = np.deg2rad(gdf[lon_col].to_numpy())
    X = np.c_[lat, lon]

    # 2) BallTree(haversine) 구축 및 반경 질의
    tree = BallTree(X, metric="haversine")
    r = radius_km / EARTH_RADIUS_KM  # haversine은 '라디안' 거리 → 반경을 라디안으로
    ind, dist = tree.query_radius(X, r=r, return_distance=True, sort_results=True)  # 각 점마다 반경 r 이내에 있는 이웃들을 찾아주는 함수
    
    # 3) 엣지 생성
    srcs, dsts, dists = [], [], []
    poi_ids = gdf[poi_col].to_numpy()
    for i, (nbrs, d) in enumerate(zip(ind, dist)):
        # 자기 자신(거리 0) 제외
        mask = nbrs != i
    
        for j, dij in zip(nbrs[mask], d[mask]):
            srcs.append(poi_ids[i])
            dsts.append(poi_ids[j])
            dists.append(dij * EARTH_RADIUS_KM)

    edges_df = pd.DataFrame({"src": srcs, "dst": dsts})
    
    edges_df['distance_km'] = np.round(dists, 5)
    edges_df["weight"] = 1.0 / (1.0 + edges_df['distance_km'])

    return edges_df

In [7]:
nyc_space_graph_1km = make_POI_space_graph(nyc_df, radius_km=1.0)
nyc_space_graph_05km = make_POI_space_graph(nyc_df, radius_km=0.5)
nyc_space_graph_03km = make_POI_space_graph(nyc_df, radius_km=0.3)
nyc_space_graph_02km = make_POI_space_graph(nyc_df, radius_km=0.2)
nyc_space_graph_01km = make_POI_space_graph(nyc_df, radius_km=0.1)

print(nyc_space_graph_1km.shape)
print(nyc_space_graph_05km.shape)
print(nyc_space_graph_03km.shape)
print(nyc_space_graph_02km.shape)
print(nyc_space_graph_01km.shape)

(611594, 4)
(185502, 4)
(76430, 4)
(39380, 4)
(13464, 4)


# Time Grpah 생성

In [8]:
def make_POI_time_graph(df,
                        split_col="SplitTag",
                        split_value="train",
                        poi_col="PoiId",
                        hour_col="Hour",       # 없으면 time_col로 생성
                        laplace_alpha=0.1,
                        circular_smooth_win=0,   # 0=off, 1이면 t-1,t,t+1 평균
                        feature_norm="l2",     # "prob" or "l2"
                        n_neighbors=20,
                        min_sim=0.2):       # undirected일 때 상호 kNN만
    # 1) 필터
    use_cols = [poi_col, split_col, hour_col]
    gdf = df.loc[df[split_col] == split_value, use_cols].dropna(subset=[poi_col]).copy()

    # 2) 24차 분포 벡터
    cnt = (gdf.groupby([poi_col, hour_col]).size()
           .rename("count").reset_index())
    mat = cnt.pivot(index=poi_col, columns=hour_col, values="count").fillna(0.0)
    mat = mat.reindex(columns=pd.Index(range(24), name=hour_col), fill_value=0.0)

    # 4) 라플라스 or 원형 스무딩
    M = mat.to_numpy(dtype=np.float64)
    if laplace_alpha and laplace_alpha > 0:
        M = M + laplace_alpha

    if circular_smooth_win and circular_smooth_win > 0:
        w = int(circular_smooth_win)
        K = 2*w + 1
        Ms = np.zeros_like(M)
        for s in range(-w, w+1):
            Ms += np.roll(M, shift=s, axis=1)
        M = Ms / K

    # 5) 정규화
    if feature_norm == "prob":
        row_sum = M.sum(axis=1, keepdims=True)
        row_sum[row_sum == 0] = 1.0
        F = M / row_sum
    elif feature_norm == "l2":    # 코사인 유사도는 l2 정규화와 더 잘 맞음
        norm = np.linalg.norm(M, axis=1, keepdims=True)
        norm[norm == 0] = 1.0
        F = M / norm
    else:
        raise ValueError("feature_norm must be 'prob' or 'l2'")

    poi_ids = mat.index.to_numpy()
    time_feat_df = pd.DataFrame(F, index=poi_ids, columns=[f"Hour_{h}" for h in range(24)]).reset_index().rename(columns={poi_col:"poi_id"})

    # 6) kNN (코사인)
    k = min(n_neighbors, F.shape[0])
    nn = NearestNeighbors(n_neighbors=k, metric="cosine")
    nn.fit(F)
    dist, idx = nn.kneighbors(F, return_distance=True)  # dist=1-cos
    sims = 1.0 - dist

    # print("min:", np.min(sims))
    # print("max:", np.max(sims))
    # print("mean:", np.mean(sims))
    # print("median:", np.median(sims))
    # print("std:", np.std(sims))
    # print("="*50)
    
    # 7) 엣지 생성
    rows = []
    N, K = idx.shape
    for i in range(N):
        for kk in range(1, K):  # 자기 자신 제외
            j = idx[i, kk]
            s = float(sims[i, kk])
            if s >= min_sim:
                rows.append((poi_ids[i], poi_ids[j], s))
    edges_df = pd.DataFrame(rows, columns=["src","dst","weight"])

    return edges_df.sort_values(["src","dst"]).reset_index(drop=True), time_feat_df

In [9]:
nyc_time_graph_07, nyc_time_feat = make_POI_time_graph(nyc_df, min_sim=0.7)
nyc_time_graph_08, nyc_time_feat = make_POI_time_graph(nyc_df, min_sim=0.8)
nyc_time_graph_09, nyc_time_feat = make_POI_time_graph(nyc_df, min_sim=0.9)

print(nyc_time_graph_07.shape)
print(nyc_time_graph_08.shape)
print(nyc_time_graph_09.shape)

(86528, 3)
(75075, 3)
(25652, 3)


In [10]:
nyc_time_graph_08

Unnamed: 0,src,dst,weight
0,3fd66200f964a52000e71ee3,3fd66200f964a52020e71ee3,0.894882
1,3fd66200f964a52000e71ee3,41b24f80f964a520661e1fe3,0.896570
2,3fd66200f964a52000e71ee3,42699000f964a52047211fe3,0.942901
3,3fd66200f964a52000e71ee3,485e640df964a520d7501fe3,0.896570
4,3fd66200f964a52000e71ee3,4a3b9ffdf964a520a8a01fe3,0.944565
...,...,...,...
75070,50aeda7b7ab4aecf17b12bd6,40fb0f00f964a520d90a1fe3,0.862947
75071,50aeda7b7ab4aecf17b12bd6,4a14b2acf964a52066781fe3,0.808701
75072,50aeda7b7ab4aecf17b12bd6,4ad507bff964a520660121e3,0.820007
75073,50aeda7b7ab4aecf17b12bd6,4c4ca999b301b7130ad70789,0.821205


In [11]:
nyc_time_feat

Unnamed: 0,index,Hour_0,Hour_1,Hour_2,Hour_3,Hour_4,Hour_5,Hour_6,Hour_7,Hour_8,...,Hour_14,Hour_15,Hour_16,Hour_17,Hour_18,Hour_19,Hour_20,Hour_21,Hour_22,Hour_23
0,3fd66200f964a52000e71ee3,0.032547,0.032547,0.032547,0.032547,0.032547,0.032547,0.032547,0.032547,0.032547,...,0.032547,0.032547,0.032547,0.032547,0.358020,0.683492,0.358020,0.358020,0.032547,0.358020
1,3fd66200f964a52001e81ee3,0.713286,0.191369,0.191369,0.017397,0.017397,0.017397,0.017397,0.017397,0.017397,...,0.017397,0.017397,0.017397,0.017397,0.365342,0.191369,0.365342,0.191369,0.191369,0.017397
2,3fd66200f964a52003e71ee3,0.025126,0.025126,0.025126,0.025126,0.025126,0.025126,0.025126,0.025126,0.276385,...,0.025126,0.025126,0.276385,0.276385,0.025126,0.025126,0.025126,0.025126,0.276385,0.025126
3,3fd66200f964a52004e41ee3,0.198723,0.560038,0.018066,0.018066,0.018066,0.018066,0.018066,0.018066,0.018066,...,0.018066,0.018066,0.018066,0.018066,0.018066,0.198723,0.018066,0.379380,0.560038,0.379380
4,3fd66200f964a52004e61ee3,0.022228,0.022228,0.022228,0.022228,0.022228,0.022228,0.022228,0.022228,0.022228,...,0.022228,0.466782,0.022228,0.022228,0.022228,0.466782,0.244505,0.244505,0.022228,0.022228
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4571,50a5b53be4b0c9588221654f,0.061546,0.061546,0.061546,0.061546,0.061546,0.061546,0.061546,0.061546,0.061546,...,0.061546,0.061546,0.061546,0.677003,0.061546,0.061546,0.061546,0.061546,0.677003,0.061546
4572,50a77716e4b0b5a9492f6f56,0.051031,0.051031,0.051031,0.051031,0.051031,0.051031,0.561341,0.051031,0.051031,...,0.051031,0.561341,0.561341,0.051031,0.051031,0.051031,0.051031,0.051031,0.051031,0.051031
4573,50abeb91e4b04d7b173c0466,0.009702,0.300758,0.009702,0.009702,0.009702,0.106721,0.106721,0.106721,0.009702,...,0.106721,0.300758,0.009702,0.106721,0.106721,0.106721,0.106721,0.106721,0.106721,0.300758
4574,50abeee2e4b0570fe336d7fa,0.044544,0.044544,0.044544,0.044544,0.044544,0.489979,0.044544,0.044544,0.044544,...,0.044544,0.489979,0.044544,0.044544,0.044544,0.489979,0.044544,0.044544,0.044544,0.044544


In [12]:
# 그래프 차수 확인
def avg_degree_undirected(edges_df):
    deg = pd.concat([edges_df['src'], edges_df['dst']]).value_counts()
    return deg.mean()

In [13]:
# 그래프 edge 연결 상태 확인

print(avg_degree_undirected(nyc_traj_graph))
print('='*50)

print(avg_degree_undirected(nyc_space_graph_1km))    # 1km 반경으로 삼으면 너무 차수가 많아짐..
print(avg_degree_undirected(nyc_space_graph_05km))
print(avg_degree_undirected(nyc_space_graph_03km))   
print(avg_degree_undirected(nyc_space_graph_02km))   
print(avg_degree_undirected(nyc_space_graph_01km))   # tky와 맞춰 100m로 설정
print('='*50)

print(avg_degree_undirected(nyc_time_graph_07))
print(avg_degree_undirected(nyc_time_graph_08))
print(avg_degree_undirected(nyc_time_graph_09))

14.909409190371992
272.1219132369299
87.95732574679943
39.27543679342241
21.823219728456635
8.884196634774002
37.81818181818182
33.25581395348837
16.475272960822092


# TKY 하이퍼 파라미터 조정

In [14]:
# Traj 그래프
tky_traj_graph = make_POI_traj_graph(tky_df)

# Space 그래프
tky_space_graph_1km = make_POI_space_graph(tky_df, radius_km=1.0)
tky_space_graph_05km = make_POI_space_graph(tky_df, radius_km=0.5)
tky_space_graph_03km = make_POI_space_graph(tky_df, radius_km=0.3)
tky_space_graph_02km = make_POI_space_graph(tky_df, radius_km=0.2)
tky_space_graph_01km = make_POI_space_graph(tky_df, radius_km=0.1)

# Time 그래프
tky_time_graph_07, tky_time_feat = make_POI_time_graph(tky_df, min_sim=0.7)
tky_time_graph_08, tky_time_feat = make_POI_time_graph(tky_df, min_sim=0.8)
tky_time_graph_09, tky_time_feat = make_POI_time_graph(tky_df, min_sim=0.9)


In [15]:
# 그래프 edge 연결 상태 확인

print(avg_degree_undirected(tky_traj_graph))
print('='*50)

print(avg_degree_undirected(tky_space_graph_1km))    # 1km 반경으로 삼으면 너무 차수가 많아짐..
print(avg_degree_undirected(tky_space_graph_05km))
print(avg_degree_undirected(tky_space_graph_03km))   
print(avg_degree_undirected(tky_space_graph_02km))
print(avg_degree_undirected(tky_space_graph_01km))
print('='*50)

print(avg_degree_undirected(tky_time_graph_07))
print(avg_degree_undirected(tky_time_graph_08))
print(avg_degree_undirected(tky_time_graph_09))

26.842837273991655
271.9591893780573
140.90482361411088
79.28681235870384
47.35987210231814
18.44731424293468
37.86787204450626
36.35751222921034
24.01056105610561


* NYC 
    * traj
    * space: 0.2km
    * time: 0.9

* TKY 
    * traj
    * space: 0.1km
    * time: 0.9

In [16]:
# 최종 그래프 저장

# NYC 그래프 저장
nyc_traj_graph.to_csv('../data/nyc/graph/nyc_traj_graph.csv', index=False)
nyc_space_graph_01km.to_csv('../data/nyc/graph/nyc_space_graph_01km.csv', index=False)
nyc_time_graph_09.to_csv('../data/nyc/graph/nyc_time_graph_09.csv', index=False)

# TKY 그래프 저장
tky_traj_graph.to_csv('../data/tky/graph/tky_traj_graph.csv', index=False)
tky_space_graph_01km.to_csv('../data/tky/graph/tky_space_graph_01km.csv', index=False)
tky_time_graph_09.to_csv('../data/tky/graph/tky_time_graph_09.csv', index=False)