In [6]:
import pandas as pd
from datetime import datetime, timedelta
from my_preprocess_fn import id_encode, ignore_first, only_keep_last
from file_reader import FileReader

# 전처리 내용
* 각 데이터 열 이름 매칭
* 각 데이터별 상위 카테고리 추가 -> {'Residential', 'Commercial/Services', 'Educational', 'Transportation', 'Culture & Leisure', 'Healthcare & Welfare'}
* 각 데이터별 요일 정보 추가 -> 쉬는 날만 구분
* 각 check-in 시간대 조정 -> 하루 중 상대 시간으로 설정 (하루를 1로 정규화)
* 사용자별 check-in을 24시간 단위의 trajectory로 묶어 TrajectoryId 생성

# NYC 데이터

In [None]:
# Read the raw NYC check-in file and pass it straight through the frequency
# filter (presumably drops POIs/users seen fewer than 10 times -- see
# FileReader.do_filter to confirm the exact rule).
nyc_df = FileReader.do_filter(
    FileReader.read_dataset(
        file_name='dataset_TSMC2014_NYC.txt',
        dataset_name='nyc',
    ),
    poi_min_freq=10,
    user_min_freq=10,
)

In [3]:
# Display the filtered NYC frame; the rendered output elides the middle rows with "...".
nyc_df

Unnamed: 0,UserId,PoiId,PoiCategoryId,PoiCategoryName,Latitude,Longitude,TimezoneOffset,UTCTime,UTCTimeOffset,UTCTimeOffsetEpoch,UTCTimeOffsetWeekday,UTCTimeOffsetHour,UTCTimeOffsetDay,UserRank
0,470,49bbd6c0f964a520f4531fe3,4bf58dd8d48988d127951735,Arts & Crafts Store,40.719810,-74.002581,-240,2012-04-03 18:00:09,2012-04-03 14:00:09,1333429209,1,14,2012-04-03,1.0
1,979,4a43c0aef964a520c6a61fe3,4bf58dd8d48988d1df941735,Bridge,40.606800,-74.044170,-240,2012-04-03 18:00:25,2012-04-03 14:00:25,1333429225,1,14,2012-04-03,1.0
2,69,4c5cc7b485a1e21e00d35711,4bf58dd8d48988d103941735,Home (private),40.716162,-73.883070,-240,2012-04-03 18:02:24,2012-04-03 14:02:24,1333429344,1,14,2012-04-03,1.0
6,642,4ab966c3f964a5203c7f20e3,4bf58dd8d48988d1e0931735,Coffee Shop,40.751591,-73.974121,-240,2012-04-03 18:04:38,2012-04-03 14:04:38,1333429478,1,14,2012-04-03,1.0
8,428,4ce1863bc4f6a35d8bd2db6c,4bf58dd8d48988d103941735,Home (private),40.619151,-74.035888,-240,2012-04-03 18:06:18,2012-04-03 14:06:18,1333429578,1,14,2012-04-03,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227419,138,4df2f04dd4c01ff6b2eaca87,4f2a25ac4b909258e854f55f,Neighborhood,40.663965,-73.910897,-300,2013-02-16 02:27:21,2013-02-15 21:27:21,1360931241,4,21,2013-02-15,608.0
227420,138,4bea96e76295c9b6908b8608,4bf58dd8d48988d1fd931735,Subway,40.661477,-73.916294,-300,2013-02-16 02:27:41,2013-02-15 21:27:41,1360931261,4,21,2013-02-15,609.0
227422,994,45850853f964a5209f3f1fe3,4bf58dd8d48988d157941735,American Restaurant,40.679841,-73.974788,-300,2013-02-16 02:29:11,2013-02-15 21:29:11,1360931351,4,21,2013-02-15,127.0
227423,688,3fd66200f964a52000e71ee3,4bf58dd8d48988d1e7931735,Music Venue,40.733596,-74.003139,-300,2013-02-16 02:29:11,2013-02-15 21:29:11,1360931351,4,21,2013-02-15,355.0


* NYC 추가 전처리

In [4]:
# 1. Align column names across datasets
def make_same_column_names(df):
    """Return a copy of ``df`` whose local-time column uses the shared name.

    The column ``'UTCTimeOffset'`` is renamed to ``'LocalTime'``. A frame that
    has no ``'UTCTimeOffset'`` column (e.g. one that was already renamed) is
    returned unchanged apart from being a new object, so the call is
    idempotent.

    Args:
        df: pandas DataFrame of check-ins.

    Returns:
        A new DataFrame; the caller's ``df`` is not modified.
    """
    # No inplace=True: mutating the argument makes re-running notebook cells
    # depend on hidden state; every caller in this notebook rebinds the result.
    return df.rename(columns={'UTCTimeOffset': 'LocalTime'})

# 2. Add the holiday flag (and parse the local-time column)
def add_holidays_nyc(df):
    """Return a copy of ``df`` with ``LocalTime`` parsed and a ``Holiday`` flag added.

    ``Holiday`` is True when the check-in's local date falls on a Saturday or
    Sunday, or is one of the US holidays listed below.

    Args:
        df: pandas DataFrame with a ``'LocalTime'`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with ``'LocalTime'`` converted to datetime and a boolean
        ``'Holiday'`` column appended. The input frame is left untouched, and no
        temporary columns are created, so existing ``is_weekend`` /
        ``is_holiday`` columns are preserved.

    NOTE(review): unlike ``add_holidays_tky`` this does not add a
    ``NormInDayTime`` column -- confirm whether NYC should get it as well.
    """
    out = df.copy()
    out['LocalTime'] = pd.to_datetime(out['LocalTime'])

    # US holidays within the data period (April 2012 - February 2013)
    us_holidays = [
        datetime(2012, 4, 6).date(),    # Good Friday
        datetime(2012, 5, 28).date(),   # Memorial Day
        datetime(2012, 7, 4).date(),    # Independence Day
        datetime(2012, 9, 3).date(),    # Labor Day
        datetime(2012, 10, 8).date(),   # Columbus Day
        datetime(2012, 11, 12).date(),  # Veterans Day (observed)
        datetime(2012, 11, 22).date(),  # Thanksgiving
        datetime(2012, 12, 25).date(),  # Christmas
        datetime(2013, 1, 1).date(),    # New Year's Day
        datetime(2013, 1, 21).date(),   # Martin Luther King Jr. Day
        datetime(2013, 2, 18).date(),   # Presidents' Day
    ]

    local_time = out['LocalTime']
    # Kept as local Series instead of scratch columns on the frame.
    is_weekend = local_time.dt.weekday.isin([5, 6])  # Saturday (5), Sunday (6)
    is_holiday = local_time.dt.date.isin(us_holidays)

    out['Holiday'] = is_weekend | is_holiday
    return out

# 3. Upper-level category mapping
# Maps each upper-level category name to the list of POI category names that
# belong to it. map_to_final_category() returns the first key whose list
# contains a given name, and 'Else' for names that appear in no list.
# NOTE(review): 'Tattoo Parlor' is listed twice under 'Commercial/Services'
# (harmless for membership tests).
# NOTE(review): final_category_mapping_tky groups some of the same names
# differently (e.g. 'Library', 'Gym / Fitness Center', 'Hotel', 'Shrine') --
# confirm this per-dataset difference is intended.
final_category_mapping = {
    'Residential': [
        'Home (private)', 'Neighborhood', 'Residential Building (Apartment / Condo)', 
        'Housing Development', 'Hotel'
    ],
    'Commercial/Services': [
        'Food & Drink Shop', 'Burger Joint', 'Coffee Shop', 'Ice Cream Shop',
        'Deli / Bodega', 'Mexican Restaurant', 'American Restaurant', 'BBQ Joint',
        'Fast Food Restaurant', 'Bar', 'Cupcake Shop', 'Candy Store', 'Pizza Place',
        'Sandwich Place', 'German Restaurant', 'Latin American Restaurant', 'Café',
        'Breakfast Spot', 'Malaysian Restaurant', 'Diner', 'Bakery', 'Fried Chicken Joint',
        'Snack Place', 'Seafood Restaurant', 'Salad Place', 'Wings Joint', 'Japanese Restaurant',
        'Falafel Restaurant', 'Middle Eastern Restaurant', 'Asian Restaurant', 'Beer Garden',
        'Ramen /  Noodle House', 'Hot Dog Joint', 'Cajun / Creole Restaurant', 'Mac & Cheese Joint',
        'Korean Restaurant', 'Sushi Restaurant', 'Gastropub', 'Caribbean Restaurant', 
        'African Restaurant', 'Cuban Restaurant', 'Indian Restaurant', 'Dessert Shop',
        'Thai Restaurant', 'Soup Place', 'Taco Place', 'Steakhouse', 'Dumpling Restaurant',
        'Vietnamese Restaurant', 'Southern / Soul Food Restaurant', 'Tapas Restaurant',
        'Filipino Restaurant', 'Brazilian Restaurant', 'Australian Restaurant', 
        'Eastern European Restaurant', 'Swiss Restaurant', 'Dim Sum Restaurant',
        'Mobile Phone Shop', 'Automotive Shop', 'Clothing Store', 'Electronics Store', 
        'Tattoo Parlor', 'Department Store', 'Hardware Store', 'Bookstore', 'Toy / Game Store',
        'Miscellaneous Shop', 'Furniture / Home Store', 'Bridal Shop', 'Paper / Office Supplies Store',
        'Convenience Store', 'Hobby Shop', 'Pet Store', 'Jewelry Store', 'Camera Store', 
        'Thrift / Vintage Store', 'Antique Shop', 'Market', 'Flea Market', 'Garden Center',
        'Salon / Barbershop', 'Cosmetics Shop', 'Bank', 'Financial or Legal Service',
        'Professional & Other Places', 'Design Studio', 'Laundry Service', 'Smoke Shop',
        'Post Office', 'Tattoo Parlor', 'Tanning Salon', 'Government Building', 'Office',
        'Other Nightlife', 'Building', 'Spanish Restaurant', 'Factory', 'Burrito Place',
        'Chinese Restaurant', 'Bagel Shop', 'Vegetarian / Vegan Restaurant', 'Donut Shop',
        'Sporting Goods Shop', 'French Restaurant', 'Italian Restaurant', 'Food Truck', 'Restaurant',
        'Tea Room', 'Brewery', 'Recycling Facility', 'Mediterranean Restaurant', 'Gift Shop', 'Food',
        'South American Restaurant', 'Molecular Gastronomy Restaurant', 'Scandinavian Restaurant',
        'Military Base', 'City'
    ],
    'Educational': [
        'Student Center', 'University', 'College Academic Building', 'Community College',
        'General College & University', 'College & University', 'Library', 'Law School', 
        'Trade School', 'Nursery School', 'Elementary School', 'Middle School', 
        'High School', 'College Stadium', 'School'
    ],
    'Transportation': [
        'Subway', 'Bus Station', 'Light Rail', 'Airport', 'Train Station', 'Parking', 
        'General Travel', 'Rental Car Location', 'Taxi', 'Ferry', 'Road', 'Harbor / Marina',
        'Bridge', 'Gas Station / Garage', 'River', 'Travel', 'Travel & Transport', 'Moving Target'
    ],
    'Culture & Leisure': [
        'Arts & Crafts Store', 'Music Venue', 'Movie Theater', 'Scenic Lookout', 'Theater', 
        'General Entertainment', 'Bowling Alley', 'Arcade', 'Comedy Club', 'Museum', 
        'Performing Arts Venue', 'Event Space', 'Art Museum', 'Concert Hall', 'Zoo', 
        'Aquarium', 'Casino', 'Science Museum', 'Racetrack', 'Fair', 'Music Store',
        'Stadium', 'Art Gallery', 'Park', 'Campground', 'Other Great Outdoors',
        'Beach', 'Playground', 'Pool Hall', 'Plaza', 'Outdoors & Recreation', 
        'Sculpture Garden', 'Garden', 'Travel Lounge', 'Rest Area', 'Convention Center',
        'Historic Site', 'Mall', 'Synagogue', 'Church', 'Cemetery', 'Temple', 'Shrine',
        'Arts & Entertainment', 'Spiritual Center'
    ],
    'Healthcare & Welfare': [
        'Gym / Fitness Center', 'Medical Center', 'Drugstore / Pharmacy', 'Spa / Massage',
        'Athletic & Sport', 'Pool', 'Animal Shelter', 'Funeral Home'
    ]
}

# Mapping helper
def map_to_final_category(category_name):
    """Look up the upper-level category of a POI category name.

    Walks ``final_category_mapping`` in insertion order and returns the first
    key whose list contains ``category_name``. Returns ``'Else'`` when no list
    contains it.
    """
    matching_keys = (
        upper_category
        for upper_category, member_names in final_category_mapping.items()
        if category_name in member_names
    )
    return next(matching_keys, 'Else')

# 24시간 단위의 trajectory로 변환
def make_trajectory_id(df, time_window=timedelta(hours=24)):
    """사용자 별로 24시간 단위로 trajectory ID를 생성
    """
    df = df.copy()
    df = df.sort_values(by=['UserId', 'LocalTime'])
    
    traj_idxs = []
    
    for user_id, user_df in df.groupby('UserId'):
        user_df = user_df.sort_values(by='LocalTime')    # 시간순 정렬
        start_time = user_df.iloc[0]['LocalTime']
        end_time = start_time + time_window
        
        traj_idx = 1
        
        for idx, row in user_df.iterrows():
            if row['LocalTime'] < end_time:
                traj_idxs.append(f"{user_id}_{traj_idx}")
            else:
                traj_idx += 1
                start_time = row['LocalTime']
                end_time = start_time + time_window
                traj_idxs.append(f"{user_id}_{traj_idx}")
    
    df['TrajectoryId'] = traj_idxs
    return df

In [5]:
# 1. Align column names
nyc_df = make_same_column_names(nyc_df)

# 2. Add the holiday flag (also parses LocalTime to datetime)
nyc_df = add_holidays_nyc(nyc_df)

# 3. Attach the upper-level category
nyc_df['UpperCategory'] = nyc_df['PoiCategoryName'].apply(map_to_final_category)

# 4. Tag every row with its split (the CSVs are written in the next cell)
nyc_df = FileReader.split_train_test(nyc_df)

# 5. Select each split by its SplitTag and build trajectory IDs within it
nyc_train, nyc_val, nyc_test = (
    make_trajectory_id(nyc_df[nyc_df['SplitTag'] == split_tag])
    for split_tag in ('train', 'validation', 'test')
)

In [6]:
# Save each NYC split as CSV, without the index column
for split_frame, csv_path in (
    (nyc_train, '../data/nyc/raw/NYC_train.csv'),
    (nyc_val, '../data/nyc/raw/NYC_val.csv'),
    (nyc_test, '../data/nyc/raw/NYC_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

# TKY 데이터

In [7]:
# Read the raw TKY check-in file and apply the frequency filter in one step
# (presumably drops POIs/users seen fewer than 10 times -- see
# FileReader.do_filter to confirm the exact rule).
tky_df = FileReader.do_filter(
    FileReader.read_dataset(
        file_name='dataset_TSMC2014_TKY.txt',
        dataset_name='tky',
    ),
    poi_min_freq=10,
    user_min_freq=10,
)

* TKY 추가 전처리

In [8]:
# 1. Align column names: rebind to a renamed copy instead of mutating in place,
#    so re-running the cell never depends on hidden state.
# NOTE(review): the later TKY pipeline cell calls make_same_column_names(tky_df),
# which performs the same rename -- one of the two is redundant.
tky_df = tky_df.rename(columns={'UTCTimeOffset': 'LocalTime'})

# 2. Add the holiday flag and the normalized in-day time
def add_holidays_tky(df):
    """Return a copy of ``df`` with ``LocalTime`` parsed plus two derived columns.

    * ``Holiday``: True when the local date is a Saturday or Sunday, or one
      of the Japanese holidays listed below.
    * ``NormInDayTime``: time of day as a fraction of 24 hours, computed from
      the hour, minute and second of ``LocalTime`` (0.5 is noon).

    Args:
        df: pandas DataFrame with a ``'LocalTime'`` column that
            ``pd.to_datetime`` can parse.

    Returns:
        A new DataFrame with ``'LocalTime'`` converted to datetime and the
        ``'Holiday'`` and ``'NormInDayTime'`` columns appended, in that order.
        The input frame is left untouched, and no temporary columns are
        created, so existing ``is_weekend`` / ``is_holiday`` / ``date_only``
        columns are preserved.
    """
    out = df.copy()
    out['LocalTime'] = pd.to_datetime(out['LocalTime'])

    japan_holidays = [
        # 2012
        datetime(2012, 4, 29).date(), datetime(2012, 4, 30).date(),
        datetime(2012, 5, 3).date(), datetime(2012, 5, 4).date(), datetime(2012, 5, 5).date(),
        datetime(2012, 7, 16).date(), datetime(2012, 9, 17).date(), datetime(2012, 9, 22).date(),
        datetime(2012, 10, 8).date(), datetime(2012, 11, 3).date(), datetime(2012, 11, 23).date(),
        datetime(2012, 12, 23).date(), datetime(2012, 12, 24).date(),
        # 2013
        datetime(2013, 1, 1).date(), datetime(2013, 1, 14).date(), datetime(2013, 2, 11).date(),
    ]

    local_time = out['LocalTime']
    # Kept as local Series instead of scratch columns on the frame.
    is_weekend = local_time.dt.weekday.isin([5, 6])  # Saturday / Sunday
    is_holiday = local_time.dt.date.isin(japan_holidays)

    # Weekend or public holiday
    out['Holiday'] = is_weekend | is_holiday

    # Seconds since local midnight, divided by the seconds in a day
    seconds_in_day = (
        local_time.dt.hour * 3600 + local_time.dt.minute * 60 + local_time.dt.second
    )
    out['NormInDayTime'] = seconds_in_day / (24 * 3600)

    return out


# Upper-level category mapping for the TKY dataset: each key is an upper-level
# category and each value lists the POI category names assigned to it.
# map_to_category() returns the first key whose list contains a given name, and
# 'Else' for names that appear in no list.
# NOTE(review): 'Cafï¿½' looks like a mis-decoded 'Café'; presumably it matches
# the category string exactly as FileReader reads it from the raw TKY file --
# confirm against the data before "fixing" it.
# NOTE(review): several names are grouped differently than in
# final_category_mapping (e.g. 'Library', 'Gym / Fitness Center', 'Hotel',
# 'Shrine') -- confirm this per-dataset difference is intended.
final_category_mapping_tky = {
    'Residential': [
        'Neighborhood', 'Home (private)', 'Residential Building (Apartment / Condo)',
        'Housing Development', 'Sorority House'
    ],
    'Commercial/Services': [
        'Convention Center', 'Japanese Restaurant', 'Electronics Store', 'Cafï¿½',
        'Fast Food Restaurant', 'Convenience Store', 'Paper / Office Supplies Store',
        'Chinese Restaurant', 'Office', 'Bookstore', 'Hobby Shop', 'Bar',
        'Miscellaneous Shop', 'Toy / Game Store', 'Ramen /  Noodle House', 'Smoke Shop',
        'Shrine', 'Plaza', 'Building', 'Italian Restaurant', 'General Entertainment',
        'Clothing Store', 'Hardware Store', 'Coffee Shop', 'Fried Chicken Joint',
        'Food & Drink Shop', 'Dessert Shop', 'Restaurant', 'Mall', 'Bakery',
        'Indian Restaurant', 'Post Office', 'Government Building',
        'Drugstore / Pharmacy', 'Diner', 'Soup Place', 'Burger Joint', 'Racetrack',
        'Department Store', 'Record Shop', 'Music Venue', 'General Travel',
        'Furniture / Home Store', 'Camera Store', 'Sushi Restaurant', 'Hotel',
        'Arts & Crafts Store', 'Bike Shop', 'Mobile Phone Shop', 'Recycling Facility',
        'Antique Shop', 'Donut Shop', 'Deli / Bodega', 'Ice Cream Shop', 'Asian Restaurant',
        'Steakhouse', 'Video Store', 'Video Game Store', 'Dumpling Restaurant',
        'Sandwich Place', 'Internet Cafe', 'Military Base', 'Sporting Goods Shop',
        'Bank', 'Music Store', 'Travel Lounge', 'Seafood Restaurant',
        'Travel & Transport', 'Breakfast Spot', 'Gift Shop', 'Athletic & Sport',
        'Pizza Place', 'BBQ Joint', 'Gaming Cafe', 'Salon / Barbershop', 'Hot Dog Joint',
        'American Restaurant', 'Brewery', 'Harbor / Marina', 'Middle Eastern Restaurant',
        'Automotive Shop', 'Fish & Chips Shop', 'Comedy Club', 'Gastropub',
        'Scenic Lookout', 'Caribbean Restaurant', 'Shop & Service', 'French Restaurant',
        'Thai Restaurant', 'Brazilian Restaurant', 'Moving Target', 'Laundry Service',
        'Flower Shop', 'River', 'Spiritual Center', 'Playground', 'Mexican Restaurant',
        'Car Dealership', 'Candy Store', 'Food', 'Motorcycle Shop', 'Wings Joint',
        'Tea Room', 'Board Shop', 'Mediterranean Restaurant', 'Tanning Salon',
        'Food Truck', 'Thrift / Vintage Store', 'Pool', 'Embassy / Consulate',
        'Snack Place', 'Professional & Other Places', 'Korean Restaurant',
        'Cosmetics Shop', 'Factory', 'Pet Store', 'Bike Rental / Bike Share'
    ],
    'Educational': [
        'University', 'College Academic Building', 'General College & University',
        'Student Center', 'School', 'High School', 'Community College', 'Trade School',
        'Elementary School', 'Medical School', 'College & University', 'Nursery School',
        'College Stadium'
    ],
    'Transportation': [
        'Train Station', 'Subway', 'Bus Station', 'Road', 'Light Rail',
        'Gas Station / Garage', 'Rest Area', 'Parking', 'Airport', 'Ferry', 'Taxi',
        'Bridge'
    ],
    'Culture & Leisure': [
        'Event Space', 'Stadium', 'Arcade', 'Temple', 'Gym / Fitness Center', 'Park',
        'Other Great Outdoors', 'Spa / Massage', 'Movie Theater', 'Sculpture Garden',
        'Aquarium', 'Zoo', 'Art Museum', 'Performing Arts Venue', 'Library',
        'Science Museum', 'Church', 'Historic Site', 'History Museum', 'Bowling Alley',
        'Garden', 'Concert Hall', 'Casino', 'Other Nightlife', 'Art Gallery',
        'Beer Garden', 'Theater', 'Museum', 'Beach', 'Public Art', 'Garden Center',
        'Outdoors & Recreation', 'Nightlife Spot', 'Cemetery'
    ],
    'Healthcare & Welfare': [
        'Medical Center'
    ]
}

def map_to_category(category_name):
    """Look up the upper-level category of a TKY POI category name.

    Walks ``final_category_mapping_tky`` in insertion order and returns the
    first key whose list contains ``category_name``. Returns ``'Else'`` when
    no list contains it.
    """
    matching_keys = (
        upper_category
        for upper_category, member_names in final_category_mapping_tky.items()
        if category_name in member_names
    )
    return next(matching_keys, 'Else')

In [9]:
# 1. Align column names
tky_df = make_same_column_names(tky_df)

# 2. Add the holiday flag and normalized in-day time (also parses LocalTime)
tky_df = add_holidays_tky(tky_df)

# 3. Attach the upper-level category
tky_df['UpperCategory'] = tky_df['PoiCategoryName'].apply(map_to_category)

# 4. Tag every row with its split (the CSVs are written in the next cell)
tky_df = FileReader.split_train_test(tky_df)

# 5. Select each split by its SplitTag and build trajectory IDs within it
tky_train, tky_val, tky_test = (
    make_trajectory_id(tky_df[tky_df['SplitTag'] == split_tag])
    for split_tag in ('train', 'validation', 'test')
)

In [10]:
# Save each TKY split as CSV, without the index column
for split_frame, csv_path in (
    (tky_train, '../data/tky/raw/TKY_train.csv'),
    (tky_val, '../data/tky/raw/TKY_val.csv'),
    (tky_test, '../data/tky/raw/TKY_test.csv'),
):
    split_frame.to_csv(csv_path, index=False)

In [11]:
# NOTE(review): the bare identifier below (Korean for "up to here") is not
# defined anywhere, so this cell raises NameError and stops a "Run All" before
# the CA section. Presumably an intentional stop marker -- confirm, then either
# delete the cell or replace it with an explicit, documented guard.
여기까지

NameError: name '여기까지' is not defined

# CA 데이터

In [None]:
# Load the raw CA (Gowalla) check-in file, then display the unfiltered frame
# as the cell output.
ca_df = FileReader.read_dataset(file_name='dataset_gowalla_ca_ne.csv', dataset_name='ca')
ca_df

Unnamed: 0,UserId,PoiId,PoiCategoryId,Latitude,Longitude,UTCTime,UTCTimeOffset,PoiCategoryName,UTCTimeOffsetEpoch,UTCTimeOffsetWeekday,UTCTimeOffsetHour,UTCTimeOffsetDay,UserRank
0,0,19542,45,37.616356,-122.386150,2010-08-19T21:59:09Z,2010-08-19 21:59:09,Airport,1282222749,3,21,2010-08-19,20.0
1,0,19542,45,37.616356,-122.386150,2010-06-24T14:27:35Z,2010-06-24 14:27:35,Airport,1277357255,3,14,2010-06-24,13.0
2,0,19542,45,37.616356,-122.386150,2010-06-06T18:48:32Z,2010-06-06 18:48:32,Airport,1275817712,6,18,2010-06-06,1.0
3,4,19542,45,37.616356,-122.386150,2010-06-19T15:37:36Z,2010-06-19 15:37:36,Airport,1276929456,5,15,2010-06-19,73.0
4,8,19542,45,37.616356,-122.386150,2010-10-07T03:20:57Z,2010-10-07 03:20:57,Airport,1286389257,3,3,2010-10-07,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
636507,196561,2657838,457,37.526711,-122.000513,2010-09-21T22:38:47Z,2010-09-21 22:38:47,Burger King,1285076327,1,22,2010-09-21,37.0
636508,196561,4062193,160,37.522527,-122.005202,2010-09-19T17:57:42Z,2010-09-19 17:57:42,Indian,1284886662,6,17,2010-09-19,31.0
636509,196561,3793703,38,37.530799,-122.015335,2010-09-16T00:17:48Z,2010-09-16 00:17:48,City Park,1284563868,3,0,2010-09-16,8.0
636510,196561,1109654,15,37.523095,-122.004770,2010-09-15T21:39:42Z,2010-09-15 21:39:42,Mexican,1284554382,2,21,2010-09-15,5.0


In [None]:
# Apply the same frequency filter as the other datasets (presumably drops
# POIs/users seen fewer than 10 times -- see FileReader.do_filter).
ca_df = FileReader.do_filter(ca_df, poi_min_freq=10, user_min_freq=10)

# 1. Align column names: rebind to a renamed copy instead of mutating in place,
#    so re-running the cell never depends on hidden state.
ca_df = ca_df.rename(columns={'UTCTimeOffset': 'LocalTime'})

# NOTE(review): unlike NYC/TKY, no Holiday, UpperCategory or TrajectoryId
# columns are added for CA -- confirm this is intentional.

# 2. Tag every row with its split, then select each split by its SplitTag
ca_df = FileReader.split_train_test(ca_df)

train_ca = ca_df[ca_df['SplitTag'] == 'train']
val_ca = ca_df[ca_df['SplitTag'] == 'validation']
test_ca = ca_df[ca_df['SplitTag'] == 'test']

# 3. Save each split as CSV, without the index column
train_ca.to_csv('../data/ca/raw/CA_train.csv', index=False)
val_ca.to_csv('../data/ca/raw/CA_val.csv', index=False)
test_ca.to_csv('../data/ca/raw/CA_test.csv', index=False)