### Read Data

In [1]:
import pandas as pd
import os
# Load the preprocessed rental log. The first CSV column becomes the index and
# the rental ('대여일시') / return ('반납일시') timestamps are parsed to datetime64.
# `infer_datetime_format` is intentionally not passed: it has been deprecated
# since pandas 2.0 (consistent-format inference is now the default), and the
# parsed result is the same without it.
dfpath = os.path.join('preprocessed', '2022_season2_processed.csv')
df = pd.read_csv(dfpath,
                 index_col=0,
                 parse_dates=['대여일시', '반납일시'])
# NOTE: `origin_data` is a second name for the same DataFrame object, not a copy.
origin_data = df
df.head()

Unnamed: 0,대여스테이션,대여일시,반납스테이션,반납일시,이동거리,rental_time
47,1,2022-07-23 10:12:20,985,2022-07-23 10:20:39,500,499
48,985,2022-07-23 10:23:19,885,2022-07-23 10:29:18,100,359
49,985,2022-07-23 10:23:24,885,2022-07-23 10:29:30,100,366
50,885,2022-07-23 10:33:08,1,2022-07-23 10:39:16,500,368
51,885,2022-07-23 10:33:17,1,2022-07-23 10:39:25,500,368


### Filter top N stations (Station -> CNU)

In [2]:
import numpy as np
# Station IDs matched against the return-station column ('반납스테이션') further down
# in this cell. Presumably these are the CNU stations named in the section
# heading — TODO confirm.
rental_cnu = [1016, 369, 1015, 366, 368, 1014, 367]
def retain_from_index_array(target: pd.Series, index: np.ndarray) -> pd.Series:
    """Build a boolean row mask marking which values of ``target`` occur in ``index``.

    Args:
        target: Series whose values are tested for membership.
        index: Collection of values to retain (NumPy array, list, ...).

    Returns:
        A boolean Series aligned with ``target`` (same index labels):
        True where the value is contained in ``index`` (keep the row),
        False otherwise (drop the row).
    """
    # Vectorised membership test. Unlike a per-item loop that assigns
    # True/False into a copy of `target`, this never mixes booleans into a
    # non-boolean Series and stays positionally correct even when the index
    # contains duplicate labels.
    return target.isin(index)
# Keep only the trips whose return station ('반납스테이션') is listed in `rental_cnu`.
returned_at_cnu = retain_from_index_array(origin_data['반납스테이션'], rental_cnu)
filtered = origin_data.loc[returned_at_cnu]
# Counter filled later in this cell: str(rental station id) -> number of trips.
stat_dict = dict()

def addToDict(d: dict, station: int):
    """Increment, in place, the counter kept in ``d`` for ``station``.

    The key is the string form of ``station``; a key seen for the first time
    starts at 1. Nothing is returned.
    """
    key = str(station)
    d[key] = d.get(key, 0) + 1

# Tally how many of the filtered trips started at each rental station.
for station_id in filtered['대여스테이션']:
    addToDict(stat_dict, station_id)
x = stat_dict.keys()
y = stat_dict.values()

# Frequency table (station id as string, trip count), most frequent first.
rentaldf = pd.DataFrame(data={'대여스테이션': x, '빈도': y}).sort_values(by='빈도', ascending=False)
# NOTE: despite the name, this keeps the first 30 rows, not 10.
top10rental = rentaldf.head(30)
top10rental

Unnamed: 0,대여스테이션,빈도
18,763,538
5,192,411
44,1016,401
3,85,366
17,765,334
63,1023,326
4,199,309
41,193,296
27,1052,234
6,1021,230


In [3]:
# Rental stations to model: every station in `top10rental` except station '1'
# (IDs are strings here because the frequency table is keyed by str(station)).
# Filtering instead of list.remove('1') means a missing '1' no longer raises
# ValueError.
target_station = [station for station in top10rental['대여스테이션'] if station != '1']

### Read Weather Data

In [4]:
# Daily weather table. Date parsing is not enabled here (the parse_dates
# arguments were commented out), so '날짜' keeps whatever dtype read_csv infers —
# presumably plain 'YYYY-MM-DD' strings, as the displayed output suggests; confirm.
weather_data = pd.read_csv('postprocessed/merged_data2.csv')
weather_data

Unnamed: 0,날짜,강수량(mm),평균기온(℃),평균풍속(m/s)
0,2022-05-04,0.0,16.7,2.1
1,2022-05-05,0.0,18.5,1.6
2,2022-05-06,0.0,19.6,1.5
3,2022-05-07,0.0,18.3,2.2
4,2022-05-08,0.9,17.1,2.8
...,...,...,...,...
388,2023-05-27,0.6,20.8,0.2
389,2023-05-28,34.7,20.2,1.0
390,2023-05-29,28.0,20.4,0.9
391,2023-05-30,0.0,21.6,2.3


### Restructure data for model training

각 대여소에 대하여 새로운 데이터셋을 구성

In [5]:
import datetime
# Daily weather table; '날짜' is not parsed, so it is looked up below by the
# string form of each date.
weather_data = pd.read_csv('postprocessed/merged_data2.csv')

# Output schema: weekday, am/pm flag, hour on a 12-hour clock, rental count,
# precipitation, temperature, wind speed.
new_columns = ['요일', 'am/pm', '시간대', '대여량', '강수량', '기온', '풍속']


def _build_hourly_dataset(rental_times, weather_by_date):
    """Build one row per (day, hour) between the first and last rental.

    Args:
        rental_times: non-empty Series of rental timestamps for one station.
        weather_by_date: dict mapping a date string to a
            (precipitation, temperature, wind speed) tuple.

    Returns:
        DataFrame with the columns listed in ``new_columns``; 24 rows per day.

    Raises:
        KeyError: if a day in the covered range has no weather record.
    """
    # min/max instead of first/last row, so the range does not depend on the
    # rentals being sorted chronologically.
    day_from = rental_times.min().date()
    day_to = rental_times.max().date()

    rows = []
    for day in pd.date_range(start=day_from, end=day_to, freq='D'):
        date_key = str(day.date())
        if date_key not in weather_by_date:
            raise KeyError(f'no weather record for {date_key}')
        precipitation, temperature, wind_speed = weather_by_date[date_key]

        # Rentals that started on this calendar day: [day, next_day).
        next_day = day + datetime.timedelta(days=1)
        in_day = rental_times[(rental_times >= day) & (rental_times < next_day)]

        # Rentals per hour of day, slots 0..23.
        hourly_counts = [0] * 24
        for rented_at in in_day:
            hourly_counts[rented_at.hour] += 1

        weekday = day.weekday()
        for hour, count in enumerate(hourly_counts):
            # hour // 12 -> 0 for a.m., 1 for p.m.; hour % 12 -> hour on a 12-hour clock.
            rows.append([weekday, hour // 12, hour % 12, count,
                         precipitation, temperature, wind_speed])

    # Build the frame once from the collected rows instead of concatenating a
    # one-row frame per hour, which is quadratic and yields object-dtype columns.
    return pd.DataFrame(rows, columns=new_columns)


# Date string -> (precipitation, temperature, wind speed). setdefault keeps the
# first record if a date appears more than once.
weather_by_date = {}
for date_key, precipitation, temperature, wind_speed in zip(
        weather_data['날짜'],
        weather_data['강수량(mm)'],
        weather_data['평균기온(℃)'],
        weather_data['평균풍속(m/s)']):
    weather_by_date.setdefault(date_key, (precipitation, temperature, wind_speed))

# Also process station 368; the guard keeps a re-run of this cell from appending
# it again.
if 368 not in target_station:
    target_station.append(368)

dataframe = pd.DataFrame(columns=new_columns)
for s in target_station:        # one dataset (and one CSV file) per station
    station_rentals = df.loc[df['대여스테이션'] == int(s), '대여일시'].dropna()
    if station_rentals.empty:
        # Nothing to aggregate: skip instead of failing on an empty selection.
        print(f'station {s}: no rentals found, skipped')
        continue
    dataframe = _build_hourly_dataset(station_rentals, weather_by_date)
    dataframe.to_csv(f'{s}_data.csv')   # save this station's dataset
# Displays the dataset of the last station processed.
dataframe

Unnamed: 0,요일,am/pm,시간대,대여량,강수량,기온,풍속
0,4,0,0,0,3.5,28.7,1.8
1,4,0,1,0,3.5,28.7,1.8
2,4,0,2,0,3.5,28.7,1.8
3,4,0,3,0,3.5,28.7,1.8
4,4,0,4,0,3.5,28.7,1.8
...,...,...,...,...,...,...,...
3547,4,1,7,0,0.0,-1.1,1.2
3548,4,1,8,0,0.0,-1.1,1.2
3549,4,1,9,0,0.0,-1.1,1.2
3550,4,1,10,0,0.0,-1.1,1.2


### Training example... to be modified.

In [6]:
from sklearn.linear_model import LinearRegression
# Features are every column except the target '대여량' (rental count).
x = dataframe.drop(columns=['대여량'])
y = dataframe['대여량']
model = LinearRegression()
model.fit(x, y)
# Predict from a one-row DataFrame carrying the training column names, so the
# values are matched to features by name instead of by bare position (a plain
# nested list makes scikit-learn warn about missing feature names).
# Values follow the column order of `x`:
# 요일=0, am/pm=0, 시간대=10, 강수량=0, 기온=23, 풍속=0.
sample = pd.DataFrame([[0, 0, 10, 0, 23, 0]], columns=x.columns)
model.predict(sample)



array([0.18226222])