In [None]:
# Import required libraries
# 필요 라이브러리 임포트
import numpy as np
import pandas as pd

# MinMaxScaler for normalization
# 정규화를 위한 MinMaxScaler
from sklearn.preprocessing import MinMaxScaler

# Linear Regression for trend analysis
# 추세 분석을 위한 선형 회귀
from sklearn.linear_model import LinearRegression

# Calculate correlation coefficient and p-value
# 상관계수 및 p-value 계산
from scipy.stats import pearsonr

# Import Plotly for visualization
# 시각화를 위한 Plotly 임포트
import plotly.graph_objects as go

# Load the raw Busan Port statistics, which include yearly gross tonnage (GT).
busanPortDf = pd.read_csv('../useData/busanport/busan_부산항(전체)_rawData.csv', encoding='utf-8', header=0)

# Keep only the Year and GT columns and drop 2025, whose statistics only run
# through February. The selection is done in a single .loc call and copied so
# that busanPortGT is an independent frame: the column assignment below then
# modifies busanPortGT itself rather than something derived from busanPortDf,
# which is the pattern that can trigger pandas' SettingWithCopyWarning.
busanPortGT = busanPortDf.loc[busanPortDf['Year'] < 2025, ['Year', 'GT(Gross Tonnage)']].copy()

# Strip the thousands separators and convert GT to integers. The width is
# spelled out as int64 because plain `int` can map to a 32-bit type on some
# platforms, which large tonnage totals could overflow.
busanPortGT['GT(Gross Tonnage)'] = (
    busanPortGT['GT(Gross Tonnage)']
    .str.replace(',', '', regex=False)
    .astype('int64')
)

# Load the CT (cargo throughput) data.
busanPortCT = pd.read_csv('../useData/busanCT(화물처리실적).csv', encoding='utf-8', header=0)

# Yearly total of cargo throughput. Only the CT column is aggregated: summing
# the whole frame would also add up unrelated columns (e.g. Month) and can
# fail or produce nonsense on non-numeric columns before they are discarded.
# as_index=False keeps 'Year' as a regular column, so the result has exactly
# the columns ['Year', 'CT(Cargo Throughput)'] with a default integer index.
busanPortCT_yearly = busanPortCT.groupby('Year', as_index=False)['CT(Cargo Throughput)'].sum()

def _prepare_schedule(raw_schedule, min_year=None):
    """Reduce a raw ship schedule table to Year, Month and Stay Time (hours).

    Args:
        raw_schedule: DataFrame with 'Enter Time' and 'Out Time' columns that
            pandas can parse as datetimes.
        min_year: If given, keep only rows whose entry year is >= this value.

    Returns:
        A new DataFrame with integer 'Year' and 'Month' columns taken from the
        entry time, and 'Stay Time' (exit time minus entry time) in hours,
        rounded to one decimal place. The row index of ``raw_schedule`` is
        preserved; ``raw_schedule`` itself is not modified.
    """
    # Parse each timestamp column once; year, month and the stay duration are
    # all derived from the parsed values instead of slicing the text form.
    enter_time = pd.to_datetime(raw_schedule['Enter Time'])
    out_time = pd.to_datetime(raw_schedule['Out Time'])
    stay_hours = ((out_time - enter_time).dt.total_seconds() / 3600).round(1)

    prepared = pd.DataFrame({
        'Year': enter_time.dt.year.astype(int),
        'Month': enter_time.dt.month.astype(int),
        'Stay Time': stay_hours,
    })
    if min_year is not None:
        prepared = prepared[prepared['Year'] >= min_year]
    return prepared


# Load the ship arrival/departure schedule data (2010-2015 and 2016-2024).
busanPortShipSchedule1 = pd.read_csv('../useData/SinhangSchedule_rawData(01_01_2010-31_12_2015).csv', encoding='utf-8', header=0)
busanPortShipSchedule2 = pd.read_csv('../useData/SinhangSchedule_rawData(01_01_2016-31_12_2024).csv', encoding='utf-8', header=0)

# Keep only the columns needed for the correlation analysis. The first file
# is restricted to 2013 onward to align with the gross tonnage data.
busanPortShipSchedule1 = _prepare_schedule(busanPortShipSchedule1, min_year=2013)
busanPortShipSchedule2 = _prepare_schedule(busanPortShipSchedule2)

# Stack both schedule periods into a single table with a fresh index.
busanPortShipSchedule = pd.concat([busanPortShipSchedule1, busanPortShipSchedule2], axis=0, ignore_index=True)

# Total stay time (hours) per year and month.
busanPortStayTime = busanPortShipSchedule.groupby(['Year', 'Month']).agg({'Stay Time': 'sum'}).reset_index()

# Yearly correlation table: join gross tonnage with the yearly cargo
# throughput totals on their shared 'Year' column.
busanPortCorrDf = busanPortGT.merge(busanPortCT_yearly, on='Year')

# Persist the unscaled yearly table.
busanPortCorrDf.to_csv(
    '../useData/busanPortCorrRaw.csv',
    encoding='utf-8-sig',
)

# Monthly correlation table: join monthly cargo throughput with the monthly
# stay-time totals on ('Year', 'Month').
busanPortCorrDf_monthly = busanPortCT.merge(busanPortStayTime, on=['Year', 'Month'])

# Persist the unscaled monthly table.
busanPortCorrDf_monthly.to_csv(
    '../useData/busanPortCorrMonthlyRaw.csv',
    encoding='utf-8-sig',
)

# One independent MinMaxScaler per scaled column, so each keeps the fitted
# range of its own column.
scaler_gt, scaler_ct1, scaler_ct2, scaler_stay_time = (MinMaxScaler() for _ in range(4))

# Fit each scaler on its column and overwrite that column with the scaled
# values (yearly table first, then the monthly table).
for _frame, _column, _scaler in (
    (busanPortCorrDf, 'GT(Gross Tonnage)', scaler_gt),
    (busanPortCorrDf, 'CT(Cargo Throughput)', scaler_ct1),
    (busanPortCorrDf_monthly, 'CT(Cargo Throughput)', scaler_ct2),
    (busanPortCorrDf_monthly, 'Stay Time', scaler_stay_time),
):
    _frame[_column] = _scaler.fit_transform(_frame[[_column]])
del _frame, _column, _scaler  # do not leave loop helpers in the namespace

# Persist the normalized yearly and monthly tables.
for _table, _path in (
    (busanPortCorrDf, '../useData/busanPortCorrScaled.csv'),
    (busanPortCorrDf_monthly, '../useData/busanPortCorrMonthlyScaled.csv'),
):
    _table.to_csv(_path, encoding='utf-8-sig')
del _table, _path
