# Notebook cell In [1]:
# Term Project : Algorithm - Regression
# 2024-05-31

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import tree

sns.set(style='whitegrid')

# Load the integrated dataset; the 'Date' column becomes the row index.
file_path = '../00_Data/Integrated_data/Integration_b.csv'
data = pd.read_csv(file_path, index_col='Date')

# Numeric encoding for the categorical economic-growth label.
encoding_dict = {'negative': -1, 'normal': 0, 'weak positive': 0.5, 'positive': 1}
growth_col = 'Economic growth state'

# Fail with an actionable message instead of a bare KeyError when the CSV
# does not contain the expected column (e.g. renamed or padded header).
if growth_col not in data.columns:
    raise KeyError(
        f"{growth_col!r} column not found in {file_path!r}; "
        f"available columns: {list(data.columns)}"
    )

raw_growth = data[growth_col]
encoded_growth = raw_growth.map(encoding_dict)

# Series.map yields NaN for labels missing from encoding_dict, and the
# dropna() below would then discard those rows silently -- report them.
unmapped_labels = raw_growth[raw_growth.notna() & encoded_growth.isna()].unique()
if len(unmapped_labels):
    print("Warning: rows with unrecognised", repr(growth_col),
          "labels will be dropped:", list(unmapped_labels))

data[growth_col] = encoded_growth

# Drop every row that still contains a missing value.
# To inspect those rows first: data[data.isnull().any(axis=1)]
data = data.dropna()

# Quick look at the cleaned data.
print(data.head())

# Independent variables (X) and dependent variable (y).
X = data.drop(columns=['Traffic rate'])
y = data['Traffic rate']

# Split into training and test sets (80 % / 20 %, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

########## Linear Regression

# Build and train the model; fit() returns the estimator itself, so
# construction and training are chained into one expression.
model_linear = LinearRegression().fit(X_train, y_train)

# Predict on the held-out test split.
y_pred_linear = model_linear.predict(X_test)

# Evaluate the predictions against the true test targets:
# mean squared error first, then the R^2 score.
mse_linear, r2_linear = (
    metric(y_test, y_pred_linear) for metric in (mean_squared_error, r2_score)
)

print("[Linear Regression] Mean Squared Error:", mse_linear)
print("[Linear Regression] R^2 Score:", r2_linear)

########## Visualize - Linear Regression

# One figure with two side-by-side panels.
fig_linear, (ax_fit_linear, ax_resid_linear) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel -- scatter of actual vs predicted values, plus the red
# y = x reference line spanning the range of the actual values.
actual_lo_linear, actual_hi_linear = min(y_test), max(y_test)
ax_fit_linear.scatter(y_test, y_pred_linear, alpha=0.5)
ax_fit_linear.set_xlabel("Actual Traffic rate")
ax_fit_linear.set_ylabel("Predicted Traffic rate")
ax_fit_linear.set_title("[Linear Regression] Actual vs Predicted Traffic rate")
ax_fit_linear.plot(
    [actual_lo_linear, actual_hi_linear],
    [actual_lo_linear, actual_hi_linear],
    color='red',
)

# Right panel -- residuals (actual minus predicted) against the
# predictions, with a dashed zero line for reference.
residuals_linear = y_test - y_pred_linear
ax_resid_linear.scatter(y_pred_linear, residuals_linear, alpha=0.5)
ax_resid_linear.set_xlabel("Predicted Traffic rate")
ax_resid_linear.set_ylabel("Residuals")
ax_resid_linear.set_title("[Linear Regression] Residuals Plot")
ax_resid_linear.axhline(0, color='red', linestyle='--')

fig_linear.tight_layout()
plt.show()


############## Random Forest Regression (Ensemble method)

# Train a forest of 100 trees with a fixed seed, then predict the test split.
model_randomForest = RandomForestRegressor(n_estimators=100, random_state=42)
model_randomForest.fit(X_train, y_train)

y_pred_randomForest = model_randomForest.predict(X_test)

# Per-feature importances and the matching feature names seen during fit.
importances = model_randomForest.feature_importances_
names = model_randomForest.feature_names_in_

print(importances)
print(names)

# The individual fitted trees of the ensemble.
estimator_list = model_randomForest.estimators_

# Visualise the feature importances as a bar chart.
sns.barplot(x=names, y=importances)
plt.title('[Random Forest] Feature Importance')
plt.show()

# Optional: draw the first tree of the forest. The figure is created only
# together with the plot; creating it unconditionally left a blank 20x20
# figure open that was displayed by the next plt.show().
# plt.figure(figsize=(20, 20))
# tree.plot_tree(estimator_list[0], feature_names=X.columns, filled=True)
# plt.show()

# Evaluate model performance on the test split.
mse_randomForest = mean_squared_error(y_test, y_pred_randomForest)
r2_randomForest = r2_score(y_test, y_pred_randomForest)

print("[Random Forest] Mean Squared Error:", mse_randomForest)
print("[Random Forest] R^2 Score:", r2_randomForest)


########## Visualize - Random Forest Regression (Ensemble method)

# One figure with two side-by-side panels.
fig_randomForest, (ax_fit_randomForest, ax_resid_randomForest) = plt.subplots(
    1, 2, figsize=(14, 6)
)

# Left panel -- scatter of actual vs predicted values, plus the red
# y = x reference line spanning the range of the actual values.
actual_lo_randomForest, actual_hi_randomForest = min(y_test), max(y_test)
ax_fit_randomForest.scatter(y_test, y_pred_randomForest, alpha=0.5)
ax_fit_randomForest.set_xlabel("Actual Traffic rate")
ax_fit_randomForest.set_ylabel("Predicted Traffic rate")
ax_fit_randomForest.set_title("[Random Forest] Actual vs Predicted Traffic rate")
ax_fit_randomForest.plot(
    [actual_lo_randomForest, actual_hi_randomForest],
    [actual_lo_randomForest, actual_hi_randomForest],
    color='red',
)

# Right panel -- residuals (actual minus predicted) against the
# predictions, with a dashed zero line for reference.
residuals_randomForest = y_test - y_pred_randomForest
ax_resid_randomForest.scatter(y_pred_randomForest, residuals_randomForest, alpha=0.5)
ax_resid_randomForest.set_xlabel("Predicted Traffic rate")
ax_resid_randomForest.set_ylabel("Residuals")
ax_resid_randomForest.set_title("[Random Forest] Residuals Plot")
ax_resid_randomForest.axhline(0, color='red', linestyle='--')

fig_randomForest.tight_layout()
plt.show()

# Recorded cell output: KeyError: 'Economic growth state'