# 모듈

In [None]:
import datetime as dt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import seaborn as sns
from fredapi import Fred

from scipy import stats
from scipy.stats import shapiro

from sklearnex import patch_sklearn
patch_sklearn()

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA 

from sklearn.pipeline import Pipeline
from sklearn import svm

from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, recall_score, precision_score, adjusted_rand_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.linear_model import Lasso, Ridge, LinearRegression, ElasticNet

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

from statsmodels.stats.outliers_influence import variance_inflation_factor

import xgboost as xgb

In [None]:
# Plot text uses a Korean-capable font (the original note targets Windows,
# where 'Malgun Gothic' is the font name), and the Unicode minus sign is
# turned off so negative tick labels do not render as broken glyphs.
plt.rcParams.update({
    'font.family': 'Malgun Gothic',
    'axes.unicode_minus': False,
})

# 데이터 읽기

In [None]:
# Load every sheet of the workbook in one call. Passing a list as sheet_name
# makes read_excel return a dict keyed by the requested sheet positions, so the
# file is opened and parsed once instead of once per sheet.
workbook_path = './new_data/new_test2.xlsx'
sheets = pd.read_excel(workbook_path, sheet_name=[0, 1, 2, 3, 4, 5])

df_quarter = sheets[0]      # quarterly data
df_month = sheets[1]        # monthly data
df_q_add = sheets[2]
df_mon_chg = sheets[3]
df_month_omiss = sheets[4]
df_month_2024 = sheets[5]   # 2024 data (kept for validation)


# 상관관계 시각화

In [None]:
# Pearson correlation of every column from position 5 onward with 'wti',
# sorted ascending and transposed so it renders as a single heatmap row.
# NOTE(review): the modelling cells treat the first 4 columns as date info;
# this slice starts at 5 and therefore also skips column 4 - confirm intended.
df_month_a = df_month.iloc[:, 5:]
df_month_corr = (
    df_month_a.corr()[['wti']]
    .sort_values(by='wti', ascending=True)
    .T
)

fig, ax = plt.subplots(figsize=(18, 2))
# Draw on the axes created above rather than relying on the implicit current axes.
sns.heatmap(df_month_corr, annot=True, fmt=".2f", yticklabels=False,
            cbar=True, cbar_kws={"orientation": "horizontal", "pad": 0.2},
            ax=ax)
ax.set_title("WTI 가격과 다른 변수들의 피어슨 상관계수")
# tight_layout only arranges artists that already exist, so it runs after the
# heatmap, colorbar and title have been added (it used to run on an empty figure).
fig.tight_layout()
plt.show()

In [None]:
# Show the column labels of the monthly frame (rendered as the cell output).
df_month.columns

# EDA 작업 수행

In [None]:
# Earlier, wider feature selections, kept for reference:
# filter = ['date', 'year', 'quarter', 'month', 'total_manu', 'dol_idx', 'm2',
#        'unemploy', 'interest', 'copper', 'cpi_purch_power', 'dspi', 'iron_ore',
#        'UMCSENT', 'us_spr', 'conflicts', 'world_production', 'opec_production',
#        'non_opec_producion', 'opec_pro_ratio', 'world_consumption',
#        'oecd_consumption', 'china_consumption', 'china_con_ratio', 'pro-con',
#        'wti', 'wti_change']

# filter = ['date', 'year', 'quarter', 'month', 'dol_idx', 'm2',
#        'unemploy', 'interest', 'copper',
#        'UMCSENT', 'us_spr', 'conflicts', 'opec_pro_ratio', 'world_consumption',
#        'china_con_ratio', 'pro-con', 'wti', 'wti_change'
#        ]

# Columns kept for modelling: 4 date columns, the features, then 'wti' and
# 'wti_change'.
# NOTE: the name shadows the built-in filter(); it is kept because the later
# cells index df_month with it.
filter = ['date', 'year', 'quarter', 'month', 'm2',
          'unemploy', 'interest', 'copper',
          'UMCSENT', 'conflicts',
          'pro-con', 'wti', 'wti_change'
          ]

# Feature matrix only: positions 0-3 are date info, -2 is the wti price and
# -1 is the up/down flag (0 or 1).
df_vif = df_month.dropna()[filter].iloc[:, 4:-2]

# variance_inflation_factor regresses one column on all the others and does
# not add an intercept itself. Without a constant column the auxiliary
# regressions are forced through the origin and the VIFs come out uncentered
# (inflated for features whose mean is far from zero), so a constant is
# prepended and skipped when reporting.
vif_design = np.column_stack([np.ones(len(df_vif)), df_vif.to_numpy(dtype=float)])

vif_data = pd.DataFrame({
    "feature": df_vif.columns,
    # Column 0 of vif_design is the constant; features start at position 1.
    "VIF": [
        variance_inflation_factor(vif_design, col)
        for col in range(1, vif_design.shape[1])
    ],
})

print(vif_data)

In [None]:
# Plain linear regression on the same-month WTI price.

# Drop rows with a missing value in any column, then keep the selected columns.
df_month = df_month.dropna()[filter]
df_X = df_month.iloc[:, 4:-2]   # features: skip the date columns and the wti columns
df_y = df_month.iloc[:, -2]     # target: wti price

# Validation on the 2024 sheet is currently disabled:
# df_month_2024 = df_month_2024.dropna()
# df_X_validate = df_month_2024.iloc[:,4:-2]
# df_y_validate = df_month_2024.iloc[:,-2]

# shuffle=False keeps the row order: leading 70% for training, trailing 30% for testing.
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lm = LinearRegression().fit(X_train_scaled, y_train)
pred = lm.predict(X_test_scaled)

# Evaluation metrics on the held-out rows.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

# Report.
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# Predictions and actual values side by side, indexed like the test target.
df_plot = pd.DataFrame(
    {'pred': pred, 'test': y_test.to_numpy()},
    index=y_test.index,
)
residuals = y_test - pred

# Left: predicted vs. actual lines. Right: normal probability plot of the residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
sns.lineplot(data=df_plot, ax=ax1)
stats.probplot(residuals, plot=ax2)
plt.show()

# Shapiro-Wilk test: a p-value above 0.05 can be read as the residuals being
# consistent with normality.
stat, p = shapiro(residuals)
print('Statistics=%.3f, p=%.3f' % (stat, p))

In [None]:
# LASSO regression on the same-month WTI price.
# Uses the df_X / df_y already present in the notebook namespace.

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso = Lasso(alpha=0.1).fit(X_train_scaled, y_train)
pred = lasso.predict(X_test_scaled)

# Evaluation metrics on the held-out rows.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

# Report.
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# Predictions and actual values side by side, indexed like the test target.
df_plot = pd.DataFrame(
    {'pred': pred, 'test': y_test.to_numpy()},
    index=y_test.index,
)
residuals = y_test - pred

# Left: predicted vs. actual lines. Right: normal probability plot of the residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
sns.lineplot(data=df_plot, ax=ax1)
stats.probplot(residuals, plot=ax2)
plt.show()

# Shapiro-Wilk test: a p-value above 0.05 can be read as the residuals being
# consistent with normality.
stat, p = shapiro(residuals)
print('Statistics=%.3f, p=%.3f' % (stat, p))

In [None]:
# Ridge regression on the same-month WTI price.
# Uses the df_X / df_y already present in the notebook namespace.

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge = Ridge(alpha=1).fit(X_train_scaled, y_train)
pred = ridge.predict(X_test_scaled)

# Evaluation metrics on the held-out rows.
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, pred)
r2 = r2_score(y_test, pred)

# Report.
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# Predictions and actual values side by side, indexed like the test target.
df_plot = pd.DataFrame(
    {'pred': pred, 'test': y_test.to_numpy()},
    index=y_test.index,
)
residuals = y_test - pred

# Left: predicted vs. actual lines. Right: normal probability plot of the residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
sns.lineplot(data=df_plot, ax=ax1)
stats.probplot(residuals, plot=ax2)
plt.show()

# Shapiro-Wilk test: a p-value above 0.05 can be read as the residuals being
# consistent with normality.
stat, p = shapiro(residuals)
print('Statistics=%.3f, p=%.3f' % (stat, p))

In [None]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


# 5-fold cross-validation over contiguous, unshuffled blocks of rows.
# NOTE(review): the rows appear to be time-ordered (every split in this
# notebook uses shuffle=False); plain K-fold then trains some folds on later
# rows than it scores - consider TimeSeriesSplit if that matters.
kf = KFold(n_splits=5, shuffle=False)

# The ridge model above was trained on standardised features, so the
# cross-validated estimator standardises too. Doing it inside a pipeline means
# each fold's scaler is fitted on that fold's training rows only.
# cross_val_score clones the estimator, so the already-fitted `ridge` object
# is left untouched while its hyper-parameters are reused.
ridge_cv = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', ridge),
])

# One score per fold, using the estimator's default scorer.
scores = cross_val_score(ridge_cv, df_X, df_y, cv=kf)
print("Cross-validation scores per fold:", scores)
print("Mean cross-validation score:", scores.mean())

In [None]:
# Ridge regression against the WTI price one month later.

df_month = df_month.dropna()[filter]

# Number of rows the target is shifted forward relative to the features.
gap = 1

# Drop the last `gap` feature rows and the first `gap` target rows. The split
# below pairs rows by position, so each feature row is matched with the wti
# value `gap` rows later.
df_X = df_month.iloc[:-gap, 4:-2]
df_y = df_month.iloc[gap:, -2]

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

ridge_1m = Ridge(alpha=1)
ridge_1m.fit(X_train_scaled, y_train)
pred = ridge_1m.predict(X_test_scaled)

# Evaluation metrics (previously mse and r2 were computed twice in a row).
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

# Report.
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# Predictions and actual values side by side, indexed like the test target.
df_plot = pd.DataFrame(pred, columns=['pred'])
df_plot['test'] = y_test.reset_index(drop=True)
df_plot.index = y_test.index
residuals = y_test - pred

# Left: predicted vs. actual lines. Right: normal probability plot of the residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
sns.lineplot(df_plot, ax=ax1)
stats.probplot(residuals, plot=ax2)
plt.show()

# Shapiro-Wilk test: a p-value above 0.05 can be read as the residuals being
# consistent with normality.
stat, p = shapiro(residuals)
print('Statistics=%.3f, p=%.3f' % (stat, p))

In [None]:
# LASSO regression against the WTI price one month later.
# Uses the df_X / df_y already present in the notebook namespace.

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

lasso_1m = Lasso(alpha=0.1)
lasso_1m.fit(X_train_scaled, y_train)
pred = lasso_1m.predict(X_test_scaled)

# Evaluation metrics (previously mse and r2 were computed twice in a row).
mse = mean_squared_error(y_test, pred)
mae = mean_absolute_error(y_test, pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, pred)

# Report.
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error: {rmse}")
print(f"R-squared Score: {r2}")

# Predictions and actual values side by side, indexed like the test target.
df_plot = pd.DataFrame(pred, columns=['pred'])
df_plot['test'] = y_test.reset_index(drop=True)
df_plot.index = y_test.index
residuals = y_test - pred

# Left: predicted vs. actual lines. Right: normal probability plot of the residuals.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
sns.lineplot(df_plot, ax=ax1)
stats.probplot(residuals, plot=ax2)
plt.show()

# Shapiro-Wilk test: a p-value above 0.05 can be read as the residuals being
# consistent with normality.
stat, p = shapiro(residuals)
print('Statistics=%.3f, p=%.3f' % (stat, p))

In [None]:
# Coefficients of the four fitted linear models, one row per feature column.
df_result = pd.DataFrame({
    'name': df_X.columns.tolist(),
    'lasso': lasso.coef_.tolist(),
    'lasso_1m': lasso_1m.coef_.tolist(),
    'ridge': ridge.coef_.tolist(),
    'ridge_1m': ridge_1m.coef_.tolist(),
})
df_result

In [None]:
# L2-penalised (ridge) logistic regression / values as-is.

from sklearn.linear_model import LogisticRegression

df_month = df_month.dropna()[filter]

df_X = df_month.iloc[:, 4:-2]   # features
df_y = df_month.iloc[:, -1]     # label: last selected column (wti_change)

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# C is the inverse of the regularisation strength: smaller C, stronger penalty.
ridge_lr = LogisticRegression(penalty='l2', C=1, random_state=17)
ridge_lr.fit(X_train_scaled, y_train)

y_pred = ridge_lr.predict(X_test_scaled)
y_pred_proba = ridge_lr.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ridge_lr.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# ax2 is the current axes here, so these are the same calls the pyplot
# shortcuts would make.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L1-penalised (LASSO) logistic regression / values as-is.
# Uses the scaled train/test split already present in the notebook namespace.

lasso_lr = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=2380971)
lasso_lr.fit(X_train_scaled, y_train)

y_pred = lasso_lr.predict(X_test_scaled)
y_pred_proba = lasso_lr.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, a different estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lasso_lr.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L2-penalised (ridge) logistic regression / one month later.

from sklearn.linear_model import LogisticRegression

df_month = df_month.dropna()[filter]

# Number of rows the label is shifted forward relative to the features.
gap = 1

# Drop the last `gap` feature rows and the first `gap` label rows. The split
# below pairs rows by position, so each feature row is matched with the label
# `gap` rows later.
df_X = df_month.iloc[:-gap, 4:-2]
df_y = df_month.iloc[gap:, -1]

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# C is the inverse of the regularisation strength: smaller C, stronger penalty.
ridge_lr_1m = LogisticRegression(penalty='l2', C=1, random_state=17)
ridge_lr_1m.fit(X_train_scaled, y_train)

y_pred = ridge_lr_1m.predict(X_test_scaled)
y_pred_proba = ridge_lr_1m.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, a different estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ridge_lr_1m.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L1-penalised (LASSO) logistic regression / one month later.
# Uses the scaled train/test split already present in the notebook namespace.

lasso_lr_1m = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=2380971)
lasso_lr_1m.fit(X_train_scaled, y_train)

y_pred = lasso_lr_1m.predict(X_test_scaled)
y_pred_proba = lasso_lr_1m.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, a different estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lasso_lr_1m.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# Random Forest classification / values as-is (no lag).

# Build the unlagged features and label explicitly. When the notebook is run
# top to bottom, df_X / df_y still hold the one-month-shifted pairing from the
# preceding logistic-regression cells, which contradicts this cell's title.
# dropna()[filter] gives the same frame whether or not df_month was already
# reduced by an earlier cell.
rf_frame = df_month.dropna()[filter]
rf_X = rf_frame.iloc[:, 4:-2]
rf_y = rf_frame.iloc[:, -1]

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    rf_X, rf_y, test_size=0.3, random_state=17, shuffle=False
)

# Scaling lives inside the pipeline, so every CV fold fits its own scaler.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # feature scaling
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))  # random forest model
])

# Hyper-parameter grid; the 'rf__' prefix routes each entry to the 'rf' step.
param_grid = {
    'rf__n_estimators': [10, 50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)

grid_search.fit(X_train, y_train)

# predict / predict_proba are delegated to the refitted best estimator.
y_pred = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]   # second column of predict_proba


print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

print("최적의 파라미터:", grid_search.best_params_)
print("최적의 점수:", grid_search.best_score_)

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, an unrelated estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# Coefficients of the four logistic models, one row per feature column.
# coef_ is two-dimensional for these models; only its first row is tabulated.
df_coef = pd.DataFrame({
    'name': df_month.columns[4:-2],
    'ridge': ridge_lr.coef_[0].tolist(),
    'ridge_1m': ridge_lr_1m.coef_[0].tolist(),
    'lasso': lasso_lr.coef_[0].tolist(),
    'lasso_1m': lasso_lr_1m.coef_[0].tolist(),
})

df_coef

아래는 변화율로 분석 수행

In [None]:
# L2-penalised (ridge) logistic regression / rate-of-change analysis.
# NOTE(review): the heading says rate-of-change data, but this cell still reads
# df_month; df_mon_chg, loaded from the workbook earlier, is not referenced in
# any visible cell - confirm which frame is meant to feed this section.

from sklearn.linear_model import LogisticRegression

df_month = df_month.dropna()[filter]

df_X = df_month.iloc[:, 4:-2]   # features
df_y = df_month.iloc[:, -1]     # label: last selected column (wti_change)

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# C is the inverse of the regularisation strength: smaller C, stronger penalty.
ridge_lr = LogisticRegression(penalty='l2', C=1, random_state=17)
ridge_lr.fit(X_train_scaled, y_train)

y_pred = ridge_lr.predict(X_test_scaled)
y_pred_proba = ridge_lr.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ridge_lr.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# ax2 is the current axes here, so these are the same calls the pyplot
# shortcuts would make.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L1-penalised (LASSO) logistic regression / rate-of-change analysis.
# Uses the scaled train/test split already present in the notebook namespace.

lasso_lr = LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=17)
lasso_lr.fit(X_train_scaled, y_train)

y_pred = lasso_lr.predict(X_test_scaled)
y_pred_proba = lasso_lr.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, a different estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lasso_lr.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L2-penalised (ridge) logistic regression / rate-of-change analysis / one month later.

df_month = df_month.dropna()[filter]

# Number of rows the label is shifted forward relative to the features.
gap = 1

# Features lose their last `gap` rows and the label its first `gap` rows; the
# split below pairs them by position.
df_X = df_month.iloc[:-gap, 4:-2]
df_y = df_month.iloc[gap:, -1]

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# Standardise with statistics learned from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# C is the inverse of the regularisation strength: smaller C, stronger penalty.
ridge_lr_1m = LogisticRegression(penalty='l2', C=1, random_state=17)
ridge_lr_1m.fit(X_train_scaled, y_train)

y_pred = ridge_lr_1m.predict(X_test_scaled)
y_pred_proba = ridge_lr_1m.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=ridge_lr_1m.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# ax2 is the current axes here, so these are the same calls the pyplot
# shortcuts would make.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# L1-penalised (LASSO) logistic regression / one month later / rate-of-change analysis.
# Uses the scaled train/test split already present in the notebook namespace.

lasso_lr_1m = LogisticRegression(penalty='l1', solver='liblinear', C=1, random_state=17)
lasso_lr_1m.fit(X_train_scaled, y_train)

y_pred = lasso_lr_1m.predict(X_test_scaled)
y_pred_proba = lasso_lr_1m.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba

print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, a different estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lasso_lr_1m.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# Coefficients of the four logistic models, one row per feature column.
# coef_ is two-dimensional for these models; only its first row is tabulated.
df_coef = pd.DataFrame({
    'name': df_month.columns[4:-2],
    'ridge': ridge_lr.coef_[0].tolist(),
    'ridge_1m': ridge_lr_1m.coef_[0].tolist(),
    'lasso': lasso_lr.coef_[0].tolist(),
    'lasso_1m': lasso_lr_1m.coef_[0].tolist(),
})

df_coef

In [None]:
# Random Forest classification / one month later.
# Uses the df_X / df_y already present in the notebook namespace.

# Ordered 70/30 split (no shuffling).
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.3, random_state=17, shuffle=False
)

# The scaled copies are still produced because the XGBoost cell that follows
# reads X_train_scaled / X_test_scaled; the pipeline below no longer uses them.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Scaling lives inside the pipeline, so every CV fold fits its own scaler.
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # feature scaling
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42))  # random forest model
])

# Hyper-parameter grid; the 'rf__' prefix routes each entry to the 'rf' step.
param_grid = {
    'rf__n_estimators': [10, 50, 100, 200],
    'rf__max_depth': [None, 10, 20, 30],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__bootstrap': [True, False]
}

grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=1, n_jobs=-1)

# Feed the raw features: the pipeline already standardises, so passing the
# pre-scaled arrays (as before) scaled the data twice and was inconsistent with
# the other Random Forest cell, which fits on X_train.
grid_search.fit(X_train, y_train)

# predict / predict_proba are delegated to the refitted best estimator.
y_pred = grid_search.predict(X_test)
y_pred_proba = grid_search.predict_proba(X_test)[:, 1]   # second column of predict_proba


print("정확도:", accuracy_score(y_test, y_pred))
print("정밀도:", precision_score(y_test, y_pred))
print("재현도:", recall_score(y_test, y_pred))

print("최적의 파라미터:", grid_search.best_params_)
print("최적의 점수:", grid_search.best_score_)

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, an unrelated estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()

In [None]:
# XGBoost classification.
# Uses the scaled train/test split already present in the notebook namespace.

param_grid = {
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.3],
    'n_estimators': [100, 200, 300],
    'subsample': [0.7, 0.8, 0.9],
    'colsample_bytree': [0.7, 0.8, 0.9],
}

# XGBoost classifier. The seed was -17, which looks like a typo of the 17 used
# for the other seeds in this notebook; a non-negative seed is used instead.
xgb_clf = xgb.XGBClassifier(random_state=17)

# Grid search with 5-fold CV, scored by accuracy.
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

grid_search.fit(X_train_scaled, y_train)
print("최적의 파라미터:", grid_search.best_params_)
print("최적의 점수:", grid_search.best_score_)

# Predict with the refitted best model.
y_pred = grid_search.predict(X_test_scaled)
y_pred_proba = grid_search.predict_proba(X_test_scaled)[:, 1]   # second column of predict_proba
print("테스트 세트 정확도:", accuracy_score(y_test, y_pred))

# Left: confusion matrix. Right: ROC curve.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

cm = confusion_matrix(y_test, y_pred)
# Label the matrix with the classes of the model evaluated here (it used to
# borrow them from ridge_lr, an unrelated estimator).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=grid_search.classes_)
disp.plot(ax=ax1)

fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

ax2.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax2.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')   # diagonal reference line

# Style the ROC axes explicitly instead of through pyplot's current-axes state.
ax2.set_xlim([0.0, 1.0])
ax2.set_ylim([0.0, 1.05])
ax2.set_xlabel('False Positive Rate')
ax2.set_ylabel('True Positive Rate')
ax2.set_title('Receiver Operating Characteristic (ROC) Curve')
ax2.legend(loc="lower right")
plt.show()