# **라이브러리 로드**

In [None]:
from math import sqrt
import numpy as np
import pandas as pd

# 데이터 전처리 패키지
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# 모델 패키지
import statsmodels.api as sm
from sklearn.ensemble import RandomForestClassifier
from deap import base, creator, tools, algorithms

# 모델 평가 패키지
from sklearn.metrics import mean_squared_error, accuracy_score

# 시각화 패키지
import matplotlib.pyplot as plt

# **차원 축소 - 변수 선택법**

## **1. 기본 모델**

### **1-1. 데이터 불러오기 및 전처리**

ToyotaCorolla.csv
- 중고차 판매 데이터 셋
- 가격, 연식, 주행 거리, 연료 유형, 엔진 크기, 옵션 정보 등 여러 변수 포함

In [None]:
# Read the used-car dataset from disk and preview its first five rows
data = pd.read_csv(filepath_or_buffer="ToyotaCorolla.csv")
data.head(n=5)

In [None]:
# Print the DataFrame summary produced by pandas' info()
data.info()

### **1-2. 데이터 분리**

In [None]:
# Data preprocessing
# 'Price' is the dependent variable; 'Id' and 'Model' are dropped from the predictors
X = data.drop(columns=['Price', 'Id', 'Model'])  # independent variables
y = data['Price']


# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=34)

# One-hot encode the object columns
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Each split is encoded on its own, so a category that is absent from one split
# would give the two frames different dummy columns. Force the test frame onto
# the training schema: columns unseen in training are dropped, and columns
# missing from the test split are created as all-zero.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Cast boolean dummy columns to integers
X_train = X_train.astype({col: 'int' for col in X_train.select_dtypes(include='bool').columns})
X_test = X_test.astype({col: 'int' for col in X_test.select_dtypes(include='bool').columns})
X_train.info()

In [None]:
# Add the intercept column to the training predictors
X_train = sm.add_constant(X_train)

# Fit ordinary least squares on every predictor and show the regression table
model = sm.OLS(endog=y_train, exog=X_train).fit()
model.summary()


***

## **2. 전진선택법**

In [None]:
# Forward selection: start from an intercept-only model and repeatedly add the
# candidate whose coefficient has the smallest p-value, until no remaining
# candidate is significant at sl_enter. Variables are never removed once
# added (adding and removing is what the stepwise section does).

# Candidate predictors, in their original column order. The intercept is not a
# candidate: add_constant supplies it for every fit.
variables = [col for col in X_train.columns if col != 'const']
y = y_train

# Variables selected so far
forward_variables = []

# p-value a candidate must beat to enter the model
sl_enter = 0.05

# Variables selected at each step
sv_per_step = []
# Adjusted R-squared at each step
adj_r_squared_list = []
# Step numbers, kept for plotting
steps = []
step = 0


while len(forward_variables) < len(variables):
    # Ordered list (not a set) so that p-value ties are broken the same way on every run
    remainder = [col for col in variables if col not in forward_variables]
    # Explicit float dtype: a Series built without data or dtype may be
    # object-typed, on which idxmin is not supported
    pval = pd.Series(index=remainder, dtype=float)

    # Fit one model per candidate and record that candidate's p-value
    for col in remainder:
        candidate_X = sm.add_constant(X_train[forward_variables + [col]])
        pval[col] = sm.OLS(y, candidate_X).fit().pvalues[col]

    # Stop when the best candidate is not significant (also true when every p-value is NaN)
    if not pval.min() < sl_enter:
        break

    forward_variables.append(pval.idxmin())

    # Record the state of the model after this step
    step += 1
    steps.append(step)
    step_X = sm.add_constant(X_train[forward_variables])
    adj_r_squared_list.append(sm.OLS(y, step_X).fit().rsquared_adj)
    sv_per_step.append(forward_variables.copy())


#### **Step별 선택된 변수와 수정된 결정계수(Adjusted R²) 시각화**

In [None]:
# Adjusted R-squared per step; each x tick is labelled with the step number
# followed by the variables selected at that step
font_size = 15

fig = plt.figure(figsize=(10, 10))
fig.set_facecolor('white')

tick_labels = [f'step {s}\n' + '\n'.join(chosen) for s, chosen in zip(steps, sv_per_step)]
plt.xticks(steps, tick_labels, fontsize=1)
plt.plot(steps, adj_r_squared_list, marker='o')

plt.grid(True)
plt.ylabel('adj_r_squared', fontsize=font_size)
plt.show()


In [None]:
# Refit OLS on the variables held in forward_variables and show the regression table
forward_design = sm.add_constant(X_train[forward_variables])
forward_model = sm.OLS(y_train, forward_design).fit(disp=0)
forward_model.summary()


***

## **3. 후진 소거법**

In [None]:
def backward_regression(X, y,
                           initial_list=None,
                           threshold_out=0.05,
                           feature_list=None,
                           ):
    """Select predictors by backward elimination on OLS p-values.

    Starting from ``feature_list``, the predictor with the largest p-value is
    removed while that p-value exceeds ``threshold_out``.

    Parameters
    ----------
    X : DataFrame of predictors.
    y : response passed to ``sm.OLS``.
    initial_list : unused; kept for backward compatibility.
    threshold_out : a predictor is removed while its p-value is above this.
    feature_list : columns of ``X`` to start from. Defaults to all columns of
        ``X``. The list passed in is copied, never modified.

    Returns
    -------
    tuple ``(included, step, steps, adj_r_squared_list, sv_per_step)``:
    the surviving columns, the number of recorded steps, the step numbers,
    the adjusted R-squared after each step and the surviving columns after
    each step. The last step records the final model, in which nothing was
    removed.
    """
    # Work on a private copy so repeated calls always start from the full set
    included = list(X.columns) if feature_list is None else list(feature_list)

    sv_per_step = []  # variables kept after each step
    adj_r_squared_list = []  # adjusted R-squared after each step
    steps = []  # step numbers
    step = 0
    while True:
        model = sm.OLS(y, sm.add_constant(X[included])).fit()

        # Exclude the intercept by name so the result does not depend on
        # where the 'const' column sits among the columns
        pvalues = model.pvalues.drop('const', errors='ignore')
        changed = (not pvalues.empty) and pvalues.max() > threshold_out
        if changed:
            # Drop the least significant predictor and refit without it
            included.remove(pvalues.idxmax())
            adj_r_squared = sm.OLS(y, sm.add_constant(X[included])).fit().rsquared_adj
        else:
            # Nothing was removed, so the model fitted above is already the final one
            adj_r_squared = model.rsquared_adj

        step += 1
        steps.append(step)
        adj_r_squared_list.append(adj_r_squared)
        sv_per_step.append(included.copy())

        if not changed:
            break

    return included, step, steps, adj_r_squared_list, sv_per_step

# Run backward elimination on the training data and unpack the five values it returns
backward_variables_function,step,steps,adj_r_squared_list,sv_per_step = backward_regression(X_train, y_train)


#### **Step별 선택된 변수와 수정된 결정계수(Adjusted R²) 시각화**

In [None]:
# Adjusted R-squared per step; each x tick is labelled with the step number
# followed by the variables still in the model at that step
font_size = 15

fig = plt.figure(figsize=(10, 10))
fig.set_facecolor('white')

tick_labels = [f'step {s}\n' + '\n'.join(kept) for s, kept in zip(steps, sv_per_step)]
plt.xticks(steps, tick_labels, fontsize=1)
plt.plot(steps, adj_r_squared_list, marker='o')

plt.grid(True)
plt.ylabel('adj_r_squared', fontsize=font_size)
plt.show()


In [None]:
# Refit OLS on the variables kept by backward elimination and show the regression table
backward_design = sm.add_constant(X_train[backward_variables_function])
back_model = sm.OLS(y_train, backward_design).fit(disp=0)
back_model.summary()

***

## **4. stepwise**

In [None]:
def stepwise_feature_selection(X_train, y_train, variables=None):
    """Select predictors by stepwise (add, then prune) OLS p-value selection.

    Each step adds the candidate with the smallest p-value if it is below
    0.05, then repeatedly removes the selected variable with the largest
    p-value while that p-value is at least 0.05. The adjusted R-squared after
    every step is plotted before returning.

    Parameters
    ----------
    X_train : DataFrame of predictors.
    y_train : response passed to ``sm.OLS``.
    variables : candidate column names. Defaults to all columns of
        ``X_train``. A column named 'const' is never treated as a candidate,
        because the intercept is added to every fit by ``add_constant``.

    Returns
    -------
    list of the selected column names (without 'const').
    """
    if variables is None:
        variables = X_train.columns.tolist()
    # Keep the candidates in their given order so p-value ties are broken reproducibly
    candidates = [col for col in variables if col != 'const']
    y = y_train

    selected_variables = []  # variables currently in the model

    # p-value thresholds for entering and leaving the model
    sl_enter = 0.05
    sl_remove = 0.05

    sv_per_step = []  # variables selected after each step
    adjusted_r_squared = []  # adjusted R-squared after each step
    steps = []  # step numbers
    step = 0
    while True:
        remainder = [col for col in candidates if col not in selected_variables]
        if not remainder:
            break
        # Explicit float dtype: a Series built without data or dtype may be
        # object-typed, on which idxmin is not supported
        pval = pd.Series(index=remainder, dtype=float)

        # Fit one model per candidate and record that candidate's p-value
        for col in remainder:
            X = sm.add_constant(X_train[selected_variables + [col]])
            pval[col] = sm.OLS(y, X).fit().pvalues[col]

        # Stop when the best candidate is not significant (also true when every p-value is NaN)
        if not pval.min() < sl_enter:
            break
        selected_variables.append(pval.idxmin())

        # Prune: drop selected variables that are no longer significant
        while selected_variables:
            selected_X = sm.add_constant(X_train[selected_variables])
            # Exclude the intercept by name rather than by position
            selected_pval = sm.OLS(y, selected_X).fit().pvalues.drop('const', errors='ignore')
            if selected_pval.max() >= sl_remove:
                selected_variables.remove(selected_pval.idxmax())
            else:
                break

        step += 1
        steps.append(step)
        adj_r_squared = sm.OLS(y, sm.add_constant(X_train[selected_variables])).fit().rsquared_adj
        adjusted_r_squared.append(adj_r_squared)
        sv_per_step.append(selected_variables.copy())

    # Plot adjusted R-squared per step; each tick lists the variables selected at that step
    fig = plt.figure(figsize=(100,10))
    fig.set_facecolor('white')

    font_size = 15
    plt.xticks(steps,[f'step {s}\n'+'\n'.join(sv_per_step[i]) for i,s in enumerate(steps)], fontsize=12)
    plt.plot(steps,adjusted_r_squared, marker='o')

    plt.ylabel('Adjusted R Squared',fontsize=font_size)
    plt.grid(True)
    plt.show()

    return selected_variables
    

# Run stepwise selection on the training data (the function also draws its per-step plot)
selected_variables = stepwise_feature_selection(X_train, y_train)


In [None]:
# Refit OLS on the variables returned by stepwise selection and show the regression table
stepwise_design = sm.add_constant(X_train[selected_variables])
stepwise_model = sm.OLS(y_train, stepwise_design).fit(disp=0)
stepwise_model.summary()


***

## **성능 비교(RMSE)**

In [None]:
# Compare the test-set RMSE of the full, forward, backward and stepwise models.

# The models were fitted on frames that include an intercept column, which
# X_test never received. Build a test design matrix with the training columns
# (dummy columns missing from the test split become 0) plus an intercept of ones.
X_test_design = X_test.reindex(columns=X_train.columns, fill_value=0)
X_test_design['const'] = 1.0


def _test_exog(fitted_model):
    """Return the test predictors in the column order ``fitted_model`` was fitted with."""
    # Selecting by the model's own exog names keeps each column lined up with its coefficient
    return X_test_design[fitted_model.model.exog_names]


# full feature set
y_pred = model.predict(_test_exog(model))

rmse = sqrt(mean_squared_error(y_test, y_pred))
print("full_rmse : ",rmse)

# forward selection
y_pred = forward_model.predict(_test_exog(forward_model))

rmse = sqrt(mean_squared_error(y_test, y_pred))
print("forward_rmse : ",rmse)

# backward elimination
y_pred = back_model.predict(_test_exog(back_model))

rmse = sqrt(mean_squared_error(y_test, y_pred))
print("back_rmse : ",rmse)

# stepwise selection
y_pred = stepwise_model.predict(_test_exog(stepwise_model))

rmse = sqrt(mean_squared_error(y_test, y_pred))
print("stepwise_rmse : ",rmse)


***
## **5. 유전 알고리즘**

PCOS_data.csv
- 여성 호르몬 장애 환자들에 대한 데이터 셋
- 호르몬 이상을 경험한 환자들의 체내 특성 정보 포함

In [None]:
def preprocess_data(data):
    """Return a copy of ``data`` with every object-dtype column label-encoded.

    Each object column is replaced by the output of
    ``LabelEncoder().fit_transform`` on that column. The input DataFrame is
    left untouched; the encoded copy is returned.
    """
    # Copy first so the caller's frame is not modified as a side effect
    encoded = data.copy()
    for col in encoded.select_dtypes(include=['object']).columns:
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [None]:
# Read the PCOS dataset and run it through preprocess_data
data = preprocess_data(pd.read_csv("PCOS_data.csv"))
# Preview the first rows to see how the dataset is laid out
data.head()

In [None]:
# 'PCOS (Y/N)' is the dependent variable
y = data['PCOS (Y/N)']
# Predictors: everything except the target and the listed columns
# (errors='ignore' skips any listed column that is not present)
X = data.drop(columns=['PCOS (Y/N)', 'Sl. No', 'Patient File No.', 'Unnamed: 44'], errors='ignore')


# Hold out 30% of the rows as the test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [None]:
# Print the DataFrame summary of the predictor matrix
X.info()

In [None]:
# Genetic algorithm setup
# Fitness type with a single objective of weight +1.0 (accuracy is to be maximized)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a list carrying a FitnessMax fitness
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

In [None]:
# One gene per feature column of X
n_features = X.shape[1]

# Gene: whether a feature is selected, encoded as 0 or 1
toolbox.register("attr_bool", np.random.randint, 2)

# Individual: a list of n_features genes
toolbox.register(
    "individual", tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features
)

# Population: a list of individuals
toolbox.register("population", tools.initRepeat, list, toolbox.individual)


In [None]:
# Fitness function
def evaluate(individual):
    """Score a feature subset by validation accuracy of a random forest.

    ``individual`` is a sequence of 0/1 genes, one per column of ``X_train``;
    the columns whose gene equals 1 are used. The training split is divided
    into a fitting part and a validation part, and the accuracy on the
    validation part is returned. The test split is deliberately not used
    here, so that it stays unseen until the final model is evaluated.

    Returns a one-element tuple ``(accuracy,)``, or ``(0.0,)`` when no
    feature is selected.
    """
    selected_features = [index for index, value in enumerate(individual) if value == 1]
    if not selected_features:  # penalize individuals that select no feature
        return 0.0,

    # Fixed random_state: every individual is scored on the same validation rows
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train.iloc[:, selected_features], y_train, test_size=0.3, random_state=42
    )

    model = RandomForestClassifier(random_state=42)
    model.fit(X_fit, y_fit)
    accuracy = accuracy_score(y_val, model.predict(X_val))
    return accuracy,

# Use evaluate() as the fitness function
toolbox.register("evaluate", evaluate)

# Crossover, mutation and selection operators
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
toolbox.register("select", tools.selTournament, tournsize=3)


In [None]:
# Baseline: random forest trained on every feature, scored on the test split
initial_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
initial_accuracy = accuracy_score(
    y_test,
    initial_predictions := initial_model.predict(X_test),
)
print(f"Initial accuracy (all features): {initial_accuracy:.4f}")

In [None]:
# Run feature selection with the genetic algorithm
ngen = 20  # number of generations
cxpb = 0.5  # crossover probability
mutpb = 0.2  # mutation probability
population = toolbox.population(n=50)  # initial population size

# Per-generation statistics over the fitness values
stats = tools.Statistics(lambda ind: ind.fitness.values)
for stat_name, reducer in (("avg", np.mean), ("min", np.min), ("max", np.max)):
    stats.register(stat_name, reducer)

population, logbook = algorithms.eaSimple(
    population,
    toolbox,
    cxpb=cxpb,
    mutpb=mutpb,
    ngen=ngen,
    stats=stats,
    verbose=True,
)

# Take the fittest individual of the final population and decode its genes
best_individual = tools.selBest(population, k=1)[0]
selected_features = [pos for pos, gene in enumerate(best_individual) if gene == 1]
print(f"Best individual: {best_individual}")
print(f"Selected features: {selected_features}")

In [None]:
# Accuracy of a random forest restricted to the selected features
if not selected_features:
    final_accuracy = 0.0
    print("No features were selected.")
else:
    X_train_selected = X_train.iloc[:, selected_features]
    X_test_selected = X_test.iloc[:, selected_features]

    final_model = RandomForestClassifier(random_state=42).fit(X_train_selected, y_train)
    final_predictions = final_model.predict(X_test_selected)
    final_accuracy = accuracy_score(y_test, final_predictions)
    print(f"Final accuracy (selected features): {final_accuracy:.4f}")

In [None]:
# Difference between the selected-feature accuracy and the all-feature baseline
improvement = final_accuracy - initial_accuracy
print(f"Accuracy improvement: {improvement:.4f}")