In [1]:
import pandas as pd
df = pd.read_csv('data1.csv')
df.head()

Unnamed: 0,temp,gender,height(cm),weight(kg),fit,c_result
0,28,여자,162,47,큼,M1
1,28,여자,170,55,적당함,S1
2,28,여자,162,55,큼,M1
3,28,여자,171,75,큼,L1
4,28,여자,167,50,큼,M1


In [2]:
df_encoded = pd.get_dummies(df,columns=['gender','fit'])
df_encoded.head()

Unnamed: 0,temp,height(cm),weight(kg),c_result,gender_남자,gender_여자,fit_작음,fit_적당함,fit_큼
0,28,162,47,M1,0,1,0,0,1
1,28,170,55,S1,0,1,0,1,0
2,28,162,55,M1,0,1,0,0,1
3,28,171,75,L1,0,1,0,0,1
4,28,167,50,M1,0,1,0,0,1


In [3]:
from sklearn.preprocessing import StandardScaler


cols_to_scale = ['temp', 'height(cm)', 'weight(kg)']
scaler = StandardScaler()

df_encoded[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

df_encoded.head()

Unnamed: 0,temp,height(cm),weight(kg),c_result,gender_남자,gender_여자,fit_작음,fit_적당함,fit_큼
0,1.609488,-1.202216,-1.567394,M1,0,1,0,0,1
1,1.609488,-0.189824,-0.910017,S1,0,1,0,1,0
2,1.609488,-1.202216,-0.910017,M1,0,1,0,0,1
3,1.609488,-0.063275,0.733426,L1,0,1,0,0,1
4,1.609488,-0.569471,-1.320877,M1,0,1,0,0,1


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score



In [8]:
X = df_encoded.drop('c_result', axis=1)
y = df_encoded['c_result']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [9]:
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

results = {}

for clf_name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) # 양성 클래스 중 모델이 정확하게 양성으로 예측한 비율
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) # 정밀도와 재현율을 조화평균을 나타내는 지표
    
    results[clf_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

df_results = pd.DataFrame(results).T

df_results

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Random Forest,0.746769,0.761821,0.746769,0.743627
Gradient Boosting,0.673913,0.694164,0.673913,0.670851
Decision Tree,0.794947,0.810401,0.794947,0.794506
SVM,0.300823,0.268607,0.300823,0.246042


## 오버샘플링

In [10]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42, k_neighbors=3)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [11]:
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'SVM': SVC(random_state=42)
}

results = {}

for clf_name, clf in classifiers.items():
    clf.fit(X_train_resampled, y_train_resampled)
    y_pred = clf.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) # 양성 클래스 중 모델이 정확하게 양성으로 예측한 비율
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) # 정밀도와 재현율을 조화평균을 나타내는 지표
    
    results[clf_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}
    
df_results = pd.DataFrame(results).T

df_results

Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Random Forest,0.753231,0.773907,0.753231,0.753374
Gradient Boosting,0.609871,0.628065,0.609871,0.610455
Decision Tree,0.779671,0.800316,0.779671,0.780791
SVM,0.246769,0.321614,0.246769,0.234676
