In [1]:
import pandas as pd

# Path of the CSV file, relative to the current working directory.
DATA_PATH = "./musinsa_data.csv"

# Read the whole file into the DataFrame used by the following cells.
df = pd.read_csv(DATA_PATH)

In [2]:
# Preview the first five rows to see which columns were loaded.
df.head(n=5)

Unnamed: 0,temp,c_number,c_name,gender,height(cm),weight(kg),fit,link,size,c_result
0,28,0,레이어드 슬리브리스 탑_일반 기장 [화이트],남자,173,65,적당함,//image.msscdn.net/images/goods_img/20210825/2...,M,M0
1,28,0,레이어드 슬리브리스 탑_일반 기장 [화이트],남자,184,80,적당함,//image.msscdn.net/images/goods_img/20210825/2...,L,L0
2,28,0,레이어드 슬리브리스 탑_일반 기장 [화이트],남자,172,63,적당함,//image.msscdn.net/images/goods_img/20210825/2...,S,S0
3,28,0,레이어드 슬리브리스 탑_일반 기장 [화이트],남자,182,78,적당함,//image.msscdn.net/images/goods_img/20210825/2...,L,L0
4,28,0,레이어드 슬리브리스 탑_일반 기장 [화이트],남자,173,71,적당함,//image.msscdn.net/images/goods_img/20210825/2...,M,M0


### 결측치 확인

In [3]:
# Keep only the columns used further down; 'c_name' and 'link' are left out.
model_columns = ['temp', 'c_number', 'gender', 'height(cm)', 'weight(kg)', 'fit', 'size', 'c_result']
df = df.loc[:, model_columns]

# Count the missing values in each remaining column.
df.isna().sum()

temp          0
c_number      0
gender        0
height(cm)    0
weight(kg)    0
fit           0
size          0
c_result      0
dtype: int64

### 원핫인코딩

In [4]:
# Expand the categorical 'gender' and 'fit' columns into one indicator column
# per category; all other columns are carried over unchanged.
categorical_columns = ['gender', 'fit']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

# Preview the encoded frame.
df_encoded.head(n=5)


Unnamed: 0,temp,c_number,height(cm),weight(kg),size,c_result,gender_남자,gender_여자,fit_작음,fit_적당함,fit_큼
0,28,0,173,65,M,M0,1,0,0,1,0
1,28,0,184,80,L,L0,1,0,0,1,0
2,28,0,172,63,S,S0,1,0,0,1,0
3,28,0,182,78,L,L0,1,0,0,1,0
4,28,0,173,71,M,M0,1,0,0,1,0


### 레이블 인코딩 및 표준화

In [5]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Use one encoder per column. Re-fitting a single encoder for the second column
# would discard the mapping learned for the first one, so 'size' codes could no
# longer be turned back into their original labels.
size_encoder = LabelEncoder()
target_encoder = LabelEncoder()

# Fit on the untouched string columns of `df` rather than on `df_encoded`.
# `df_encoded` was built from `df` by pd.get_dummies, so the rows line up, and
# re-running this cell no longer refits the encoders on already-encoded integers.
# NOTE(review): LabelEncoder assigns codes in sorted label order (the preview
# shows L -> 0, M -> 1, S -> 2), which is not the natural size order. Confirm
# this is acceptable for 'size' when it is used as a feature.
df_encoded['size'] = size_encoder.fit_transform(df['size'])
df_encoded['c_result'] = target_encoder.fit_transform(df['c_result'])

# Backward compatibility: `le` used to end up fitted on 'c_result'.
le = target_encoder

# Define the columns to be standardized
cols_to_scale = ['temp', 'c_number', 'height(cm)', 'weight(kg)']

# Standardize the selected columns (zero mean, unit variance). The scaler is also
# fit on the raw values in `df`, so re-running the cell gives the same result and
# `scaler` always holds the statistics of the original data.
# NOTE(review): the scaler is fit on every row before the train/test split done
# in a later cell, so test-set statistics influence the scaling. Consider
# fitting it on the training split only.
scaler = StandardScaler()
df_encoded[cols_to_scale] = scaler.fit_transform(df[cols_to_scale])

# Display the first few rows of the dataframe
df_encoded.head()


Unnamed: 0,temp,c_number,height(cm),weight(kg),size,c_result,gender_남자,gender_여자,fit_작음,fit_적당함,fit_큼
0,1.652993,-1.743339,0.150039,-0.109088,1,39,1,0,0,1,0
1,1.652993,-1.743339,1.524469,1.120555,0,0,1,0,0,1,0
2,1.652993,-1.743339,0.025091,-0.273041,2,79,1,0,0,1,0
3,1.652993,-1.743339,1.274572,0.956602,0,0,1,0,0,1,0
4,1.652993,-1.743339,0.150039,0.382769,1,39,1,0,0,1,0


### 훈련 세트·테스트 세트 생성, 모델 훈련 및 성능 평가

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Target is the label-encoded 'c_result'; every other column is a feature.
# NOTE(review): in the previewed rows 'c_result' looks like 'size' followed by
# 'c_number' (e.g. M + 0 -> "M0"). Both are kept as features here, so the target
# may be recoverable directly from the inputs. Confirm this is intended.
y = df_encoded['c_result']
X = df_encoded.drop(columns='c_result')

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Baseline models, mostly with library-default hyperparameters.
classifiers = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42)
}

# Options shared by the class-averaged metrics: weight each class by its support
# and score classes that have no predicted/true samples as 0.
weighted_kwargs = {'average': 'weighted', 'zero_division': 0}

# Metrics per classifier, keyed by classifier name.
results = {}

for clf_name, clf in classifiers.items():
    # Fit on the training split, then predict the held-out rows.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1 = (
        metric(y_test, y_pred, **weighted_kwargs)
        for metric in (precision_score, recall_score, f1_score)
    )

    results[clf_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
    }

# One row per classifier, one column per metric.
df_results = pd.DataFrame(results).T

df_results


In [7]:
# Imported in this cell because it is the only place the wrapper is needed.
from sklearn.multiclass import OneVsRestClassifier

# Define the classifiers with updated (smaller / shallower) hyperparameters.
# The liblinear solver only handles binary problems natively, and the target here
# is multiclass (the label-encoded preview shows codes up to at least 79). Recent
# scikit-learn releases deprecate fitting liblinear directly on a multiclass
# target, so the one-vs-rest scheme it applied implicitly is made explicit with
# OneVsRestClassifier. Scores are expected to match the unwrapped model up to
# solver tolerance.
classifiers = {
    'Random Forest': RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=50, max_depth=3, random_state=42),
    'Logistic Regression': OneVsRestClassifier(
        LogisticRegression(solver='liblinear', max_iter=1000, random_state=42)
    ),
}

# Initialize a dictionary to hold the results (replaces any earlier `results`)
results = {}

# For each classifier
for clf_name, clf in classifiers.items():
    # Train the classifier on the training split created in the previous cell
    clf.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test)

    # Calculate the performance metrics. Class-averaged metrics are weighted by
    # class support; zero_division=0 scores undefined per-class values as 0
    # instead of emitting a warning.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

    # Store the results
    results[clf_name] = {'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1}

# Convert the results to a DataFrame: one row per classifier, one column per metric
df_results = pd.DataFrame(results).T

df_results


Unnamed: 0,Accuracy,Precision,Recall,F1 Score
Random Forest,0.670351,0.627137,0.670351,0.611963
Gradient Boosting,0.331599,0.311318,0.331599,0.307708
Logistic Regression,0.291938,0.269079,0.291938,0.226232
