<a href="https://colab.research.google.com/github/Choijina/Python/blob/main/%EC%99%80%EC%9D%B8%EC%98%88%EC%B8%A1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample

%matplotlib inline
warnings.filterwarnings(action='ignore')

# Load the train and test CSV files and discard the columns that are not used
# afterwards: 'index' and 'quality' for train, 'index' for test.
train = pd.read_csv("./train.csv").drop(columns=['index', 'quality'])

test = pd.read_csv("./test.csv").drop(columns=['index'])

In [None]:
# Convert the 'type' column to numeric labels (0 / 1).
# The encoder is fitted on train only; the same mapping is then applied to both frames.
enc = LabelEncoder().fit(train['type'])
for frame in (train, test):
    frame['type'] = enc.transform(frame['type'])

In [None]:
# Missing-value check
def check_missing_col(train):
    """Report which columns of a DataFrame contain missing values.

    For every column holding at least one missing value, prints the column
    name and the number of missing entries. When no column has missing
    values, prints a single message saying so.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to inspect. Despite the parameter name, any frame can be passed.

    Returns
    -------
    list
        One ``[column_name, dtype]`` pair per column with missing values,
        in column order. Empty when nothing is missing.
    """
    # Count the missing entries of every column in one vectorised pass.
    missing_counts = train.isna().sum()

    missing_col = []
    for col, missing_values in missing_counts.items():
        if missing_values >= 1:
            print(f'결측치가 있는 컬럼은: {col}입니다.')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 있습니다.')
            missing_col.append([col, train[col].dtype])

    # An empty result list means no column had missing values.
    if not missing_col:
        print('데이터에 결측치가 존재하지 않습니다.')
    return missing_col
# Run the missing-value report for train, then for test.
# missing_col ends up holding the result of the last frame checked (test).
for label, frame in (('-train', train), ('\n-test', test)):
    print(label)
    missing_col = check_missing_col(frame)

-train
데이터에 결측치가 존재하지 않습니다.

-test
데이터에 결측치가 존재하지 않습니다.


In [None]:
# Correlation coefficients between the train variables, drawn as an annotated heatmap
corr_matrix = train.corr()
plt.figure(figsize=(11, 11))
sns.heatmap(corr_matrix, annot=True, cmap='Greens')

**통계량 시각화**

In [None]:
# Distribution (histogram + KDE) of up to the first 12 train columns on a 3x4 grid.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement documented by seaborn for the same kind of figure.
fig, axes = plt.subplots(3, 4, figsize=(12, 12))
# zip stops at the shorter sequence, so a frame with fewer than 12 columns
# no longer raises an IndexError.
for ax, col in zip(axes.flat, train.columns[:12]):
    sns.histplot(train[col], kde=True, stat="density", kde_kws=dict(cut=3), ax=ax)
fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the per-column mean of train
train.mean().plot(kind='bar')
plt.show()

In [None]:
# Bar chart of the per-column minimum of train
train.min().plot(kind='bar')
plt.show()

In [None]:
# Bar chart of the per-column maximum of train
train.max().plot(kind='bar')
plt.show()

In [None]:
# Violin plots: distribution of each feature, grouped by 'type'.
# 'type' itself is excluded from the y variables, since plotting the grouping
# variable against itself carries no information.
feature_cols = [col for col in train.columns if col != 'type']
fig, axes = plt.subplots(4, 3, figsize=(25, 15))

fig.suptitle('feature distributions per type', fontsize=40)
for ax, col in zip(axes.flat, feature_cols):
    sns.violinplot(x='type', y=col, ax=ax, data=train)
    ax.set_title(col, fontsize=20)
# Hide any grid cell that has no feature assigned to it.
for unused_ax in axes.ravel()[len(feature_cols):]:
    unused_ax.set_visible(False)
plt.tight_layout()
plt.show()

**변수들 간 type 분포 / Scatterplot으로 시각화**

In [None]:
# Scatter plot: x = 'density', y = 'residual sugar', coloured by 'type'
x_col, y_col = 'density', 'residual sugar'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=train, x=x_col, y=y_col, hue='type', ax=ax)
ax.set_xlabel('Density', fontsize=16)
ax.set_ylabel('residual sugar', fontsize=16)
ax.set_title('Density & residual sugar scatter plot', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.show()

In [None]:
# Scatter plot: x = 'total sulfur dioxide', y = 'residual sugar', coloured by 'type'
x_col, y_col = 'total sulfur dioxide', 'residual sugar'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=train, x=x_col, y=y_col, hue='type', ax=ax)
ax.set_xlabel('Total sulfur dioxide', fontsize=16)
ax.set_ylabel('residual sugar', fontsize=16)
ax.set_title('Total sulfur dioxide & residual sugar scatter plot', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.show()

In [None]:
# Scatter plot: x = 'total sulfur dioxide', y = 'volatile acidity', coloured by 'type'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(x='total sulfur dioxide', y='volatile acidity', hue='type', data=train, ax=ax)
# The title names the full x variable ("Total sulfur dioxide"), matching the
# axis label and the titles of the sibling scatter plots.
ax.set_title('Total sulfur dioxide & volatile acidity scatter plot', fontsize=20)
ax.set_xlabel('Total sulfur dioxide', fontsize=16)
ax.set_ylabel('volatile acidity', fontsize=16)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.show()

In [None]:
# Scatter plot: x = 'total sulfur dioxide', y = 'free sulfur dioxide', coloured by 'type'
x_col, y_col = 'total sulfur dioxide', 'free sulfur dioxide'
fig, ax = plt.subplots(figsize=(8, 8))
sns.scatterplot(data=train, x=x_col, y=y_col, hue='type', ax=ax)
ax.set_xlabel('Total sulfur dioxide', fontsize=16)
ax.set_ylabel('Free sulfur dioxide', fontsize=16)
ax.set_title('Total & Free sulfur dioxide scatter plot', fontsize=20)
ax.tick_params(axis='both', which='major', labelsize=14)
plt.show()

**전처리 과정**

In [None]:
# Standardise train: every column except 'type' is replaced by its scaled
# values, while 'type' is kept as it is.
train_feature_cols = [col for col in train.columns if col != 'type']
scaler = StandardScaler()
train[train_feature_cols] = scaler.fit_transform(train[train_feature_cols])

# Standardise test with the scaler fitted on train: only the non-'type'
# columns of test are replaced, 'type' is kept as it is.
test_feature_cols = [col for col in test.columns if col != 'type']
test[test_feature_cols] = scaler.transform(test[test_feature_cols])

In [None]:
# Preview the first rows of train after the standardisation step
train.head()

In [None]:
# Split each frame by the encoded 'type' value: 0 is stored as 'red', 1 as 'white'.
red_train = train[train['type'].eq(0)]
white_train = train[train['type'].eq(1)]

red_test = test[test['type'].eq(0)]
white_test = test[test['type'].eq(1)]

In [None]:
# Box plots of every column of the red subset, to inspect outliers
plt.figure(figsize=(30, 30))
for position, column in enumerate(red_train.columns, start=1):
    plt.subplot(6, 2, position)
    plt.boxplot(red_train[column], flierprops=dict(markerfacecolor='r', marker='o'))
    plt.title("Feature: " + column, fontsize=25)
# Lay the grid out once, after all subplots exist, instead of on every iteration.
plt.tight_layout()

plt.show()

In [None]:
# Box plots of every column of the white subset, to inspect outliers
plt.figure(figsize=(30, 30))
for position, column in enumerate(white_train.columns, start=1):
    plt.subplot(6, 2, position)
    plt.boxplot(white_train[column], flierprops=dict(markerfacecolor='r', marker='o'))
    plt.title("Feature: " + column, fontsize=25)
# Lay the grid out once, after all subplots exist, instead of on every iteration.
plt.tight_layout()

plt.show()

In [None]:
# Box plots of every column of the full train frame, to inspect outliers
plt.figure(figsize=(30, 30))
for position, column in enumerate(train.columns, start=1):
    plt.subplot(6, 2, position)
    plt.boxplot(train[column], flierprops=dict(markerfacecolor='r', marker='o'))
    plt.title("Feature: " + column, fontsize=25)
# Lay the grid out once, after all subplots exist, instead of on every iteration.
plt.tight_layout()

# plt.show must be called; a bare `plt.show` only references the function.
plt.show()

In [None]:
# Outlier removal
def remove_outlier(input_data, factor=1.5):
    """Drop rows lying outside the IQR fences of any numeric column.

    Numeric columns are processed one after another, skipping the column
    named 'type'. For each column the fences are ``Q1 - factor * IQR`` and
    ``Q3 + factor * IQR``, computed on the rows that survived the previous
    columns, and only rows inside the fences are kept.

    Values lying exactly on a fence are kept: only points strictly beyond a
    fence count as outliers. This also means a column with zero IQR (for
    example a constant column) keeps its rows instead of losing all of them.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame to filter. It is not modified; a filtered frame is returned.
    factor : float, default 1.5
        Multiplier applied to the IQR to position the fences.

    Returns
    -------
    pandas.DataFrame
        The rows of ``input_data`` that fall inside the fences of every
        processed column. Rows with a missing value in a processed column
        fail the comparison and are dropped as well.
    """
    numeric_columns = input_data.select_dtypes(include=np.number).columns
    for column in numeric_columns:
        if column == 'type':
            # The class label is not a measurement; never filter on it.
            continue
        values = input_data[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        margin = (q3 - q1) * factor
        lower, upper = q1 - margin, q3 + margin
        # Inclusive bounds: points sitting on a fence are not outliers.
        input_data = input_data[(values >= lower) & (values <= upper)]
    return input_data

# Apply the outlier filter separately to the red and white subsets of each frame.
# NOTE(review): the filter is also applied to the test subsets, which drops
# test rows — confirm this is intended.
red_rmv, white_rmv = remove_outlier(red_train), remove_outlier(white_train)
red_test_rmv, white_test_rmv = remove_outlier(red_test), remove_outlier(white_test)

In [None]:
# Stack the filtered red and white subsets back into single train / test frames.
# This rebinds the names train and test to the filtered data.
train = pd.concat([red_rmv, white_rmv])
test = pd.concat([red_test_rmv, white_test_rmv])

**모델링**

In [None]:
# RandomForest - baseline model, no hyperparameter tuning
# Split the train frame into a fitting part and a 20% held-out part
X = train.drop(columns='type')
y = train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model and fit it
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Predict on the held-out part
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("데이터 정확도:", accuracy)

# Precision, recall and F1, printed in that order
for label, metric in (
    ("Precision(정밀도):", precision_score),
    ("Recall(재현율):", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

데이터 정확도: 0.9976851851851852
Precision(정밀도): 0.9984871406959153
Recall(재현율): 0.9984871406959153
F1-Score: 0.9984871406959153


In [None]:
# Candidate hyperparameter values
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4],
}

# Exhaustive grid search with 5-fold cross-validation.
# The estimator is seeded so the selected combination is reproducible from run
# to run; n_jobs=-1 evaluates candidates in parallel without changing results.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination
print("최적의 하이퍼파라미터:", grid_search.best_params_)

최적의 하이퍼파라미터: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 6, 'n_estimators': 200}


In [None]:
# RandomForest - with tuned hyperparameters
# Split the train frame into a fitting part and a 20% held-out part
X = train.drop('type', axis=1)
y = train['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the model with the best combination found by the grid search (rather
# than hard-coded default values) and fit it. The seed keeps the fit reproducible.
model = RandomForestClassifier(random_state=42, **grid_search.best_params_)
model.fit(X_train, y_train)

# Save the fitted model to disk
with open('2021084047_최진아.model', 'wb') as fw:
    pickle.dump(model, fw)

In [None]:
from google.colab import files

# Prompt for a model file. files.upload() returns a dict mapping each uploaded
# file name to its raw bytes.
uploaded_files = files.upload()

# NOTE: unpickling can execute arbitrary code - only load model files you trust.
if uploaded_files:
    # Load the uploaded bytes directly. When a file of the same name already
    # exists, Colab stores the upload under a renamed copy ("... (1).model"),
    # so re-opening the fixed path would read the old local file, not the upload.
    loaded_model = pickle.loads(next(iter(uploaded_files.values())))
else:
    # Nothing was uploaded: fall back to the model saved earlier in this notebook.
    with open('2021084047_최진아.model', 'rb') as f:
        loaded_model = pickle.load(f)

Saving 2021084047_최진아.model to 2021084047_최진아 (1).model


In [None]:
# Predict on the held-out part with the loaded model
y_pred = loaded_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("데이터 정확도:", accuracy)

# Precision, recall and F1, printed in that order
for label, metric in (
    ("Precision(정밀도):", precision_score),
    ("Recall(재현율):", recall_score),
    ("F1-Score:", f1_score),
):
    print(label, metric(y_test, y_pred))

데이터 정확도: 0.9976851851851852
Precision(정밀도): 0.9969834087481146
Recall(재현율): 1.0
F1-Score: 0.9984894259818731
