## 필수 라이브러리 및 data load

In [4]:
import os
from pprint import pprint

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from collections import Counter

from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix, f1_score,
    precision_score, recall_score
)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE


In [5]:
from google.colab import drive
drive.mount('/content/drive')
train_data = pd.read_csv("/content/drive/MyDrive/쿠글 1차 프로젝트/test.csv")

Mounted at /content/drive


In [6]:
print(train_data.shape)

(2077964, 21)


## 결측치 50% 이상 제거 / 중복행 제거 / 레이블 인코딩

In [7]:
# 결측값 50% 이상 및 id 컬럼 제거
columns_to_drop = ['stem-root', 'stem-surface', 'veil-type', 'veil-color', 'spore-print-color', 'id']

# train_data에서 해당 컬럼 삭제
train_data = train_data.drop(columns=columns_to_drop)

print(train_data.shape)

(2077964, 15)


In [8]:
label_encoder = LabelEncoder()

# 'class'와 'season' 컬럼에 레이블 인코딩 적용
train_data['season'] = label_encoder.fit_transform(train_data['season'])

# 결과 확인
print(train_data['season'].head())

0    0
1    0
2    1
3    2
4    2
Name: season, dtype: int64


## data information

In [9]:
display(train_data.head(20))

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season
0,8.64,x,,n,t,,,w,11.13,17.12,w,t,g,d,0
1,6.9,o,t,o,f,,c,y,1.27,10.75,n,f,f,d,0
2,2.0,b,g,n,f,,c,n,6.18,3.14,n,f,f,d,1
3,3.47,x,t,n,f,s,c,n,4.98,8.51,w,t,z,d,2
4,6.17,x,h,y,f,p,,y,6.73,13.7,y,t,,d,2
5,4.43,x,h,n,f,x,c,n,5.36,5.5,n,t,r,d,0
6,2.92,x,d,n,f,p,,e,4.83,10.27,y,f,f,d,0
7,2.59,o,,k,f,f,f,f,2.73,12.71,g,f,f,d,0
8,4.13,x,t,o,f,a,c,n,5.36,6.59,o,t,z,d,3
9,11.91,f,e,b,f,,c,b,5.32,20.2,w,t,f,d,0


In [10]:
# 타입
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 15 columns):
 #   Column                Dtype  
---  ------                -----  
 0   cap-diameter          float64
 1   cap-shape             object 
 2   cap-surface           object 
 3   cap-color             object 
 4   does-bruise-or-bleed  object 
 5   gill-attachment       object 
 6   gill-spacing          object 
 7   gill-color            object 
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            object 
 11  has-ring              object 
 12  ring-type             object 
 13  habitat               object 
 14  season                int64  
dtypes: float64(3), int64(1), object(11)
memory usage: 237.8+ MB


In [11]:
# 고유값
unique_counts = train_data.nunique()
print(unique_counts)

# label encoding = 'class' 'season'
# binary encoding = 그 외 컬럼
# 정규화/표준화_float type이니까 = 'cap-diameter', 'stem-height', 'stem-width'

cap-diameter            3745
cap-shape                 62
cap-surface               59
cap-color                 57
does-bruise-or-bleed      22
gill-attachment           66
gill-spacing              35
gill-color                56
stem-height             2664
stem-width              5610
stem-color                55
has-ring                  23
ring-type                 36
habitat                   39
season                     4
dtype: int64


In [12]:
# null 값
null_columns = train_data.isnull().sum()

# null 값이 있는 컬럼만 필터링
null_columns_with_data = null_columns[null_columns > 0]

# null 값이 있는 컬럼과 해당 null 값의 개수를 출력
print("Null 값이 있는 컬럼과 그 개수:")
print(null_columns_with_data)

# null 값이 있는 컬럼의 수를 출력
print(f"\nNull 값이 있는 컬럼의 수: {len(null_columns_with_data)}")

Null 값이 있는 컬럼과 그 개수:
cap-diameter                 7
cap-shape                   31
cap-surface             446904
cap-color                   13
does-bruise-or-bleed        10
gill-attachment         349821
gill-spacing            839595
gill-color                  49
stem-height                  1
stem-color                  21
has-ring                    19
ring-type                86195
habitat                     25
dtype: int64

Null 값이 있는 컬럼의 수: 13


## 결측치 대체

### float type

In [13]:
# 각 컬럼의 skewness(왜도) 계산
skewness_values = train_data[['cap-diameter', 'stem-height', 'stem-width']].skew()
print(skewness_values)

cap-diameter    4.957161
stem-height     1.921951
stem-width      1.249341
dtype: float64


In [14]:
# skewness > 0.5 이므로 'cap-diameter', 'stem-height' 컬럼의 결측치를 중앙값 대체
median_value = train_data['cap-diameter'].median()
train_data['cap-diameter'].fillna(median_value, inplace=True)

median_value = train_data['stem-height'].median()
train_data['stem-height'].fillna(median_value, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['cap-diameter'].fillna(median_value, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_data['stem-height'].fillna(median_value, inplace=True)


### object type

In [15]:
# 범주형 변수들 중에서 고유값의 하위 1%를 'Unknown'으로 변환하는 함수
def replace_infrequent_categories(df, column, threshold=0.01):
    # 각 고유값의 빈도 계산
    value_counts = df[column].value_counts(normalize=True)

    # 하위 1%에 해당하는 고유값 추출
    infrequent = value_counts[value_counts <= threshold].index

    # 해당 고유값들을 'Unknown'으로 대체
    df[column] = df[column].apply(lambda x: 'Unknown' if x in infrequent else x)

    return df

# train_data에서 object 타입의 모든 컬럼에 대해 Unknown 처리
for col in train_data.select_dtypes(include=['object']).columns:
    train_data = replace_infrequent_categories(train_data, col, threshold=0.05)

# 결과 확인
print(train_data.head())

   cap-diameter cap-shape cap-surface cap-color does-bruise-or-bleed  \
0          8.64         x         NaN         n                    t   
1          6.90   Unknown           t         o                    f   
2          2.00         b           g         n                    f   
3          3.47         x           t         n                    f   
4          6.17         x           h         y                    f   

  gill-attachment gill-spacing gill-color  stem-height  stem-width stem-color  \
0             NaN          NaN          w        11.13       17.12          w   
1             NaN            c          y         1.27       10.75          n   
2             NaN            c          n         6.18        3.14          n   
3               s            c          n         4.98        8.51          w   
4               p          NaN          y         6.73       13.70          y   

  has-ring ring-type habitat  season  
0        t   Unknown       d       0  
1 

In [16]:
# 'Unknown'이라는 문자열로 대체 후 원본 데이터 유지
object_columns = train_data.select_dtypes(include=['object']).columns

# 각 object 타입 컬럼의 결측값을 'Unknown'으로 대체
train_data[object_columns] = train_data[object_columns].fillna('Unknown')

# 결과 출력
print(train_data[object_columns].isnull().sum())

cap-shape               0
cap-surface             0
cap-color               0
does-bruise-or-bleed    0
gill-attachment         0
gill-spacing            0
gill-color              0
stem-color              0
has-ring                0
ring-type               0
habitat                 0
dtype: int64


## float type_시각화

In [19]:
# 새로운 이진 변수 'is_spring_or_fall' 생성
train_data['is_spring_or_fall'] = train_data['season'].apply(lambda x: 1 if x in ['spring', 'fall'] else 0)
1
# 결과 확인
print(train_data[['season', 'is_spring_or_fall']].head())

   season  is_spring_or_fall
0       0                  0
1       0                  0
2       1                  0
3       2                  0
4       2                  0


## 최종 data 확인

In [20]:
display(train_data.head(20))
print(train_data.shape)

Unnamed: 0,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-color,has-ring,ring-type,habitat,season,is_spring_or_fall
0,8.64,x,Unknown,n,t,Unknown,Unknown,w,11.13,17.12,w,t,Unknown,d,0,0
1,6.9,Unknown,t,o,f,Unknown,c,y,1.27,10.75,n,f,f,d,0,0
2,2.0,b,g,n,f,Unknown,c,n,6.18,3.14,n,f,f,d,1,0
3,3.47,x,t,n,f,s,c,n,4.98,8.51,w,t,Unknown,d,2,0
4,6.17,x,h,y,f,p,Unknown,y,6.73,13.7,y,t,Unknown,d,2,0
5,4.43,x,h,n,f,x,c,n,5.36,5.5,n,t,Unknown,d,0,0
6,2.92,x,d,n,f,p,Unknown,Unknown,4.83,10.27,y,f,f,d,0,0
7,2.59,Unknown,Unknown,Unknown,f,Unknown,f,Unknown,2.73,12.71,Unknown,f,f,d,0,0
8,4.13,x,t,o,f,a,c,n,5.36,6.59,Unknown,t,Unknown,d,3,0
9,11.91,f,Unknown,Unknown,f,Unknown,c,Unknown,5.32,20.2,w,t,f,d,0,0


(2077964, 16)


In [21]:
# 타입
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2077964 entries, 0 to 2077963
Data columns (total 16 columns):
 #   Column                Dtype  
---  ------                -----  
 0   cap-diameter          float64
 1   cap-shape             object 
 2   cap-surface           object 
 3   cap-color             object 
 4   does-bruise-or-bleed  object 
 5   gill-attachment       object 
 6   gill-spacing          object 
 7   gill-color            object 
 8   stem-height           float64
 9   stem-width            float64
 10  stem-color            object 
 11  has-ring              object 
 12  ring-type             object 
 13  habitat               object 
 14  season                int64  
 15  is_spring_or_fall     int64  
dtypes: float64(3), int64(2), object(11)
memory usage: 253.7+ MB


In [22]:
# 고유값
unique_counts = train_data.nunique()
print(unique_counts)

cap-diameter            3745
cap-shape                  5
cap-surface                8
cap-color                  7
does-bruise-or-bleed       3
gill-attachment            7
gill-spacing               4
gill-color                 7
stem-height             2664
stem-width              5610
stem-color                 4
has-ring                   3
ring-type                  2
habitat                    4
season                     4
is_spring_or_fall          1
dtype: int64


In [23]:
# null 값
null_columns = train_data.isnull().sum()

# null 값이 있는 컬럼만 필터링
null_columns_with_data = null_columns[null_columns > 0]

# null 값이 있는 컬럼과 해당 null 값의 개수를 출력
print("Null 값이 있는 컬럼과 그 개수:")
print(null_columns_with_data)

# null 값이 있는 컬럼의 수를 출력
print(f"\nNull 값이 있는 컬럼의 수: {len(null_columns_with_data)}")

Null 값이 있는 컬럼과 그 개수:
Series([], dtype: int64)

Null 값이 있는 컬럼의 수: 0


## 파일 저장

In [27]:
train_data.to_csv(r"/content/drive/MyDrive/쿠글 1차 프로젝트/test_cleaned.csv")

print('저장 완료')

저장 완료


In [26]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
