<a href="https://colab.research.google.com/github/Edeline24601/ai-assignment/blob/main/%5BAI2025_Assignment%5D_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignments

In this assignment, you will use the chemical composition data of wine to train a classification model of your choice (picked freely from among several classifiers) that identifies the type of wine, and then check the accuracy of the model.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Data Load and Preprocessing

In [2]:
from sklearn.datasets import load_wine

# Load the wine dataset that ships with scikit-learn into a dict-like object.
wine = load_wine()

## Inspecting Data and Splitting Train_set, Test_set

* test_size = 0.3
* Check how the data is organized through print(wine['DESCR'])

In [3]:
# List the top-level keys available on the loaded dataset object.
print(wine.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [4]:
# Check the dataset description (instance count, attribute list, summary statistics).
print(wine['DESCR'])

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

:Number of Instances: 178
:Number of Attributes: 13 numeric, predictive attributes and the class
:Attribute Information:
    - Alcohol
    - Malic acid
    - Ash
    - Alcalinity of ash
    - Magnesium
    - Total phenols
    - Flavanoids
    - Nonflavanoid phenols
    - Proanthocyanins
    - Color intensity
    - Hue
    - OD280/OD315 of diluted wines
    - Proline
    - class:
        - class_0
        - class_1
        - class_2

:Summary Statistics:

                                Min   Max   Mean     SD
Alcohol:                      11.0  14.8    13.0   0.8
Malic Acid:                   0.74  5.80    2.34  1.12
Ash:                          1.36  3.23    2.36  0.27
Alcalinity of Ash:            10.6  30.0    19.5   3.3
Magnesium:                    70.0 162.0    99.7  14.3
Total Phenols:                0.98  3.88    2.29  0.63
Flavanoids:                   0.34  5.08    2.03  1.00

In [5]:
# Show the names of the predictive feature columns, then the names of the target classes.
print(wine.feature_names)
print(wine.target_names)

['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
['class_0' 'class_1' 'class_2']


### Please define X and Y to predict `class` through `wine_df`.

In [6]:
# Assemble the feature table and attach the class labels as a 'class' column.
wine_df = pd.DataFrame(wine['data'], columns=wine.feature_names).assign(
    **{'class': wine.target}
)

# Target: the label column. Features: every column except the label.
Y = wine_df['class']
X = wine_df.drop(columns='class')

In [7]:
# Peek at the first rows of the feature matrix.
print(X.head())

   alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
0    14.23        1.71  2.43               15.6      127.0           2.80   
1    13.20        1.78  2.14               11.2      100.0           2.65   
2    13.16        2.36  2.67               18.6      101.0           2.80   
3    14.37        1.95  2.50               16.8      113.0           3.85   
4    13.24        2.59  2.87               21.0      118.0           2.80   

   flavanoids  nonflavanoid_phenols  proanthocyanins  color_intensity   hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   od280/od315_of_diluted_wines  proline  
0                  

In [8]:
# Peek at both ends of the target; the output shows class 0 at the start and class 2 at the end.
print(Y.head())
print(Y.tail())

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64
173    2
174    2
175    2
176    2
177    2
Name: class, dtype: int64


In [9]:
################### Freely Preprocess (Scaling MUST be done after splitting train/test!) ######################
# Count missing values per column to decide whether imputation is needed.
print(wine_df.isnull().sum())

# There are no missing values, so no imputation is needed; outliers are handled instead.
def handle_outliers_iqr(df, features, method, reference_df=None, iqr_multiplier=1.5):
    """
    Handle outliers in a DataFrame using the IQR (interquartile range) rule.

    A value is treated as an outlier when it lies below
    ``Q1 - iqr_multiplier * IQR`` or above ``Q3 + iqr_multiplier * IQR``,
    where the quartiles are computed per feature.

    Args:
        df (pd.DataFrame): DataFrame whose outliers should be handled.
            It is never modified; a processed copy is returned.
        features (list): Names of the columns to check for outliers.
        method (str): How outliers are handled ('capping' or 'removal').
            - 'capping': replace outliers with the lower/upper bound.
            - 'removal': drop every row that has an outlier in any feature.
        reference_df (pd.DataFrame, optional): DataFrame from which the
            quartiles/bounds are computed. Defaults to ``df`` itself. Pass
            the training split here when processing a test split so that no
            statistics of the test data are used (avoids data leakage).
        iqr_multiplier (float, optional): Width of the accepted range in
            units of IQR. Defaults to 1.5.

    Returns:
        pd.DataFrame: A new DataFrame with the outliers handled.

    Raises:
        ValueError: If ``method`` is neither 'capping' nor 'removal'.
    """
    # Validate up front so a bad argument fails before any computation or output.
    if method not in ('capping', 'removal'):
        raise ValueError("method는 'capping' 또는 'removal'이어야 합니다.")

    # Accept any iterable of column names (list, tuple, Index, ndarray).
    features = list(features)
    df_processed = df.copy()

    # ------------------ 1. Compute the outlier bounds ------------------
    bounds_source = df_processed if reference_df is None else reference_df
    Q1 = bounds_source[features].quantile(0.25)
    Q3 = bounds_source[features].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - iqr_multiplier * IQR
    upper_bound = Q3 + iqr_multiplier * IQR

    # ------------------ 2. Apply the chosen handling ------------------
    if method == 'capping':
        print("Capping: 이상치를 경계값으로 대체합니다.")
        for feature in features:
            # clip() returns a new column, so an integer column is cleanly
            # upcast to float when a bound is fractional, instead of relying
            # on in-place .loc assignment of an incompatible dtype.
            df_processed[feature] = df_processed[feature].clip(
                lower=lower_bound[feature], upper=upper_bound[feature]
            )
    else:
        print("Removal: 이상치를 포함하는 행을 제거합니다.")
        candidate = df_processed[features]
        # A row is dropped if ANY of its features falls outside the bounds.
        is_outlier = (
            candidate.lt(lower_bound, axis=1) | candidate.gt(upper_bound, axis=1)
        ).any(axis=1)
        df_processed = df_processed[~is_outlier]

    return df_processed

# Cap outliers in every feature column of X using the IQR rule.
# NOTE(review): the bounds here are computed from the full X, i.e. before the
# train/test split below — confirm this is acceptable with respect to data leakage.
features = wine.feature_names
X_capped = handle_outliers_iqr(X, features, 'capping')

alcohol                         0
malic_acid                      0
ash                             0
alcalinity_of_ash               0
magnesium                       0
total_phenols                   0
flavanoids                      0
nonflavanoid_phenols            0
proanthocyanins                 0
color_intensity                 0
hue                             0
od280/od315_of_diluted_wines    0
proline                         0
class                           0
dtype: int64
Capping: 이상치를 경계값으로 대체합니다.


### Split train and test data through `train_test_split`.
- test_size = 0.3, random_state = 0

In [10]:
from sklearn.model_selection import train_test_split

# Split the RAW features first. X_capped (capped with quartiles of the full
# dataset) is deliberately not used here: its bounds were derived from rows
# that end up in the test set, which leaks test information into preprocessing.
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=0, stratify=Y
)

# Fit the IQR capping bounds on the training split only (1.5 * IQR rule) ...
train_q1 = X_train_raw.quantile(0.25)
train_q3 = X_train_raw.quantile(0.75)
train_iqr = train_q3 - train_q1
cap_lower = train_q1 - 1.5 * train_iqr
cap_upper = train_q3 + 1.5 * train_iqr

# ... and apply those same bounds to both splits (bounds align on column names).
X_train = X_train_raw.clip(lower=cap_lower, upper=cap_upper, axis=1)
X_test = X_test_raw.clip(lower=cap_lower, upper=cap_upper, axis=1)

print(X_train)

     alcohol  malic_acid   ash  alcalinity_of_ash  magnesium  total_phenols  \
143    13.62        4.95  2.35               20.0       92.0           2.00   
33     13.76        1.53  2.70               19.5      132.0           2.95   
30     13.73        1.50  2.70               22.5      101.0           3.00   
34     13.51        1.80  2.65               19.0      110.0           2.35   
135    12.60        2.46  2.20               18.5       94.0           1.62   
..       ...         ...   ...                ...        ...            ...   
82     12.08        1.13  2.51               24.0       78.0           2.00   
174    13.40        3.91  2.48               23.0      102.0           1.80   
146    13.88        5.04  2.23               20.0       80.0           0.98   
74     11.96        1.09  2.30               21.0      101.0           3.38   
79     12.70        3.87  2.40               23.0      101.0           2.83   

     flavanoids  nonflavanoid_phenols  proanthocyan

In [11]:
###### Freely Preprocessing 2 : Scaling (done AFTER splitting to avoid data leakage) #######
from sklearn.preprocessing import StandardScaler

# Learn the per-feature mean and standard deviation from the training split only.
scaler = StandardScaler().fit(X_train)

# Apply the training-split statistics to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Build Models Using Various Models

In [12]:
# Alternative classifiers that could be swapped in:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Build a k-nearest-neighbours classifier (k = 5) and fit it on the scaled
# training split in one chained call (fit returns the estimator itself).
model = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, Y_train)

# Predict class labels for the scaled test features.
Y_predict = model.predict(X_test_scaled)

## Check the fitted model's performance


In [13]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the test-set predictions against the true labels.
print(classification_report(Y_test, Y_predict))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        18
           1       1.00      0.90      0.95        21
           2       0.94      1.00      0.97        15

    accuracy                           0.96        54
   macro avg       0.96      0.97      0.96        54
weighted avg       0.97      0.96      0.96        54



### Print the Accuracy.

In [14]:
# Model accuracy on the scaled test split, printed with three decimal places.
accuracy = model.score(X_test_scaled, Y_test)
print("Accuracy: {0:.3f}".format(accuracy))

Accuracy: 0.963


## Grading

- Accuracy >= 95 : **50 pt(Perfect)**
- Accuracy >= 93, < 95 : **45 pt**
- Accuracy >= 90, < 93 : **40 pt**
- Accuracy < 90 : **35 pt**