In [1]:
!pip install category_encoders

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd 
import numpy as np 

import os
import sys

import matplotlib.pyplot as plt
import seaborn as sns
import time

import lightgbm as lgbm
import sklearn
from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD, PCA, FastICA, FactorAnalysis, KernelPCA, DictionaryLearning
from sklearn.decomposition import IncrementalPCA, LatentDirichletAllocation,MiniBatchSparsePCA, SparsePCA

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer

In [4]:
df = pd.read_csv('/content/Regression_data.csv')
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [5]:
def check_missing_col(dataframe):
    missing_col = []
    counted_missing_col = 0
    for i, col in enumerate(dataframe.columns):
        missing_values = sum(dataframe[col].isna())
        is_missing = True if missing_values >= 1 else False
        if is_missing:
            counted_missing_col += 1
            print(f'결측치가 있는 컬럼은: {col}입니다')
            print(f'해당 컬럼에 총 {missing_values}개의 결측치가 존재합니다.')
            missing_col.append([col, dataframe[col].dtype])
    if counted_missing_col == 0:
        print('결측치가 존재하지 않습니다')
    return missing_col

missing_col = check_missing_col(df)

결측치가 존재하지 않습니다


In [6]:
#라벨인코딩을 하기 위함 dictionary map 생성 함수
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i+1  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    print(label_maps)
    return label_maps

In [7]:
# 각 범주형 변수에 인코딩 값을 부여하는 함수
def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

In [8]:
train_le = make_label_map(df[['Sex']])
Sex_df = label_encoder(df[['Sex']], train_le)
df['Sex'] = Sex_df[['Sex']]

{'Sex': {'unknown': 0, 'M': 1, 'F': 2, 'I': 3}}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[col] = dataframe[col].map(label_map[col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.


In [9]:
df.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,2,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,1,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,3,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [10]:
X = df.drop(['Rings'], axis = 1)
y = df['Rings']

In [11]:
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=42)

In [12]:
X_train.shape,X_test.shape

((3341, 8), (836, 8))

In [13]:
X_train.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
4038,3,0.55,0.445,0.125,0.672,0.288,0.1365,0.21
1272,3,0.475,0.355,0.1,0.5035,0.2535,0.091,0.14
3384,2,0.305,0.225,0.07,0.1485,0.0585,0.0335,0.045
3160,3,0.275,0.2,0.065,0.1165,0.0565,0.013,0.035
3894,1,0.495,0.38,0.135,0.6295,0.263,0.1425,0.215


In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
model = RandomForestRegressor() # 모델을 객체에 할당

model.fit(X_train, y_train) # 모델 학습

In [16]:
Sex_df = label_encoder(X_test[['Sex']], train_le)
X_test['Sex'] = Sex_df[['Sex']]

X_test.head()

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight
866,1,0.605,0.455,0.16,1.1035,0.421,0.3015,0.325
1483,1,0.59,0.44,0.15,0.8725,0.387,0.215,0.245
599,2,0.56,0.445,0.195,0.981,0.305,0.2245,0.335
1702,2,0.635,0.49,0.17,1.2615,0.5385,0.2665,0.38
670,1,0.475,0.385,0.145,0.6175,0.235,0.108,0.215


In [17]:
prediction = model.predict(X_test)

In [18]:
print(prediction[0])

12.16


In [19]:
print(model.score(X_train, y_train))

0.9375239401512313


In [20]:
print(model.score(X_test, y_test))

0.5399298636900028
