In [45]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)

# 열 이름을 지정
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 

# horsepower 열의 누락 데이터('?') 삭제하고 실수형으로 변환
df['horsepower'] = df['horsepower'].replace('?', np.nan)      # '?'을 np.nan으로 변경
df = df.dropna(subset=['horsepower'], axis=0)                 # 누락데이터 행을 삭제
df['horsepower'] = df['horsepower'].astype('float')           # 문자열을 실수형으로 변환

# np.histogram 으로 3개의 bin으로 나누는 경계 값의 리스트 구하기
count, bin_dividers = np.histogram(df['horsepower'], bins=3)

# 3개의 bin에 이름 지정
bin_names = ['저출력', '보통출력', '고출력']

# pd.cut 으로 각 데이터를 3개의 bin에 할당
df['hp_bin'] = pd.cut(x=df['horsepower'],     # 데이터 배열
                      bins=bin_dividers,      # 경계 값 리스트
                      labels=bin_names,       # bin 이름
                      include_lowest=True)    # 첫 경계값 포함

# sklern 라이브러리 불러오기
from sklearn import preprocessing    

# 전처리를 위한 encoder 객체 만들기
label_encoder = preprocessing.LabelEncoder()       # label encoder 생성
onehot_encoder = preprocessing.OneHotEncoder()     # one hot encoder 생성

# label encoder로 문자열 범주를 숫자형 범주로 변환
onehot_labeled = label_encoder.fit_transform(df['hp_bin'].head(15))  
print(onehot_labeled)
print(type(onehot_labeled))

# 2차원 행렬로 형태 변경
onehot_reshaped = onehot_labeled.reshape(len(onehot_labeled), 1) 
print(onehot_reshaped)
print(type(onehot_reshaped))

# 희소행렬로 변환
onehot_fitted = onehot_encoder.fit_transform(onehot_reshaped)
print(onehot_fitted)
print(type(onehot_fitted))

[1 1 1 1 1 0 0 0 0 0 0 1 1 0 2]
<class 'numpy.ndarray'>
[[1]
 [1]
 [1]
 [1]
 [1]
 [0]
 [0]
 [0]
 [0]
 [0]
 [0]
 [1]
 [1]
 [0]
 [2]]
<class 'numpy.ndarray'>
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 15 stored elements and shape (15, 3)>
  Coords	Values
  (0, 1)	1.0
  (1, 1)	1.0
  (2, 1)	1.0
  (3, 1)	1.0
  (4, 1)	1.0
  (5, 0)	1.0
  (6, 0)	1.0
  (7, 0)	1.0
  (8, 0)	1.0
  (9, 0)	1.0
  (10, 0)	1.0
  (11, 1)	1.0
  (12, 1)	1.0
  (13, 0)	1.0
  (14, 2)	1.0
<class 'scipy.sparse._csr.csr_matrix'>


In [4]:
# 라이브러리 불러오기
import pandas as pd
import numpy as np

# read_csv() 함수로 df 생성
df = pd.read_csv('./data/auto-mpg.csv', header=None)

# 열 이름을 지정
df.columns = ['mpg','cylinders','displacement','horsepower','weight',
              'acceleration','model year','origin','name'] 
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.00,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.00,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.00,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.00,2625.0,18.6,82,1,ford ranger


In [50]:
# 연비를 3개 구간으로 나눠서 새로운 컬럼명으로 추가
# 저연비, 보통, 고연비
# 연속형 (수치형)  --> 범주형 (category)

df.mpg
df['mpg_qcut']=pd.qcut(df.mpg, q=3, labels=['저연비','보통', '고연비'])
size,bins = np.histogram(df.mpg, bins=np.array(3))
df['mpg_cut']=pd.cut(df.mpg, bins=bins, labels=['저연비2', '보통2', '고연비2'])

df.mpg_qcut.value_counts()
# print(df.mpg_qcut.value_counts())
print(f'계급별 구간 : {bins}')

#구간별 데이터 범위 확인하는 방법 : 라벨없이 적용 후 cat.categories 하기
temp = pd.qcut(df.mpg, q=3)
temp
print(f'구간별 데이터 범위 : {temp.cat.categories}')
# IntervalIndex([(8.999, 19.0], (19.0, 26.933], (26.933, 46.6]], dtype='interval[float64, right]')

df

계급별 구간 : [ 9.         21.53333333 34.06666667 46.6       ]
구간별 데이터 범위 : IntervalIndex([(8.999, 18.733], (18.733, 26.933], (26.933, 46.6]], dtype='interval[float64, right]')


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,hp_bin,mpg_qcut,mpg_cut
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,보통출력,저연비,저연비2
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,보통출력,저연비,저연비2
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,보통출력,저연비,저연비2
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,보통출력,저연비,저연비2
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,보통출력,저연비,저연비2
...,...,...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,저출력,고연비,보통2
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,저출력,고연비,고연비2
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,저출력,고연비,보통2
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,저출력,고연비,보통2


In [52]:
pd.get_dummies(df.mpg_qcut)
pd.get_dummies(df.mpg_qcut, dtype=int)
pd.get_dummies(df.mpg_qcut, dtype=float)

Unnamed: 0,저연비,보통,고연비
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
393,0.0,0.0,1.0
394,0.0,0.0,1.0
395,0.0,0.0,1.0
396,0.0,0.0,1.0


In [53]:
pd.get_dummies(df.mpg_cut)
pd.get_dummies(df.mpg_cut, dtype=int)
pd.get_dummies(df.mpg_cut, dtype=float)

Unnamed: 0,저연비2,보통2,고연비2
0,1.0,0.0,0.0
1,1.0,0.0,0.0
2,1.0,0.0,0.0
3,1.0,0.0,0.0
4,1.0,0.0,0.0
...,...,...,...
393,0.0,1.0,0.0
394,0.0,0.0,1.0
395,0.0,1.0,0.0
396,0.0,1.0,0.0


In [None]:
%pip install scikit-learn

In [None]:
# 다중 공선성 : 회귀분석(여러변수로 결과를 예측하는 모델), 독립변수(컬럼들..)끼리 서로 너무 강하게 상관관계를 가질 때 문제가 생김

from sklearn.preprocessing import OneHotEncoder 
# encoder = OneHotEncoder()
# encoder.fit_transform(df)
# 상기 출력
#  <Compressed Sparse Row sparse matrix of dtype 'float64'
          #with 3920 stored elements and shape (392, 1067)>  1067?? 전체가 생김??
df.mpg.ndim   #출력 1차원

encoder = OneHotEncoder(sparse_output=False)  #객체

# encoder2 = OneHotEncoder(sparse_output=True)  #True가 기본이고 True로 하게되면 압축된 파일이 나와서 False해야지 값이 나옴.
# encoder2.fit_transform(df[['mpg_qcut']])

# sklearn 계열은 사용방법티 통일 ==> fit: 적용 / transform : 변환 ==> fit_transform 은 두개를 한꺼번에 실행
#모델화는 fit만 해도됨 / 유틸리티는 fit_transform
temp = encoder.fit_transform(df[['mpg_qcut']])  #2차원 데이터가 와야함
# sklearn 은 유틸리티, 머신러닝 함수 2개가 있음
cols=encoder.get_feature_names_out(['mpg_qcut'])
# 더미와 차이 : 더미는 저연비 고연비를 컬럼으로 만들었는데, 이건array(['mpg_qcut_고연비', 'mpg_qcut_보통', 'mpg_qcut_저연비'], dtype=object)

pd.DataFrame(temp, columns=cols)  
pd.concat([df.drop(columns=['mpg_qcut']), pd.DataFrame(temp, columns=cols)], axis=1)  

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,hp_bin,mpg_cut,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu,보통출력,저연비2,0.0,0.0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320,보통출력,저연비2,0.0,0.0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite,보통출력,저연비2,0.0,0.0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst,보통출력,저연비2,0.0,0.0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino,보통출력,저연비2,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,,,,,,,,,,,,0.0,0.0,1.0
330,,,,,,,,,,,,1.0,0.0,0.0
336,,,,,,,,,,,,0.0,1.0,0.0
354,,,,,,,,,,,,1.0,0.0,0.0


In [62]:
# 상기 코딩에서
# encoder = OneHotEncoder(sparse_output=True)  "True"로 하게되면 하기와 같이 !!

from sklearn.preprocessing import OneHotEncoder 

encoder = OneHotEncoder(sparse_output=True)  #객체
temp = encoder.fit_transform(df[['mpg_qcut']])  
cols=encoder.get_feature_names_out(['mpg_qcut'])
pd.DataFrame.sparse.from_spmatrix(temp, columns=cols)
pd.concat([df.drop(columns=['mpg_qcut']), pd.DataFrame.sparse.from_spmatrix(temp, columns=cols)], axis=1)  

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,hp_bin,mpg_cut,mpg_qcut_고연비,mpg_qcut_보통,mpg_qcut_저연비
0,18.0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu,보통출력,저연비2,0,0,1.0
1,15.0,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320,보통출력,저연비2,0,0,1.0
2,18.0,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite,보통출력,저연비2,0,0,1.0
3,16.0,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst,보통출력,저연비2,0,0,1.0
4,17.0,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino,보통출력,저연비2,0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,,,,,,,,,,,,0,0,1.0
330,,,,,,,,,,,,1.0,0,0
336,,,,,,,,,,,,0,1.0,0
354,,,,,,,,,,,,1.0,0,0


In [None]:
# 순차적으로 된 값..??

# get_dummy(): pandas DataFrame / Series 형태로 주어짐
    #바로 적용이 가능하도록 단순함. 
    #빠르게 확인
    #컬럼명 유지
    #데이터 탐색, 시각화, 작은데이터셋에 유리
# OneHotEncoder() : numpy array / DataFrame / class?
    #머신러닝 라이브러리에서 제공
    #머신러닝에서 파이라인을 이용해서 모델을 학습할 때 사용
    #fit, transform ==> 학습-예측데이터 일관성 유지, 큰데이터셋에 유리


In [74]:
train_df.head(2)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,hp_bin,mpg_qcut,mpg_cut
0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,보통출력,저연비,저연비2
1,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,보통출력,저연비,저연비2


In [None]:
train_df = df.drop(columns=['mpg'])
train_df.head()
# cylinders, model year, origin --> OneHot으로 변경
# OneHot 이후에 OneHot 에 대상이된 컬럼은 drop
# 제조사 컬럼에서 제조사만 추출해서 --> OneHot으로 변경
# 하나의 데이터프레임으로 결합 concat


from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
origin_cols = ['cylinders','model year', 'origin','maker']
train_df['maker'] = [n_list[0] for n_list in df['name'].str.split()]
total_onehots = []
for colname in origin_cols:
    encoder_fit_transform = encoder.fit_transform(train_df[[colname]])   #모델화는 fit만 해도됨 / 유틸리티는 fit_transform
    cols = encoder.get_feature_names_out([colname])
    total_onehots.append(pd.DataFrame.sparse.from_spmatrix(encoder_fit_transform,columns=cols))




In [81]:
total_onehots.insert(0,train_df)
new_train_df = pd.concat(total_onehots,axis=1)
new_train_df = new_train_df.drop(columns=origin_cols)
new_train_df.head()

Unnamed: 0,displacement,horsepower,weight,acceleration,name,hp_bin,mpg_qcut,mpg_cut,cylinders_3,cylinders_4,...,maker_renault,maker_saab,maker_subaru,maker_toyota,maker_toyouta,maker_triumph,maker_vokswagen,maker_volkswagen,maker_volvo,maker_vw
0,307.0,130.0,3504.0,12.0,chevrolet chevelle malibu,보통출력,저연비,저연비2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,350.0,165.0,3693.0,11.5,buick skylark 320,보통출력,저연비,저연비2,0,0,...,0,0,0,0,0,0,0,0,0,0
2,318.0,150.0,3436.0,11.0,plymouth satellite,보통출력,저연비,저연비2,0,0,...,0,0,0,0,0,0,0,0,0,0
3,304.0,150.0,3433.0,12.0,amc rebel sst,보통출력,저연비,저연비2,0,0,...,0,0,0,0,0,0,0,0,0,0
4,302.0,140.0,3449.0,10.5,ford torino,보통출력,저연비,저연비2,0,0,...,0,0,0,0,0,0,0,0,0,0


In [82]:
pd.concat(total_onehots,axis=1)

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model year,origin,name,hp_bin,mpg_qcut,...,maker_renault,maker_saab,maker_subaru,maker_toyota,maker_toyouta,maker_triumph,maker_vokswagen,maker_volkswagen,maker_volvo,maker_vw
0,8.0,307.0,130.0,3504.0,12.0,70.0,1.0,chevrolet chevelle malibu,보통출력,저연비,...,0,0,0,0,0,0,0,0,0,0
1,8.0,350.0,165.0,3693.0,11.5,70.0,1.0,buick skylark 320,보통출력,저연비,...,0,0,0,0,0,0,0,0,0,0
2,8.0,318.0,150.0,3436.0,11.0,70.0,1.0,plymouth satellite,보통출력,저연비,...,0,0,0,0,0,0,0,0,0,0
3,8.0,304.0,150.0,3433.0,12.0,70.0,1.0,amc rebel sst,보통출력,저연비,...,0,0,0,0,0,0,0,0,0,0
4,8.0,302.0,140.0,3449.0,10.5,70.0,1.0,ford torino,보통출력,저연비,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
330,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
336,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
354,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


< 머신러닝 학습 >
- 범주형 데이터 : 범위내에서 결정되는 데이터 - category 
            ==> 머신러닝이 이해하기 쉽게 데이터 가공 필요
- 연속형 데이터 : 범위가 없는 변화무쌍한 데이터 ==> 그대로 머신러닝에 줘도됨

- example
고양이, 새, 개 ==> 범주형 데이터 0과 1로 표현
고양이 [1,0,0] ==> 고양이 on 나머지 off
새 [0,1,0]
개 [0,0,1]
만약 고양이 0, 새1, 개2 이렇게 하게되면 
숫자가 증감의 의미를 가지는 것으로 머신러닝이 인지하므로, 이렇게 숫자로 할 수 없음

- 키 예측 모델
: 신발사이즈를 변수로 사용 1(작다), 2(중간), 3(크다) 라고 정의했다면,
3번 신발 신는사람은 1번 신발을 신은 사람보다 신발사이즈 변수는 3배 증가
==> 이를 머신러닝은 키도 3배 크겠다로 인식하게 됨. 
but, 실제 신발사이즈 변수는 카테고리이지, 값이 의미를 갖는 것은 아님.

- ==> 그래서 범주형 데이터를 OneHot으로 변경해서 각각 카테고리를 머신러닝이 인지하도록 변경해줌.

< OneHot을 하는 이유 >
- 영향을 막아줌


# get_dummy(): pandas DataFrame / Series 형태로 주어짐
    #바로 적용이 가능하도록 단순함. 
    #빠르게 확인
    #컬럼명 유지
    #데이터 탐색, 시각화, 작은데이터셋에 유리
# OneHotEncoder() : numpy array / DataFrame / class?
    #머신러닝 라이브러리에서 제공
    #머신러닝에서 파이라인을 이용해서 모델을 학습할 때 사용
    #fit, transform ==> 학습-예측데이터 일관성 유지, 큰데이터셋에 유리



In [88]:
from sklearn.linear_model import LinearRegression #선형회귀2: 데이터를 그래프화해서 모르는 데이터를 찾아내는 과정? 
# 작은: 0 보통 1 크 2
data = {
    'size' : [0, 0, 0, 1, 1,1, 2,2,2],
    'height' : [120, 121, 119, 122, 123, 121, 125, 124, 126]
}

df = pd.DataFrame(data)
#실제평균 : 참고용
df.groupby('size')['height'].mean()

size
0    120.0
1    122.0
2    125.0
Name: height, dtype: float64

In [None]:
# 머신러닝 : OneHot을 안하고 했을 때!!! 
model_lr = LinearRegression()
X = df.drop(columns=['height'])
Y = df['height']
X.shape, Y.shape, type(Y)
model_ly = LinearRegression()
model_ly.fit(X,Y)
# 예측
predicted_y0 = model_ly.predict([[0]])[0]   #X(size) 형태와 동일하게 데이터? 줌..// 2차원으로 줘야함 ([[0]])
predicted_y1 = model_ly.predict([[1]])[0]
predicted_y2 = model_ly.predict([[2]])[0]
predicted_y0, predicted_y1, predicted_y2



(np.float64(119.83333333333333),
 np.float64(122.33333333333333),
 np.float64(124.83333333333333))

In [None]:
# 머신러닝 : OneHot 적용 : 상기 df.groupby('size')['height'].mean() 실제 평균값과 동일. 차이가 매우 근소하게 할수잇음!
df_encoded = pd.get_dummies(df, columns=['size'])
X_onehot = df_encoded[['size_0', 'size_1']].to_numpy()
model_onehot=LinearRegression()
model_onehot.fit(X_onehot, Y)   
#sklearn 모델은 fit만 해도됨 / sklearn 유틸리티는 fit_transform
#Scikit-learn OneHotEncoder → fit + transform 필요.
#Scikit-learn LinearRegression().fit() → 여기서의 fit은 모델 학습 단계.

#예측
predicted_onehot_y0 = model_onehot.predict([[1,0]])[0]   
predicted_onehot_y1 = model_onehot.predict([[0,1]])[0]
predicted_onehot_y2 = model_onehot.predict([[0,0]])[0]
predicted_onehot_y0, predicted_onehot_y1, predicted_onehot_y2

(np.float64(120.0), np.float64(122.0), np.float64(125.0))

In [87]:
pd.get_dummies(df.height)


Unnamed: 0,119,120,121,122,123,124,125,126
0,False,True,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False
2,True,False,False,False,False,False,False,False
3,False,False,False,True,False,False,False,False
4,False,False,False,False,True,False,False,False
5,False,False,True,False,False,False,False,False
6,False,False,False,False,False,False,True,False
7,False,False,False,False,False,True,False,False
8,False,False,False,False,False,False,False,True
