# I. 빅데이터 분석 과정


## 1. 빅데이터 분석과정 이해

1. 필요한 패키지 import
    - numpy, pandas, scikit-learn, matplotlib

In [2]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [1]:
from sklearn.tree import DecisionTreeClassifier  # decision-tree classification model
from sklearn.model_selection import train_test_split  # splits data into training and test sets

2. 데이터 불러오기

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

3. 데이터 살펴보기

In [4]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [10]:
df.shape

(150, 5)

In [11]:
df.info()
print()
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB



Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


4. 데이터 전처리

In [12]:
# Simple encoding of the categorical species label as integers.
# The mapped column is assigned back to df instead of calling
# replace(..., inplace=True) on df['species']: an in-place call on a column
# selection is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.
species_codes = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
df['species'] = df['species'].map(species_codes)
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


5. 분석 데이터셋 준비

In [13]:
# Independent variables: the four flower measurements.
X = df[[
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
]]
# Dependent variable: the species label.
y = df['species']

# test_size is the fraction held out for testing; random_state is the random seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=11,
)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


6. 데이터분석 수행

- 지도학습 - 분류
    - 의사결정나무(분류), KNN, 서포트벡터머신(SVM), 로지스틱 회귀분석, 랜덤포레스트, 인공신경망
- 지도학습 - 회귀(예측)
    - 선형회귀분석, 다중회귀분석, 의사결정나무(회귀)
- 비지도학습
    - 군집분석(Clustering), 연관분석(Association Analysis), 인공신경망(Neural Networks)

In [15]:
dt = DecisionTreeClassifier(random_state = 11)  # create the decision-tree classifier object
dt.fit(X_train, y_train)  # train it on the training features (X_train) and labels (y_train)

In [16]:
from sklearn.metrics import accuracy_score  # accuracy metric from scikit-learn

# Predict labels for the held-out features with the trained tree.
pred = dt.predict(X_test)

# Compare the true test labels with the predictions and show the accuracy.
acc = accuracy_score(y_test, pred)
print(acc)

0.9333333333333333


# II. 지도학습 - 분류

> 기존에 존재하는 데이터들간의 분류 카테고리를 학습, 파악하고 새로운 데이터에 대한 분류 카테고리를 판별하는 과정

- 의사결정나무, KNN, 로지스틱 회귀, 랜덤 포레스트, 나이브베이즈, 신경망, 서포트벡터머신

## 1. 의사결정나무를 이용한 분류 문제 해결

> 의사결정을 위한 규칙을 나무 모양으로 조합하여 목표변수(종속변수)에 대한 분류를 수행

- 과대적합의 위험이 높으므로 적절히 조절
- 랜덤포레스트 : 부트스트래핑 샘플링으로 의사결정나무들을 생성해 앙상블 학습하여 숲을 형성한 것

1) 타이타닉 데이터셋 탑승자 생존여부 예측

In [17]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [19]:
# 데이터 불러오기
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

In [20]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [21]:
df.info()  # 결측치 여부 확인

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [22]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [23]:
# Preprocessing: impute missing values.
# Each filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['col']: the in-place call on a column
# selection is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.
d_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(d_mean)  # replace missing Age values with the column mean

d_mode = df['Embarked'].mode()[0]  # mode() returns a Series; take its first entry
df['Embarked'] = df['Embarked'].fillna(d_mode)  # replace missing Embarked values with the most frequent value

In [24]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the two categorical columns, using a fresh encoder for each.
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])  # Sex becomes 0/1
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])  # Embarked becomes integer codes

In [25]:
df['FamilySize'] = df['SibSp'] + df['Parch']  # derived feature: number of siblings/spouses + number of parents/children

In [26]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize
0,1,0,3,"Braund, Mr. Owen Harris",1,22.000000,1,0,A/5 21171,7.2500,,2,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,38.000000,1,0,PC 17599,71.2833,C85,0,1
2,3,1,3,"Heikkinen, Miss. Laina",0,26.000000,0,0,STON/O2. 3101282,7.9250,,2,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,35.000000,1,0,113803,53.1000,C123,2,1
4,5,0,3,"Allen, Mr. William Henry",1,35.000000,0,0,373450,8.0500,,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",1,27.000000,0,0,211536,13.0000,,2,0
887,888,1,1,"Graham, Miss. Margaret Edith",0,19.000000,0,0,112053,30.0000,B42,2,0
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",0,29.699118,1,2,W./C. 6607,23.4500,,2,3
889,890,1,1,"Behr, Mr. Karl Howell",1,26.000000,0,0,111369,30.0000,C148,0,0


In [28]:
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

In [29]:
X

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,1,22.000000,7.2500,2,1
1,1,0,38.000000,71.2833,0,1
2,3,0,26.000000,7.9250,2,0
3,1,0,35.000000,53.1000,2,1
4,3,1,35.000000,8.0500,2,0
...,...,...,...,...,...,...
886,2,1,27.000000,13.0000,2,0
887,1,0,19.000000,30.0000,2,0
888,3,0,29.699118,23.4500,2,3
889,1,1,26.000000,30.0000,0,0


In [30]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

In [32]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(712, 6)
(179, 6)
(712,)
(179,)


In [33]:
dt = DecisionTreeClassifier(random_state = 11)
dt.fit(X_train, y_train)

In [34]:
pred = dt.predict(X_test)

In [35]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.7877094972067039


In [37]:
# 오차행렬을 이용한 성능 평가
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, pred)
print(mat)

[[98 20]
 [18 43]]


In [38]:
# 평가지표 계산
from sklearn.metrics import classification_report
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.84      0.83      0.84       118
           1       0.68      0.70      0.69        61

    accuracy                           0.79       179
   macro avg       0.76      0.77      0.77       179
weighted avg       0.79      0.79      0.79       179



## 2. KNN을 이용한 분류 문제 해결

> 새로운 데이터와 거리가 가장 가까운 K개의 이웃 데이터를 찾아, 그 이웃들이 가장 많이 속한 범주로 분류

In [39]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

In [40]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

In [41]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [43]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [44]:
from sklearn.preprocessing import MinMaxScaler  # min-max normalization: rescales values into the 0-1 range
scaler = MinMaxScaler()
# Scale every feature column. The original cell repeated the sepal_length line
# four times, so sepal_width, petal_length and petal_width were left unscaled
# even though KNN compares samples by distance across all four features.
# df[['col']] (double brackets) yields a DataFrame, whereas df['col'] yields a
# Series.
# NOTE: the outputs recorded after this cell were produced before this fix and
# may differ when the notebook is re-run.
df[['sepal_length']] = scaler.fit_transform(df[['sepal_length']])
df[['sepal_width']] = scaler.fit_transform(df[['sepal_width']])
df[['petal_length']] = scaler.fit_transform(df[['petal_length']])
df[['petal_width']] = scaler.fit_transform(df[['petal_width']])

In [45]:
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)  # 훈련:테스트 = 80:20, 난수 시드 11

In [46]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


In [47]:
knn = KNeighborsClassifier(n_neighbors = 3)  # create the KNN classifier; n_neighbors is the number of nearest neighbours (K) consulted per prediction, not a number of groups
knn.fit(X_train, y_train)  # train on the training data

In [48]:
pred = knn.predict(X_test)  # 학습된 모델로 y값 예측

In [49]:
# y_test값과 비교해 정확도 측정
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.9333333333333333


In [50]:
# 오차행렬을 이용한 성능 평가, 잘못 분류한 데이터 2개
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, pred)
print(mat)

[[ 9  0  0]
 [ 0 10  0]
 [ 0  2  9]]


In [51]:
# 평가지표 계산
from sklearn.metrics import classification_report
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00         9
  versicolor       0.83      1.00      0.91        10
   virginica       1.00      0.82      0.90        11

    accuracy                           0.93        30
   macro avg       0.94      0.94      0.94        30
weighted avg       0.94      0.93      0.93        30



## 3. SVM을 이용한 분류 문제 해결

> 데이터를 분류하는 가장 큰 폭을 가진 경계를 찾는 알고리즘

* 원핫인코딩 : 범주형 데이터의 각 범주를 별도의 컬럼으로 만들고, 해당 범주이면 1, 아니면 0으로 표시하여 숫자형으로 변환하는 방법

In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn import svm
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

In [3]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [16]:
# Impute missing values and build the derived feature.
# Each filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['col']: the in-place call on a column
# selection is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.

# Replace missing Age values with the column mean.
d_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(d_mean)

# Replace missing Embarked values with the most frequent value.
d_mode = df['Embarked'].mode()[0]  # mode() alone returns more than the single value, so take the first entry
df['Embarked'] = df['Embarked'].fillna(d_mode)

# Derived feature: FamilySize = SibSp + Parch.
df['FamilySize'] = df['SibSp'] + df['Parch']

In [17]:
# One-hot encode the two categorical columns with pd.get_dummies().
onehot_sex = pd.get_dummies(df['Sex'])
onehot_embarked = pd.get_dummies(df['Embarked'])

# Append both indicator tables to df column-wise, Sex columns first.
df = pd.concat([df, onehot_sex, onehot_embarked], axis=1)

In [13]:
onehot_embarked  # pd.get_dummies 결과 예시

Unnamed: 0,C,Q,S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1
...,...,...,...
886,0,0,1
887,0,0,1
888,0,0,1
889,1,0,0


In [18]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,FamilySize,female,male,C,Q,S
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S,1,0,1,0,0,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C,1,1,0,1,0,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S,0,1,0,0,0,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S,1,1,0,0,0,1
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S,0,0,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S,0,0,1,0,0,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S,0,1,0,0,0,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S,3,1,0,0,0,1
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C,0,0,1,1,0,0


In [20]:
X = df[['Pclass', 'Age', 'Fare', 'FamilySize', 'female', 'male', 'C', 'Q', 'S']]
y = df['Survived']

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

In [22]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(623, 9)
(268, 9)
(623,)
(268,)


In [23]:
# Create the SVM classifier and train it on the training split
sv = svm.SVC(kernel = 'rbf')  # kernel options include 'rbf' (Radial Basis Function), 'linear', 'poly' (polynomial), 'sigmoid'
sv.fit(X_train, y_train)

In [24]:
pred = sv.predict(X_test)

In [25]:
# 정확도 측정
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.7238805970149254


In [26]:
# 오차 행렬 성능 평가
from sklearn.metrics import confusion_matrix
mat = confusion_matrix(y_test, pred)
print(mat)

[[167   7]
 [ 67  27]]


In [27]:
# Compute evaluation metrics (the printed report lists precision, recall, f1-score and support)
from sklearn.metrics import classification_report
rpt = classification_report(y_test, pred)
print(rpt)

              precision    recall  f1-score   support

           0       0.71      0.96      0.82       174
           1       0.79      0.29      0.42        94

    accuracy                           0.72       268
   macro avg       0.75      0.62      0.62       268
weighted avg       0.74      0.72      0.68       268



In [None]:
# Parameter tuning examples; each assignment rebinds sv, so only the last model remains
sv = svm.SVC(kernel ='rbf')

# C: how much error is tolerated (the larger C is, the harder the margin)
# gamma: how flexibly the boundary is drawn (how sensitively it reacts to the training data)
# NOTE(review): scikit-learn documents gamma as the coefficient for the 'rbf', 'poly' and
# 'sigmoid' kernels, so it presumably has no effect with kernel='linear' — confirm.
sv = svm.SVC(kernel = 'linear', C = 1, gamma = 0.1)

sv = svm.SVC(kernel = 'rbf', C = 0.1, gamma = 0.1)

## 4. 로지스틱 회귀를 이용한 분류 문제 해결

> 가능성이 더 높은 범주에 속하는 것으로 분류하는 이진 분류 모델, 시그모이드 함수 사용
- LogisticRegression 클래스 내 penalty 매개변수에서 규제 유형 선택 : L1(라쏘 방식), L2(릿지 방식)
- LogisticRegression 클래스 내 C 매개변수에서 규제 강도 설정(작을 수록 강해짐)
- predict_proba() : 각 분류 항목에 속할 확률 확인
- decision_function() : 모델이 학습한 선형 방정식으로 계산한 각 샘플의 결정 함수 값 확인 (회귀계수와 절편은 coef_, intercept_ 속성으로 확인)

In [1]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/iris.csv')

In [3]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
df.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df[['sepal_length']] = scaler.fit_transform(df[['sepal_length']])
df[['sepal_width']] = scaler.fit_transform(df[['sepal_width']])
df[['petal_length']] = scaler.fit_transform(df[['petal_length']])
df[['petal_width']] = scaler.fit_transform(df[['petal_width']])

In [7]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625000,0.067797,0.041667,setosa
1,0.166667,0.416667,0.067797,0.041667,setosa
2,0.111111,0.500000,0.050847,0.041667,setosa
3,0.083333,0.458333,0.084746,0.041667,setosa
4,0.194444,0.666667,0.067797,0.041667,setosa
...,...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667,virginica
146,0.555556,0.208333,0.677966,0.750000,virginica
147,0.611111,0.416667,0.711864,0.791667,virginica
148,0.527778,0.583333,0.745763,0.916667,virginica


In [9]:
X = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = df['species']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

In [10]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


In [11]:
# LogisticRegression 객체 생성
lr = LogisticRegression()
lr.fit(X_train, y_train)  # 학습

In [12]:
pred = lr.predict(X_test)

In [13]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.8333333333333334


## 5. 랜덤 포레스트를 이용한 분류 문제 해결

> 다수의 의사결정 트리들을 배깅(같은 종류의 알고리즘 모델을 여러개 결합하여 예측하는 방법)하여 분류 또는 회귀를 수행하는 앙상블 기법
- 각 트리는 전체 학습 데이터 중 서로 다른 데이터를 샘플링하여 일부 데이터를 제외한 후 최적의 특징을 찾아 트리를 분기

In [14]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [15]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/titanic.csv')

In [16]:
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [18]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [19]:
# Impute missing values.
# Each filled column is assigned back to df instead of calling
# fillna(..., inplace=True) on df['col']: the in-place call on a column
# selection is chained assignment, which pandas deprecates and which leaves df
# unchanged when Copy-on-Write is enabled.

# Replace missing Age values with the column mean.
d_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(d_mean)

# Replace missing Embarked values with the most frequent value.
d_mode = df['Embarked'].mode()[0]  # mode() returns a Series; take its first entry
df['Embarked'] = df['Embarked'].fillna(d_mode)

In [21]:
from sklearn.preprocessing import LabelEncoder

# Label-encode the Sex column.
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])

# Label-encode the Embarked column.
df['Embarked'] = LabelEncoder().fit_transform(df['Embarked'])

In [22]:
df['FamilySize'] = df['SibSp'] + df['Parch']

In [24]:
# 독립변수와 종속변수 준비
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']]
y = df['Survived']

In [25]:
X

Unnamed: 0,Pclass,Sex,Age,Fare,Embarked,FamilySize
0,3,1,22.000000,7.2500,2,1
1,1,0,38.000000,71.2833,0,1
2,3,0,26.000000,7.9250,2,0
3,1,0,35.000000,53.1000,2,1
4,3,1,35.000000,8.0500,2,0
...,...,...,...,...,...,...
886,2,1,27.000000,13.0000,2,0
887,1,0,19.000000,30.0000,2,0
888,3,0,29.699118,23.4500,2,3
889,1,1,26.000000,30.0000,0,0


In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 11)

In [27]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(712, 6)
(179, 6)
(712,)
(179,)


In [28]:
# n_estimators: number of trees in the ensemble; max_depth: maximum depth of each decision tree
rf = RandomForestClassifier(n_estimators = 50, max_depth = 3, random_state = 20)
rf.fit(X_train, y_train)  # train the forest on the training split

In [29]:
pred = rf.predict(X_test)

In [30]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pred)
print(acc)

0.8603351955307262


# III. 지도학습 - 회귀(예측)

> 분류는 예측값이 이산형, 회귀는 예측값이 연속형 숫자 값

## 1. 단순 선형 회귀분석을 이용한 예측 문제 해결

- 최소제곱법 : 잔차 제곱의 합이 최소가 되게 하는 회귀계수를 구하는 방법
- `y = ax + b` : a, b가 회귀계수
- 잔차 : 실제값과 예측한 값의 차이

In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/auto-mpg.csv')

In [3]:
df

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model-year
0,18.0,8,307.0,130.0,3504,12.0,70
1,15.0,8,350.0,165.0,3693,11.5,70
2,18.0,8,318.0,150.0,3436,11.0,70
3,16.0,8,304.0,150.0,3433,12.0,70
4,17.0,8,302.0,140.0,3449,10.5,70
...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790,15.6,82
394,44.0,4,97.0,52.0,2130,24.6,82
395,32.0,4,135.0,84.0,2295,11.6,82
396,28.0,4,120.0,79.0,2625,18.6,82


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    396 non-null    float64
 4   weight        398 non-null    int64  
 5   acceleration  398 non-null    float64
 6   model-year    398 non-null    int64  
dtypes: float64(4), int64(3)
memory usage: 21.9 KB


In [7]:
# Correlation analysis between variables: above 0.7 is read as a strong positive
# correlation, below -0.7 as a strong negative correlation
corr = df.corr(method = 'pearson')  # use the Pearson coefficient
print(corr)

                   mpg  cylinders  displacement  horsepower    weight  \
mpg           1.000000  -0.775396     -0.804203   -0.777575 -0.831741   
cylinders    -0.775396   1.000000      0.950721    0.843751  0.896017   
displacement -0.804203   0.950721      1.000000    0.897787  0.932824   
horsepower   -0.777575   0.843751      0.897787    1.000000  0.864350   
weight       -0.831741   0.896017      0.932824    0.864350  1.000000   
acceleration  0.420289  -0.505419     -0.543684   -0.687241 -0.417457   
model-year    0.579267  -0.348746     -0.370164   -0.420697 -0.306564   

              acceleration  model-year  
mpg               0.420289    0.579267  
cylinders        -0.505419   -0.348746  
displacement     -0.543684   -0.370164  
horsepower       -0.687241   -0.420697  
weight           -0.417457   -0.306564  
acceleration      1.000000    0.288137  
model-year        0.288137    1.000000  


In [8]:
df = df.dropna(axis = 0)  # drop the rows that contain missing values

In [10]:
# 독립변수, 종속변수 설정
X = df[['weight']]
y = df['mpg']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(316, 1)
(80, 1)
(316,)
(80,)


In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [14]:
# The fitted regression results are stored as attributes of the model object
print('기울기 a:', lr.coef_)  # lr.coef_: slope of the regression equation (a)
print('y절편 b:', lr.intercept_)  # lr.intercept_: y-intercept (b)

기울기 a: [-0.00774371]
y절편 b: 46.62501834798047


In [16]:
# 예측값
pred = lr.predict(X_test)

In [17]:
# 선형회귀분석의 평가는 결정계수인 R^2 점수로 정확도 판단
from sklearn.metrics import r2_score
score = r2_score(y_test, pred)
print(score)

0.7015633872576372


In [18]:
# train 데이터로 할 수도 있다
pred = lr.predict(X_train)

from sklearn.metrics import r2_score
score = r2_score(y_train, pred)
print(score)

0.6875735975346924


In [19]:
# horsepower와 mpg의 관계에 대해
import numpy as np
import pandas as pd
import sklearn

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/auto-mpg.csv')

df = df.dropna(axis = 0)

X = df[['horsepower']]
y = df['mpg']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)

lr = LinearRegression()
lr.fit(X_train, y_train)

print('기울기 a: ', lr.coef_)
print('y절편 b: ', lr.intercept_)

pred = lr.predict(X_test)

from sklearn.metrics import r2_score
score = r2_score(y_test, pred)
print(score)

기울기 a:  [-0.16035108]
y절편 b:  40.313418327064824
0.6039842414538836


## 2. 다중 선형 회귀분석을 이용한 예측 문제 해결

독립변수가 두 개 이상이고, 종속변수가 하나인 선형 회귀분석

- 주택 중위가치 영향 모델

In [20]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [21]:
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/housing.csv')

In [22]:
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [24]:
df = df.dropna(axis = 0)  # drop the rows that contain missing values

df = df.drop('ocean_proximity', axis = 1)  # remove the categorical column

In [25]:
corr = df.corr(method = 'pearson')
print(corr)

                    longitude  latitude  housing_median_age  total_rooms  \
longitude            1.000000 -0.924616           -0.109357     0.045480   
latitude            -0.924616  1.000000            0.011899    -0.036667   
housing_median_age  -0.109357  0.011899            1.000000    -0.360628   
total_rooms          0.045480 -0.036667           -0.360628     1.000000   
total_bedrooms       0.069608 -0.066983           -0.320451     0.930380   
population           0.100270 -0.108997           -0.295787     0.857281   
households           0.056513 -0.071774           -0.302768     0.918992   
median_income       -0.015550 -0.079626           -0.118278     0.197882   
median_house_value  -0.045398 -0.144638            0.106432     0.133294   

                    total_bedrooms  population  households  median_income  \
longitude                 0.069608    0.100270    0.056513      -0.015550   
latitude                 -0.066983   -0.108997   -0.071774      -0.079626   
housing_

In [28]:
X = df.drop('median_house_value', axis = 1)  # independent variables: every column except the median house value
y = df['median_house_value']  # dependent variable: the median house value

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(14303, 8)
(6130, 8)
(14303,)
(6130,)


In [31]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [33]:
print('기울기:', lr.coef_)
print('y절편 :', lr.intercept_)

기울기: [-4.21262308e+04 -4.20623763e+04  1.18784999e+03 -8.57874086e+00
  1.18123421e+02 -3.55751755e+01  3.73676747e+01  4.03297253e+04]
y절편 : -3530241.307796566


In [34]:
pred = lr.predict(X_test)

In [35]:
# Evaluate the test-set predictions with R^2 (coefficient of determination).
from sklearn.metrics import r2_score
score = r2_score(y_test, pred)
print(score)

0.6445130291082337


In [37]:
# Score the training split as well, to compare against the test-set R^2.
# NOTE: this rebinds `pred`, so the test-set predictions are no longer
# available under that name after this cell runs.
pred = lr.predict(X_train)

# Re-import is redundant if the previous evaluation cell was run; kept so
# this cell does not depend on that cell for the name.
from sklearn.metrics import r2_score
score = r2_score(y_train, pred)
print(score)

0.6334125389213838


## 3. 의사결정나무를 이용한 예측 문제 해결

> 분류처럼 각 항목의 범주를 예측하는 것이 아니라, 연속적인 수치 값 자체를 예측하는 것(회귀)
- MSE(평균제곱오차) : 오차(실제값 − 예측값)의 제곱에 대해 평균을 취한 값으로, 예측값의 정확성을 측정하며 값이 작을수록 예측이 정확함

In [38]:
import numpy as np
import pandas as pd
import sklearn

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [39]:
# Load the housing CSV directly from GitHub (requires network access).
df = pd.read_csv('https://raw.githubusercontent.com/YoungjinBD/dataset/main/housing.csv')

In [40]:
# Display the loaded DataFrame as the cell's output.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [41]:
# Print column dtypes and non-null counts; useful for spotting columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


In [42]:
# Remove every row containing a missing value, then discard the
# categorical 'ocean_proximity' column, in a single chained expression.
df = df.dropna(axis=0).drop(columns='ocean_proximity')

In [43]:
# Correlation analysis: Pearson correlation matrix between every pair of the remaining columns.
corr = df.corr(method = 'pearson')
print(corr)

                    longitude  latitude  housing_median_age  total_rooms  \
longitude            1.000000 -0.924616           -0.109357     0.045480   
latitude            -0.924616  1.000000            0.011899    -0.036667   
housing_median_age  -0.109357  0.011899            1.000000    -0.360628   
total_rooms          0.045480 -0.036667           -0.360628     1.000000   
total_bedrooms       0.069608 -0.066983           -0.320451     0.930380   
population           0.100270 -0.108997           -0.295787     0.857281   
households           0.056513 -0.071774           -0.302768     0.918992   
median_income       -0.015550 -0.079626           -0.118278     0.197882   
median_house_value  -0.045398 -0.144638            0.106432     0.133294   

                    total_bedrooms  population  households  median_income  \
longitude                 0.069608    0.100270    0.056513      -0.015550   
latitude                 -0.066983   -0.108997   -0.071774      -0.079626   
housing_

In [44]:
# X: independent variables (every column except the median house value).
# y: dependent variable (the median house value to be predicted).
X = df.drop('median_house_value', axis = 1)
y = df['median_house_value']

In [46]:
# Hold out 30% of the rows as the test set; random_state pins the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [47]:
# Report the shape of each split: train/test features, then train/test targets.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(14303, 8)
(6130, 8)
(14303,)
(6130,)


In [48]:
# Decision-tree regressor capped at depth 3, with a fixed random_state,
# fitted on the training split.
dtr = DecisionTreeRegressor(max_depth = 3, random_state = 42)
dtr.fit(X_train, y_train)

In [49]:
# Predict the target for the held-out test rows.
pred = dtr.predict(X_test)

In [50]:
# Evaluate performance on the test split with MSE (mean squared error).
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, pred)
print(mse)

6793101269.876856


In [52]:
# Evaluate performance on the training split as well, to compare with the test MSE.
# NOTE: this rebinds `pred`, so the test-set predictions are no longer
# available under that name after this cell runs.
pred = dtr.predict(X_train)

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_train, pred)
print(mse)

6684086804.552605


## 4. 랜덤 포레스트를 이용한 예측 문제 해결