In [84]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [85]:
launch_data = pd.read_csv('data/RocketLaunchDataCSV.csv')
launch_data.head()

Unnamed: 0,Name,Date,Time (East Coast),Location,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,...,Max Wind Speed,Visibility,Wind Speed at Launch Time,Hist Ave Max Wind Speed,Hist Ave Visibility,Sea Level Pressure,Hist Ave Sea Level Pressure,Day Length,Condition,Notes
0,,04-Dec-58,,Cape Canaveral,,,75.0,68.0,71.0,,...,16.0,15.0,,,,30.22,,10:26,Cloudy,
1,,05-Dec-58,,Cape Canaveral,,,78.0,70.0,73.39,,...,14.0,10.0,,,,30.2,,10:26,Cloudy,
2,Pioneer 3,06-Dec-58,1:45,Cape Canaveral,Uncrewed,Y,73.0,0.0,60.21,62.0,...,15.0,10.0,11.0,,,30.25,,10:25,Cloudy,
3,,07-Dec-58,,Cape Canaveral,,,76.0,57.0,66.04,,...,10.0,10.0,,,,30.28,,10:25,Partly Cloudy,
4,,08-Dec-58,,Cape Canaveral,,,79.0,60.0,70.52,,...,12.0,10.0,,,,30.23,,12:24,Partly Cloudy,


In [86]:
launch_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 26 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Name                          60 non-null     object 
 1   Date                          300 non-null    object 
 2   Time (East Coast)             59 non-null     object 
 3   Location                      300 non-null    object 
 4   Crewed or Uncrewed            60 non-null     object 
 5   Launched?                     60 non-null     object 
 6   High Temp                     299 non-null    float64
 7   Low Temp                      299 non-null    float64
 8   Ave Temp                      299 non-null    float64
 9   Temp at Launch Time           59 non-null     float64
 10  Hist High Temp                299 non-null    float64
 11  Hist Low Temp                 299 non-null    float64
 12  Hist Ave Temp                 299 non-null    float64
 13  Perci

In [87]:
launch_data['Launched?'].value_counts()
launch_data_1 = launch_data.copy()

### 의사 결정 나무

In [88]:
# 타겟 추출
dfy = launch_data_1['Launched?'].copy()
dfy

0      NaN
1      NaN
2        Y
3      NaN
4      NaN
      ... 
295    NaN
296    NaN
297      Y
298    NaN
299    NaN
Name: Launched?, Length: 300, dtype: object

In [89]:
dfy.fillna('N', inplace=True)
dfy

0      N
1      N
2      Y
3      N
4      N
      ..
295    N
296    N
297    Y
298    N
299    N
Name: Launched?, Length: 300, dtype: object

In [90]:
# 피쳐 추출
feature_list = ['High Temp','Low Temp','Ave Temp','Temp at Launch Time','Max Wind Speed','Visibility','Wind Speed at Launch Time']

dfx = launch_data_1[feature_list].copy()
dfx

Unnamed: 0,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Max Wind Speed,Visibility,Wind Speed at Launch Time
0,75.0,68.0,71.00,,16.0,15.0,
1,78.0,70.0,73.39,,14.0,10.0,
2,73.0,0.0,60.21,62.0,15.0,10.0,11.0
3,76.0,57.0,66.04,,10.0,10.0,
4,79.0,60.0,70.52,,12.0,10.0,
...,...,...,...,...,...,...,...
295,87.0,71.0,79.36,,20.0,10.0,
296,86.0,72.0,79.50,,18.0,10.0,
297,87.0,75.0,79.69,80.0,16.0,10.0,7.0
298,87.0,72.0,79.70,,13.0,10.0,


In [91]:
dfx['Temp at Launch Time'] = dfx['Temp at Launch Time'].fillna(dfx['Temp at Launch Time'].mean())
dfx['Wind Speed at Launch Time'] = dfx['Wind Speed at Launch Time'].fillna(dfx['Wind Speed at Launch Time'].mean())
dfx.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 7 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   High Temp                  299 non-null    float64
 1   Low Temp                   299 non-null    float64
 2   Ave Temp                   299 non-null    float64
 3   Temp at Launch Time        300 non-null    float64
 4   Max Wind Speed             299 non-null    float64
 5   Visibility                 299 non-null    float64
 6   Wind Speed at Launch Time  300 non-null    float64
dtypes: float64(7)
memory usage: 16.5 KB


In [100]:
# 타겟 변수 인코딩
from sklearn.preprocessing import LabelEncoder
label_enc = LabelEncoder()
dfy = label_enc.fit_transform(dfy)
dfy

array([0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0])

In [101]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(dfx, dfy, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((240, 7), (60, 7), (240,), (60,))

In [102]:
dfx['High Temp'] = dfx['High Temp'].fillna(dfx['High Temp'].mean())
dfx['Low Temp'] = dfx['Low Temp'].fillna(dfx['Low Temp'].mean())
dfx['Ave Temp'] = dfx['Ave Temp'].fillna(dfx['Ave Temp'].mean())
dfx['Max Wind Speed'] = dfx['Max Wind Speed'].fillna(dfx['Max Wind Speed'].mean())
dfx['Visibility'] = dfx['Visibility'].fillna(dfx['Visibility'].mean())

In [92]:
launch_data['Launched?'] = launch_data['Launched?'].fillna('N')
launch_data['Crewed or Uncrewed'] = launch_data['Crewed or Uncrewed'].fillna('Uncrewed')
launch_data['Wind Direction'] = launch_data['Wind Direction'].fillna('Unknown')
launch_data['Condition'] = launch_data['Condition'].fillna('Fair')
launch_data.fillna(0, inplace=True)
launch_data.head()

Unnamed: 0,Name,Date,Time (East Coast),Location,Crewed or Uncrewed,Launched?,High Temp,Low Temp,Ave Temp,Temp at Launch Time,...,Max Wind Speed,Visibility,Wind Speed at Launch Time,Hist Ave Max Wind Speed,Hist Ave Visibility,Sea Level Pressure,Hist Ave Sea Level Pressure,Day Length,Condition,Notes
0,0,04-Dec-58,0,Cape Canaveral,Uncrewed,N,75.0,68.0,71.0,0.0,...,16.0,15.0,0.0,0.0,0.0,30.22,0.0,10:26,Cloudy,0
1,0,05-Dec-58,0,Cape Canaveral,Uncrewed,N,78.0,70.0,73.39,0.0,...,14.0,10.0,0.0,0.0,0.0,30.2,0.0,10:26,Cloudy,0
2,Pioneer 3,06-Dec-58,1:45,Cape Canaveral,Uncrewed,Y,73.0,0.0,60.21,62.0,...,15.0,10.0,11.0,0.0,0.0,30.25,0.0,10:25,Cloudy,0
3,0,07-Dec-58,0,Cape Canaveral,Uncrewed,N,76.0,57.0,66.04,0.0,...,10.0,10.0,0.0,0.0,0.0,30.28,0.0,10:25,Partly Cloudy,0
4,0,08-Dec-58,0,Cape Canaveral,Uncrewed,N,79.0,60.0,70.52,0.0,...,12.0,10.0,0.0,0.0,0.0,30.23,0.0,12:24,Partly Cloudy,0


In [93]:
from sklearn import preprocessing

# 머신러닝 모델의 입력으로 숫자 데이터를 사용해야 하므로 모든 텍스트를 숫자로 변환한다. 예를 들어 유인 로켓이면 1, 무인 로켓이면 0
label_encoder = preprocessing.LabelEncoder()

# 카테고리를 나타내는 텍스트를 가진 아래 3개의 컬럼을 숫자 데이터를 가지도록 변환한다.
launch_data['Crewed or Uncrewed'] = label_encoder.fit_transform(launch_data['Crewed or Uncrewed'])
launch_data['Wind Direction'] = label_encoder.fit_transform(launch_data['Wind Direction'])
launch_data['Condition'] = label_encoder.fit_transform(launch_data['Condition'])

In [94]:
# 머신러닝 모델의 출력으로 우리가 예측하고 싶은 것은 'Launched?' 컬럼이다. 이것을 출력 변수에 할당한다.
y = launch_data['Launched?']

In [95]:
y

0      N
1      N
2      Y
3      N
4      N
      ..
295    N
296    N
297    Y
298    N
299    N
Name: Launched?, Length: 300, dtype: object

In [96]:
launch_data.columns

Index(['Name', 'Date', 'Time (East Coast)', 'Location', 'Crewed or Uncrewed',
       'Launched?', 'High Temp', 'Low Temp', 'Ave Temp', 'Temp at Launch Time',
       'Hist High Temp', 'Hist Low Temp', 'Hist Ave Temp',
       'Percipitation at Launch Time', 'Hist Ave Percipitation',
       'Wind Direction', 'Max Wind Speed', 'Visibility',
       'Wind Speed at Launch Time', 'Hist Ave Max Wind Speed',
       'Hist Ave Visibility', 'Sea Level Pressure',
       'Hist Ave Sea Level Pressure', 'Day Length', 'Condition', 'Notes'],
      dtype='object')

In [97]:
# 머신러닝 모델의 입력값으로 컬럼을 선택한다. 선택한 컬럼들이 feature이다.
launch_data.drop(['Name', 'Date', 'Time (East Coast)', 'Location', 'Launched?', 'Hist Ave Max Wind Speed', 
                  'Hist Ave Visibility', 'Sea Level Pressure', 'Hist Ave Sea Level Pressure', 'Day Length'], axis=1, inplace=True)
x = launch_data

In [98]:
x

Unnamed: 0,Crewed or Uncrewed,High Temp,Low Temp,Ave Temp,Temp at Launch Time,Hist High Temp,Hist Low Temp,Hist Ave Temp,Percipitation at Launch Time,Hist Ave Percipitation,Wind Direction,Max Wind Speed,Visibility,Wind Speed at Launch Time,Condition,Notes
0,1,75.0,68.0,71.00,0.0,75.0,55.0,65.0,0.00,0.08,0,16.0,15.0,0.0,0,0
1,1,78.0,70.0,73.39,0.0,75.0,55.0,65.0,0.00,0.09,0,14.0,10.0,0.0,0,0
2,1,73.0,0.0,60.21,62.0,75.0,55.0,65.0,0.00,0.09,2,15.0,10.0,11.0,0,0
3,1,76.0,57.0,66.04,0.0,75.0,55.0,65.0,0.00,0.08,1,10.0,10.0,0.0,6,0
4,1,79.0,60.0,70.52,0.0,75.0,55.0,65.0,0.00,0.09,0,12.0,10.0,0.0,6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,87.0,71.0,79.36,0.0,87.0,70.0,79.0,0.00,0.15,4,20.0,10.0,0.0,9,0
296,1,86.0,72.0,79.50,0.0,88.0,70.0,79.0,0.01,0.16,5,18.0,10.0,0.0,1,0
297,0,87.0,75.0,79.69,80.0,88.0,70.0,79.0,0.00,0.16,5,16.0,10.0,7.0,1,Rain before launch
298,1,87.0,72.0,79.70,0.0,88.0,70.0,79.0,0.46,0.16,0,13.0,10.0,0.0,0,0


# 로지스틱 회귀

In [99]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
launch_data['Launched?'] = label_enc.fit_transform(launch_data['Launched?'])
launch_data.head()

KeyError: 'Launched?'

In [None]:
# 특성과 타겟으로 분리
x = launch_data.drop(['Name', 'Date', 'Time (East Coast)', 'Location', 'Launched?', 'Hist Ave Max Wind Speed', 
                  'Hist Ave Visibility', 'Sea Level Pressure', 'Hist Ave Sea Level Pressure', 'Day Length'], axis=1)
y = launch_data['Launched?']
x.shape, y.shape

((300, 16), (300,))

In [None]:
cor_matrix = launch_data.corr()
cor_matrix

ValueError: could not convert string to float: 'Pioneer 3'

## 4. 모델 학습
### 의사결정나무

In [None]:
# Machine learning libraries used to build a decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [None]:
from sklearn import linear_model, model_selection, metrics
from sklearn.model_selection import train_test_split

In [None]:
# 데이터를 학습 데이터와 테스트 데이터로 분할한다.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [None]:
len(x_train), len(x_test), len(y_train), len(y_test)

(240, 60, 240, 60)

In [None]:
# 학습 데이터로 머신러닝 모델 학습 시키기
tree_model.fit(x_train, y_train)

NameError: name 'tree_model' is not defined

# 랜덤포레스트