In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# 개요
[참고한 노트북](https://www.kaggle.com/code/heyrobin/spaceship-titanic-a-cosmic-mystery): Spaceship Titanic Kaggle Competition의 위 노트북을 참고해 코드를 따라 쓰면서, 필자가 해석한 내용을 설명하고 머신러닝을 이해하기 위한 목적으로 작성했습니다.

<strong>Data</strong> - 승객의 약 3분의 2(~8700)에 대한 개인 기록은 훈련 데이터로 사용됩니다.
<ul><li><code>PassengerId</code> - 각 승객의 고유 ID. 각 ID는 <code>gggg_pp</code> 형식을 취하며, 여기서 <code>gggg</code>는 승객이 함께 여행하는 그룹을 나타내고, <code>pp</code>는 그룹 내 번호를 나타냅니다. 그룹의 사람들은 대개 가족 구성원이지만 항상 그렇지만은 않습니다.</li>
<li><code>HomePlanet</code> - 승객이 출발한 행성으로, 일반적으로 그들의 영구 거주지 행성입니다.</li>
<li><code>CryoSleep</code> - 승객이 항해가 진행되는 동안 가사 상태(저온 수면)로 전환되도록 선택했는지 여부를 나타냅니다. 저온 수면 중인 승객들은 객실에만 머물게 됩니다.</li>
<li><code>Cabin</code> - 승객이 머물고 있는 객실 번호입니다. 갑판/번호/측면<code>deck/num/side</code>형식을 취합니다. 여기서 측면<code>side</code>은 좌측이 <code>P</code>, <em>Port</em> 또는 우측이 <code>S</code>, <em>Starboard</em>가 될 수 있습니다.</li>
<li><code>Destination</code> - 승객이 정박할 목적지 행성입니다.</li>
<li><code>Age</code> - 승객의 나이를 나타냅니다.</li>
<li><code>VIP</code> - 승객이 항해 중 특별 VIP 서비스를 위해 비용을 지불했는지 여부를 나타낸 것 입니다.</li>
<li><code>RoomService</code>, <code>FoodCourt</code>, <code>ShoppingMall</code>, <code>Spa</code>, <code>VRDeck</code> - 승객들이 <em>Spaceship Titanic</em>의 많은 고급 편의시설에서 청구한 금액입니다. </li>
<li><code>Name</code> - 승객들의 이름을 나타냅니다.</li>
<li><code>Transported</code> - 승객이 다른 차원으로 이송되었는지 여부. 예측하려는 대상 열입니다.</li></ul>

In [2]:
'''Analysis(분석)'''

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.offline as py
import plotly.graph_objs as go
import matplotlib.pyplot as plt

'''Pre-Processing(데이터 전처리)'''
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

'''Model-Metrics(모형 개발 및 평가)'''
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import warnings
warnings.filterwarnings('ignore')

'''Dataset'''
# Read the competition's train and test splits; the relative path points at
# Kaggle's read-only "../input/" directory.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')

- 라이브러리 버전 확인

In [3]:
# Report the installed numpy, pandas and seaborn versions, in that order.
for lib in (np, pd, sns):
    print(lib.__version__)

1.20.3
1.3.5
0.11.2


- 데이터 확인

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


- 컬럼별 레이블의 개수 확인
    + DataFrame.nunique() = 각 컬럼에 존재하는 고유값(레이블)의 개수를 확인하기 위해 사용

In [6]:
# Number of distinct non-null values in each column.
train.nunique()

PassengerId     8693
HomePlanet         3
CryoSleep          2
Cabin           6560
Destination        3
Age               80
VIP                2
RoomService     1273
FoodCourt       1507
ShoppingMall    1115
Spa             1327
VRDeck          1306
Name            8473
Transported        2
dtype: int64

# 분석 데이터 탐색

- HomePlanet

In [7]:
# Passenger count per home planet. value_counts() ignores missing values and
# orders the result from most to least frequent.
cnt_srs = train['HomePlanet'].value_counts()

# Bubble chart: one marker per planet, sized by its passenger count.
trace = go.Scatter(
    x = cnt_srs.index,
    y = cnt_srs.values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        sizeref = 20,  # scale factor applied to `size` so the bubbles fit the plot
        size = cnt_srs.values,
        # One colour per planet; the list assumes exactly three categories.
        color = ['#1D7595','#B9B596','#864D29']
    ),
)

layout = go.Layout(
    # The title is configured through `title.text` / `title.x` / `title.font`.
    # The former top-level `titlefont` attribute is deprecated and is not
    # accepted by newer plotly releases.
    title = dict(
        text = '<b>Home Planets</b>',
        x = 0.5,
        font = dict(size = 20, color = 'black', family = 'Space Mono'),
    ),
    plot_bgcolor = 'rgba(0,0,0,0)'
)

data = [trace]
fig = go.Figure(data = data, layout = layout)
fig.update_xaxes(showline = True, linewidth = 2)
fig.update_yaxes(showline = True, linewidth = 2)
# NOTE(review): the filename does not describe this chart (it looks copied from
# elsewhere); it is kept unchanged here -- confirm before renaming.
py.iplot(fig, filename = "Size of the Company/Clan")

    + Earth: 4602
    + Europa: 2131
    + Mars: 1759

- Destination

In [8]:
# Passenger count per destination. value_counts() ignores missing values and
# orders the result from most to least frequent.
cnt_srs = train['Destination'].value_counts()

# Bubble chart: one marker per destination, sized by its passenger count.
trace = go.Scatter(
    x = cnt_srs.index,
    y = cnt_srs.values,
    mode = 'markers',
    marker = dict(
        sizemode = 'diameter',
        sizeref = 20,  # scale factor applied to `size` so the bubbles fit the plot
        size = cnt_srs.values,
        # One colour per destination; the list assumes exactly three categories.
        color = ['#048B95','#A1231F','#602F58']
    ),
)

layout = go.Layout(
    # The title is configured through `title.text` / `title.x` / `title.font`.
    # The former top-level `titlefont` attribute is deprecated and is not
    # accepted by newer plotly releases.
    title = dict(
        text = '<b>Destination</b>',
        x = 0.5,
        font = dict(size = 20, color = 'black', family = 'Space Mono'),
    ),
    plot_bgcolor = 'rgba(0,0,0,0)'
)

fig = go.Figure(data = [trace], layout = layout)

# Decorative image placed relative to the plotting area ("x domain"/"y domain").
sources = "https://assets.stickpng.com/images/580b585b2edbce24c47b2d2a.png"

fig.add_layout_image(
        source=sources,
        xref = "x domain",
        yref = "y domain",
        x = 0.2,
        y = 0.8,
        xanchor = "right",
        yanchor = "top",
        sizex = 1,
        sizey = 0.7,
        # NOTE(review): `opacity` expects a number between 0 and 1; a boolean
        # is ambiguous here -- confirm the intended value and replace it.
        opacity = False
    )

fig.update_xaxes(showline=True, linewidth=2)
fig.update_yaxes(showline=True, linewidth=2)

# Render explicitly, like the other chart cells, instead of relying on the
# value of the cell's last expression.
py.iplot(fig)

    + TRAPPIST-1e: 5915
    + 55 Cancri e: 1800
    + PSO J318.5-22: 796

- Age Distribution

In [9]:
# Histogram of passenger ages in one-year bins over [0, 100), with bar heights
# normalised to percentages.
trace = go.Histogram(
    x = train['Age'],
    histnorm = 'percent',
    xbins = dict(start = 0, end = 100, size = 1),
    marker_color = '#048B95',
)

layout = go.Layout(
    # The title is configured through `title.text` / `title.x` / `title.font`.
    # The former top-level `titlefont` attribute is deprecated and is not
    # accepted by newer plotly releases.
    title = dict(
        text = '<b>Age Distribution</b>',
        x = 0.5,
        font = dict(size = 20, color = 'black', family = 'Space Mono'),
    ),
    plot_bgcolor = 'rgba(0, 0, 0, 0)',
    xaxis_title = 'Age',
    yaxis_title = 'Percent'
)

fig = go.Figure(data = [trace], layout = layout)

fig.update_xaxes(showline = True, linewidth = 2)
fig.update_yaxes(showline = True, linewidth = 2)

# NOTE(review): the filename "planets" does not describe this chart; it is
# kept unchanged here -- confirm before renaming.
py.iplot(fig, filename = "planets")

In [10]:
# Split off the prediction target, then keep only the feature columns in `train`.
y = train['Transported']
train = train.drop(columns = 'Transported')

- 필요없는 특성 정리
    + drop(inplace = True) : 새 데이터 프레임을 반환하지 않고 기존 데이터 프레임 자체를 변경하겠다는 의미. 그렇기 때문에 이후 데이터 프레임을 다시 출력하면 삭제한 컬럼은 나타나지 않음. [참고링크](https://www.dinolabs.ai/70)

In [11]:
# Remove irrelevant features.

# Alternative that keeps Cabin: ['Name', 'PassengerId']
cols_to_drop = ['Name', 'PassengerId', 'Cabin']
def drop(train,test,cols):
    """Remove the given columns from both frames in place.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames to modify. Both are mutated in place; nothing is returned.
    cols : label or list of labels
        Column names to remove. Names that are absent from a frame are
        skipped, so re-running the cell after the columns are already gone
        is a no-op instead of a ``KeyError``, and a name missing from only
        one frame cannot leave the two frames half-processed.
    """
    for frame in (train, test):
        frame.drop(columns = cols, inplace = True, errors = 'ignore')
    print('sucessfully droped features')
    
# Applies to both frames in place; the helper returns nothing.
drop(train, test, cols_to_drop)

sucessfully droped features
