## 프로젝트 제목 
# "Spaceship Titanic: A Machine Learning Approach to Predict Interdimensional Transport"

## 프로젝트 설명
#### 우주선 Titanic을 탄 승객들은 사고에 의해서 다른 차원으로 이동했다. 이때, 어떤 승객이 다른 차원으로 이동했는지 예측해야 한다.

## 데이터셋
- 데이터 출처 링크 Kaggle : https://www.kaggle.com/competitions/spaceship-titanic
- 데이터 칼럼 정보

| Column | Description | Data Type | Example |
| ------ | ----------- | --------- | ------- |
| PassengerId | A unique Id for each passenger. | object | 0001_01 |
| CryoSleep | Indicates if the passenger was in suspended animation during the voyage. | int64 | 0 |
| VIP | Whether the passenger has paid for special VIP service. | int64 | 0 |
| Transported | Whether the passenger was transported to another dimension (target variable). | bool | False |
| HomePlanet_Earth | The planet the passenger departed from - Earth. | float64 | 0.0 |
| HomePlanet_Europa | The planet the passenger departed from - Europa. | float64 | 1.0 |
| HomePlanet_Mars | The planet the passenger departed from - Mars. | float64 | 0.0 |
| Destination_55 Cancri e | The destination planet of the passenger - 55 Cancri e. | float64 | 0.0 |
| Destination_PSO J318.5-22 | The destination planet of the passenger - PSO J318.5-22. | float64 | 0.0 |
| Destination_TRAPPIST-1e | The destination planet of the passenger - TRAPPIST-1e. | float64 | 1.0 |
| Age | The age of the passenger. | float64 | 0.493671 |
| RoomService | Amount billed for room service. | float64 | 0.0 |
| FoodCourt | Amount billed at the food court. | float64 | 0.0 |
| ShoppingMall | Amount billed at the shopping mall. | float64 | 0.0 |
| Spa | Amount billed at the spa. | float64 | 0.0 |
| VRDeck | Amount billed at the VR deck. | float64 | 0.0 |
| Deck | Cabin deck information. | object | B |
| Num | Cabin number. | object | 0 |
| Side | Cabin side (P for Port or S for Starboard). | object | P |
| Group | Group information from PassengerId. | object | 0001 |


In [198]:
import pandas as pd
import numpy as np
import os
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

pd.set_option("display.max_rows", 200)

In [199]:
DATA_DIR = 'data'
datalist = []
for filename in os.listdir(DATA_DIR)[::-1]:
    if filename.endswith('csv'):
        filepath = os.path.join(DATA_DIR, filename)
        df = pd.read_csv(filepath)
        datalist.append(df)
df_all = pd.concat(datalist)

number_of_train_dataset = datalist[0].shape[0]
y_value = df_all.pop('Transported').values
y_value = y_value[:number_of_train_dataset]
df_all = df_all.drop("PassengerId", axis = 1)
df_all = df_all.reset_index(drop = True)
df_all.head(3).T

Unnamed: 0,0,1,2
HomePlanet,Europa,Earth,Europa
CryoSleep,False,False,False
Cabin,B/0/P,F/0/S,A/0/S
Destination,TRAPPIST-1e,TRAPPIST-1e,TRAPPIST-1e
Age,39.0,24.0,58.0
VIP,False,False,True
RoomService,0.0,109.0,43.0
FoodCourt,0.0,9.0,3576.0
ShoppingMall,0.0,25.0,0.0
Spa,0.0,549.0,6715.0


In [200]:
y_value = y_value[:number_of_train_dataset]
df_h = df_all[:number_of_train_dataset]
df_h['Y'] = y_value.astype(int)

df_sleep = df_h[['CryoSleep', 'Y']]
df_sleep = df_sleep.dropna()
df_sleep['CryoSleep'] = df_sleep['CryoSleep'].astype(int)
df_sleep.corr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h['Y'] = y_value.astype(int)


Unnamed: 0,CryoSleep,Y
CryoSleep,1.0,0.468645
Y,0.468645,1.0


In [201]:
df_h

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Y
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0


## 가설
***
#### a. 짐에 대한 정보는 큰 영향을 미칠 것이다.

In [202]:
# 데이터 전처리 및 상관관계 분석
y_value = y_value[:number_of_train_dataset]
df_h = df_all[:number_of_train_dataset]
df_h['Y'] = y_value.astype(int)

df_cabin = df_h[['Cabin', 'Y']]
df_cabin = df_cabin.dropna()
df_cabin_split = df_cabin['Cabin'].str.split('/', expand=True).rename(columns={0:'Deck', 1:'Num', 2:'Side'})
df_cabin = pd.merge(df_cabin, df_cabin_split, right_index=True, left_index=True)
df_cabin = df_cabin.drop('Cabin', axis=1)
df_cabin['Side'] = df_cabin['Side'].map({'P':1, 'S':0})
df_cabin = df_cabin.drop('Num', axis=1)

df_cabin = pd.merge(df_cabin, pd.get_dummies(df_cabin['Deck']).astype(int), right_index=True, left_index=True)
df_cabin = df_cabin.drop('Deck', axis=1)

df_cabin.corr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h['Y'] = y_value.astype(int)


Unnamed: 0,Y,Side,A,B,C,D,E,F,G,T
Y,1.0,-0.103775,-0.002664,0.146571,0.109558,-0.034479,-0.099254,-0.089308,0.016505,-0.014739
Side,-0.103775,1.0,-0.013448,-0.026711,-0.024023,0.011554,-0.003694,0.027308,0.004543,0.014797
A,-0.002664,-0.013448,1.0,-0.056016,-0.05474,-0.043047,-0.059778,-0.12342,-0.115753,-0.004278
B,0.146571,-0.026711,-0.056016,1.0,-0.098672,-0.077595,-0.107754,-0.222473,-0.208653,-0.007712
C,0.109558,-0.024023,-0.05474,-0.098672,1.0,-0.075828,-0.105299,-0.217405,-0.203901,-0.007536
D,-0.034479,0.011554,-0.043047,-0.077595,-0.075828,1.0,-0.082807,-0.170966,-0.160347,-0.005926
E,-0.099254,-0.003694,-0.059778,-0.107754,-0.105299,-0.082807,1.0,-0.237415,-0.222667,-0.00823
F,-0.089308,0.027308,-0.12342,-0.222473,-0.217405,-0.170966,-0.237415,1.0,-0.459728,-0.016992
G,0.016505,0.004543,-0.115753,-0.208653,-0.203901,-0.160347,-0.222667,-0.459728,1.0,-0.015936
T,-0.014739,0.014797,-0.004278,-0.007712,-0.007536,-0.005926,-0.00823,-0.016992,-0.015936,1.0


In [203]:
df_cabin

Unnamed: 0,Y,Side,A,B,C,D,E,F,G,T
0,0,1,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
8688,0,1,1,0,0,0,0,0,0,0
8689,0,0,0,0,0,0,0,0,1,0
8690,1,0,0,0,0,0,0,0,1,0
8691,0,0,0,0,0,0,1,0,0,0


In [204]:
# 정확도 예측
X = df_cabin[["Side", "A", "B", "C", "D", "E", "F", "G", "T"]].values
y = df_cabin[["Y"]].values

clf = LogisticRegression(random_state=0).fit(X, y)
print(accuracy_score(clf.predict(X), y))

0.5978337650105957


  y = column_or_1d(y, warn=True)


In [205]:
# Test Y예측
df_target = df_all[number_of_train_dataset:]
df_cabin = df_target[['Cabin']]
df_cabin = df_cabin.dropna()
df_cabin_split = df_cabin['Cabin'].str.split('/', expand=True).rename(columns={0:'Deck', 1:'Num', 2:'Side'})
df_cabin = pd.merge(df_cabin, df_cabin_split, right_index=True, left_index=True)
df_cabin = df_cabin.drop('Cabin', axis=1)
df_cabin['Side'] = df_cabin['Side'].map({'P':1, 'S':0})
df_cabin = df_cabin.drop('Num', axis=1)

df_cabin = pd.merge(df_cabin, pd.get_dummies(df_cabin['Deck']).astype(int), right_index=True, left_index=True)
df_cabin = df_cabin.drop('Deck', axis=1)

X = df_cabin[["Side", "A", "B", "C", "D", "E", "F", "G", "T"]].values
clf.predict(X)

array([1, 0, 1, ..., 0, 0, 1])

#### b. HomePlanet이나 Destination이 큰 영향을 미칠 것이다.

In [206]:
# 데이터 전처리 및 상관관계 분석
y_value = y_value[:number_of_train_dataset]
df_h = df_all[:number_of_train_dataset]
df_h['Y'] = y_value.astype(int)

df_location = df_h[['HomePlanet', 'Destination', 'Y']]
df_location = pd.merge(df_location, pd.get_dummies(df_location[['HomePlanet', 'Destination']]), right_index=True, left_index=True)
df_location = df_location.drop(['HomePlanet', 'Destination'], axis=1)
df_location.corr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h['Y'] = y_value.astype(int)


Unnamed: 0,Y,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
Y,1.0,-0.169019,0.176916,0.019544,0.108722,9.2e-05,-0.0947
HomePlanet_Earth,-0.169019,1.0,-0.604411,-0.534195,-0.149534,0.232218,-0.015
HomePlanet_Europa,0.176916,-0.604411,1.0,-0.287022,0.293517,-0.163308,-0.149679
HomePlanet_Mars,0.019544,-0.534195,-0.287022,1.0,-0.120996,-0.11126,0.170778
Destination_55 Cancri e,0.108722,-0.149534,0.293517,-0.120996,1.0,-0.16224,-0.745664
Destination_PSO J318.5-22,9.2e-05,0.232218,-0.163308,-0.11126,-0.16224,1.0,-0.463273
Destination_TRAPPIST-1e,-0.0947,-0.015,-0.149679,0.170778,-0.745664,-0.463273,1.0


In [207]:
df_location

Unnamed: 0,Y,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e
0,0,False,True,False,False,False,True
1,1,True,False,False,False,False,True
2,0,False,True,False,False,False,True
3,0,False,True,False,False,False,True
4,1,True,False,False,False,False,True
...,...,...,...,...,...,...,...
8688,0,False,True,False,True,False,False
8689,0,True,False,False,False,True,False
8690,1,True,False,False,False,False,True
8691,0,False,True,False,True,False,False


In [208]:
# 정확도 예측
X = df_location[['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']].values
y = df_location[["Y"]].values

clf = LogisticRegression(random_state=0).fit(X, y)
print(accuracy_score(clf.predict(X), y))

0.5852985160473945


  y = column_or_1d(y, warn=True)


In [209]:
# Test Y예측
df_location = df_target[['HomePlanet', 'Destination']]
df_location = pd.merge(df_location, pd.get_dummies(df_location[['HomePlanet', 'Destination']]), right_index=True, left_index=True)
df_location = df_location.drop(['HomePlanet', 'Destination'], axis=1)

X = df_location[['HomePlanet_Earth', 'HomePlanet_Europa', 'HomePlanet_Mars', 'Destination_55 Cancri e', 'Destination_PSO J318.5-22', 'Destination_TRAPPIST-1e']].values
clf.predict(X)

array([0, 0, 1, ..., 1, 1, 0])

#### c. 지불금액이 큰 영향을 미칠 것이다.

In [210]:
# 데이터 전처리 및 상관관계 분석
y_value = y_value[:number_of_train_dataset]
df_h = df_all[:number_of_train_dataset]
df_h['Y'] = y_value.astype(int)
df_billed = df_h[['VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Y']]

for i in df_billed:
    df_billed[i] = df_billed[i].fillna(0)

df_billed['VIP'] = df_billed['VIP'].astype(int)
df_billed.corr()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_h['Y'] = y_value.astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_billed[i] = df_billed[i].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_billed['VIP'] = df_billed['VIP'].astype(int)


Unnamed: 0,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Y
VIP,1.0,0.056566,0.125499,0.018412,0.060991,0.123061,-0.037261
RoomService,0.056566,1.0,-0.015126,0.052337,0.009244,-0.018624,-0.241124
FoodCourt,0.125499,-0.015126,1.0,-0.013717,0.221468,0.224572,0.045583
ShoppingMall,0.018412,0.052337,-0.013717,1.0,0.014542,-0.007849,0.009391
Spa,0.060991,0.009244,0.221468,0.014542,1.0,0.147658,-0.218545
VRDeck,0.123061,-0.018624,0.224572,-0.007849,0.147658,1.0,-0.204874
Y,-0.037261,-0.241124,0.045583,0.009391,-0.218545,-0.204874,1.0


In [211]:
df_billed

Unnamed: 0,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Y
0,0,0.0,0.0,0.0,0.0,0.0,0
1,0,109.0,9.0,25.0,549.0,44.0,1
2,1,43.0,3576.0,0.0,6715.0,49.0,0
3,0,0.0,1283.0,371.0,3329.0,193.0,0
4,0,303.0,70.0,151.0,565.0,2.0,1
...,...,...,...,...,...,...,...
8688,1,0.0,6819.0,0.0,1643.0,74.0,0
8689,0,0.0,0.0,0.0,0.0,0.0,0
8690,0,0.0,0.0,1872.0,1.0,0.0,1
8691,0,0.0,1049.0,0.0,353.0,3235.0,0


In [212]:
# 정확도 예측
X = df_billed[['VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].values
y = df_billed[["Y"]].values

clf = LogisticRegression(random_state=0).fit(X, y)
print(accuracy_score(clf.predict(X), y))

0.7751064074542735


  y = column_or_1d(y, warn=True)


In [214]:
# Test Y예측
df_billed = df_target[['VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']]

for i in df_billed:
    df_billed[i] = df_billed[i].fillna(0)

df_billed['VIP'] = df_billed['VIP'].astype(int)

X = df_billed[['VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].values
clf.predict(X)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_billed[i] = df_billed[i].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_billed['VIP'] = df_billed['VIP'].astype(int)


array([1, 0, 1, ..., 1, 1, 1])

## 결과 및 해석
- 가설 a. "짐에 대한 정보는 큰 영향을 미칠 것이다."
    * 정확도: 0.5978337650105957
- 가설 b. "HomePlanet이나 Destination이 큰 영향을 미칠 것이다."
    * 정확도: 0.5852985160473945
- 가설 c. "지불금액이 큰 영향을 미칠 것이다."
    * 정확도: 0.7751064074542735

### 따라서 지불금액이 큰 영향을 미친다.