In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_columns', None)
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/spaceship-titanic/sample_submission.csv
/kaggle/input/spaceship-titanic/train.csv
/kaggle/input/spaceship-titanic/test.csv


# **Importing the data**

In [2]:
train_data = pd.read_csv("/kaggle/input/spaceship-titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/spaceship-titanic/test.csv")
full_data = pd.concat([train_data,test_data])

# getting some information

In [3]:
train_data.describe()
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


we are trying to predict on transported, so I will make this column binary

In [5]:
train_data["Transported"] = train_data["Transported"].astype("int")
avgtransported = sum(train_data["Transported"])/len(train_data["Transported"])


In [6]:
print(train_data.isnull().sum())


PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64


In [7]:
train_data.describe(include = ["O"])


Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,VIP,Name
count,8693,8492,8476,8494,8511,8490,8493
unique,8693,3,2,6560,3,2,8473
top,0001_01,Earth,False,G/734/S,TRAPPIST-1e,False,Gollux Reedall
freq,1,4602,5439,8,5915,8291,2


**So far conclusion** 
* a lot of the data missing
* 50% of the passengers were transported
* More than 75% of the passengers are under the age of 38 and there some passengers are over 70 years old.
* More than 50% of the passengers didn't spend any money for RoomService, FoodCourt, ShoppingMall, Spa, VRDeck.
* there are too high outliers in RoomService, FoodCourt, ShoppingMall, Spa, VRDeck.
* Earth is the most common HomePlanet.
* Most of the passengers were not put into a cryosleep state.
* Most of the passengers going to TRAPPIST-1e.
* only 199 passengers are VIP.

In [8]:
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


we have problem with the "_" in PassengerId, we want to make it an integer. i'll do it for both test and train.

In [9]:
data = [train_data,test_data]
for data in data:
    data['PassengerId'] = data.PassengerId.str.replace('_','')
    data['PassengerId'] = data['PassengerId'].astype('int32')

In [10]:
train_data.head()
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,1301,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,1801,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,1901,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,2101,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,2301,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


# Dilling with missing values

In [11]:
object_cols = [col for col in train_data.columns if train_data[col].dtype == 'object']
print(f'Object columns: {object_cols}')
numerical_cols = [col for col in train_data.columns if train_data[col].dtype in ['int64','float64','int32'] and col not in ['Transported']]
print(f'Numerical columns: {numerical_cols}')


Object columns: ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'Name']
Numerical columns: ['PassengerId', 'Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [12]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imp = IterativeImputer(random_state=42)

date = pd.Timestamp('2200-01-01')

for col in train_data:
    if train_data[col].dtype=="object":
        train_data[col].fillna("not listed", inplace=True)
    if train_data[col].dtype=="int":
        train_data[col].fillna(train_data[col].mean(), inplace=True)
    if train_data[col].dtype=='float':
        train_data[col] = imp.fit_transform(train_data[col].values.reshape(-1,1))
    if train_data[col].dtype=="datetime64[ns]":
        train_data[col].fillna(date, inplace=True)
train_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,101,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0
1,201,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1
2,301,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0
3,302,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0
4,401,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1


In [13]:
for col in test_data:
    if test_data[col].dtype=="object":
        test_data[col].fillna("not listed", inplace=True)
    if test_data[col].dtype=="int":
        test_data[col].fillna(test_data[col].mean(), inplace=True)
    if test_data[col].dtype=='float':
        test_data[col] = imp.fit_transform(test_data[col].values.reshape(-1,1))
    if test_data[col].dtype=="datetime64[ns]":
        test_data[col].fillna(date, inplace=True)
test_data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,1301,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,Nelly Carsoning
1,1801,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,Lerome Peckers
2,1901,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,Sabih Unhearfus
3,2101,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,Meratz Caltilter
4,2301,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,Brence Harperez


**is there still any missing values?**

In [14]:
print(sum(train_data.isna().sum())+sum(test_data.isna().sum()))

0


to decide my featurs, i'll take the numerical coulmns and the objectiv coulmns with not too much unique values

In [15]:
for col in object_cols:
    print('unique: ',train_data[col].unique())
    print('nunique: ',train_data[col].nunique())

unique:  ['Europa' 'Earth' 'Mars' 'not listed']
nunique:  4
unique:  [False True 'not listed']
nunique:  3
unique:  ['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
nunique:  6561
unique:  ['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' 'not listed']
nunique:  4
unique:  [False True 'not listed']
nunique:  3
unique:  ['Maham Ofracculy' 'Juanna Vines' 'Altark Susent' ... 'Fayey Connon'
 'Celeon Hontichre' 'Propsh Hontichre']
nunique:  8474


I will drop cabin and name

In [16]:
train_data_ready = train_data.drop(['Cabin','Name'],axis=1)
test_data1_ready = test_data.drop(['Cabin','Name'],axis=1)

In [17]:
from sklearn.model_selection import train_test_split
y = train_data_ready.Transported
X = test_data1_ready.drop(['Transported'], axis=1)

# Divide data into training and validation subsets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2,
                                                                random_state=0)

KeyError: "['Transported'] not found in axis"

Xgboost

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 100)

In [None]:
from xgboost import XGBClassifier

bp = {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 500, 'subsample': 1}


model = XGBClassifier(n_estimators = bp['n_estimators'], max_depth = bp['max_depth'],
                      subsample = bp['subsample'], learning_rate = bp['learning_rate'], random_state = 100).fit(X_train, y_train)

pred = model.predict(X_test)

from sklearn.metrics import accuracy_score, confusion_matrix

accuracy_score(y_test, pred)

submition of outcome

In [None]:
submition = model.predict(data_test_X)
sub_file = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
sub_file['Transported'] = submition
sub_file

change from 1/0 to True/False

In [None]:
def change_to_bool(s):
    if s == 1:
        return True
    else:
        return False
    
sub_file['Transported'] = pd.Series(sub).apply(change_to_bool)
sub_file
sub_file.to_csv('Dean_submition.csv', index = False)
