## Data Exploration - Spaceship Titanic Kaggle Competition

In [1]:
import boto3
import pandas as pd
import numpy as np
import cleaning
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
pd.set_option('display.max_columns', 50)

In [2]:
## S3 bucket that holds the competition files
bucket_name = 'data-448-bucket-callaghan'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

## Keys of the training and testing files inside the bucket
file_key, file_key2 = 'titanic_train.csv', 'titanic_test.csv'

## Fetching each object and pulling out its streaming body
bucket_object, bucket_object2 = bucket.Object(file_key), bucket.Object(file_key2)
file_object, file_object2 = bucket_object.get(), bucket_object2.get()
file_content_stream, file_content_stream2 = file_object.get('Body'), file_object2.get('Body')

## Using pandas to read the data files straight from the streams
train = pd.read_csv(file_content_stream)
test = pd.read_csv(file_content_stream2)

train.head()



Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


### Exploration

In [None]:
## Home Planet: category counts in each data set

for label, frame in (('Training Set:\n', train), ('\nTesting Set:\n', test)):
    print(label, frame['HomePlanet'].value_counts())

In [None]:
## Cryo Sleep: category counts in each data set

for label, frame in (('Training Set:\n', train), ('\nTesting Set:\n', test)):
    print(label, frame['CryoSleep'].value_counts())

In [None]:
## Cabin: number of distinct cabins in the training set, then the testing set

for frame in (train, test):
    print(frame['Cabin'].nunique())

In [None]:
## Destination: category counts in each data set

for label, frame in (('Training Set:\n', train), ('\nTesting Set:\n', test)):
    print(label, frame['Destination'].value_counts())

In [None]:
## Visualizing features:
## One row per numeric feature, training set on the left and testing set on the right.
## Drawing both panels from the same loop keeps each "(Test)" panel tied to the test
## data (the FoodCourt, ShoppingMall, Spa and VRDeck test panels used to plot train).

numeric_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
panels = [('Train', train), ('Test', test)]

## Creating a new figure
fig, axes = plt.subplots(len(numeric_features), len(panels), figsize = (16, 20))

for row, feature in enumerate(numeric_features):
    for col, (label, frame) in enumerate(panels):
        ax = axes[row, col]

        ## 10 bins matches the explicit setting used for Age and matplotlib's default
        ax.hist([frame[feature]], bins = 10)
        ax.set_ylabel(f'{feature} ({label})')
        ax.grid()

In [None]:
## VIP: category counts in each data set

for label, frame in (('Training Set:\n', train), ('\nTesting Set:\n', test)):
    print(label, frame['VIP'].value_counts())

### Data Cleaning / Variable Engineering

In [3]:
## Looking at missing values: per-column count of nulls in each data set

train_missing = train.isna().sum()
test_missing = test.isna().sum()

print('Training:', train_missing)

print('\nTesting:', test_missing)

Training: PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

Testing: PassengerId       0
HomePlanet       87
CryoSleep        93
Cabin           100
Destination      92
Age              91
VIP              93
RoomService      82
FoodCourt       106
ShoppingMall     98
Spa             101
VRDeck           80
Name             94
dtype: int64


In [4]:
## Extracting group number and passenger number from PassengerID variable

## The same steps are applied to the training and testing frames in turn
for frame in (train, test):

    ## Splitting the Id on its first underscore into two columns
    id_parts = frame['PassengerId'].str.split('_', n = 1, expand = True)

    ## Part before the underscore is the group; it is kept as text
    frame['GroupNumber'] = id_parts[0]

    ## Part after the underscore is the passenger's number in the group; changing it to int
    frame['GroupPerson'] = id_parts[1].astype(int)

In [5]:
## Creating GroupTotal variables
## (the largest GroupPerson value seen within each GroupNumber)

## Training set: one row per group with its total, then joined back onto the passengers
totals = (
    train.groupby('GroupNumber')['GroupPerson']
    .max()
    .rename('GroupTotal')
    .reset_index()
)
train = train.merge(totals, how = 'left')


## Testing set: same steps, with totals computed from the testing groups only
totals = (
    test.groupby('GroupNumber')['GroupPerson']
    .max()
    .rename('GroupTotal')
    .reset_index()
)
test = test.merge(totals, how = 'left')

In [6]:
## Cleaning HomePlanet variable using the clean_method_mode function
## (applied to the training set first, then the testing set)

train, test = [cleaning.clean_method_mode(frame, 'HomePlanet') for frame in (train, test)]

In [7]:
## Cleaning Destination variable using the clean_method_mode function
## (applied to the training set first, then the testing set)

train, test = [cleaning.clean_method_mode(frame, 'Destination') for frame in (train, test)]

In [8]:
## Cleaning Age variable using the clean_method_mean function
## (applied to the training set first, then the testing set)

train, test = [cleaning.clean_method_mean(frame, 'Age') for frame in (train, test)]

In [9]:
## Cleaning RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck variables using the clean_method_zero function

spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

## Training set
for column in spending_columns:
    train = cleaning.clean_method_zero(train, column)

## Testing set
for column in spending_columns:
    test = cleaning.clean_method_zero(test, column)

In [10]:
## Cleaning Name variable using the clean_method_string function
## (applied to the training set first, then the testing set)

train, test = [cleaning.clean_method_string(frame, 'Name') for frame in (train, test)]

In [11]:
## Cleaning CryoSleep and VIP variables using the clean_method_boolean function

boolean_columns = ['CryoSleep', 'VIP']

## Training set
for column in boolean_columns:
    train = cleaning.clean_method_boolean(train, column)

## Testing set
for column in boolean_columns:
    test = cleaning.clean_method_boolean(test, column)

In [12]:
## Cleaning Cabin variable using the clean_method_cabin function
## (applied to the training set first, then the testing set)

train, test = [cleaning.clean_method_cabin(frame, 'Cabin') for frame in (train, test)]

In [13]:
## Looking at missing values after cleaning:
## every column should now report zero nulls in both data sets

for label, frame in (('Training:', train), ('\nTesting:', test)):
    print(label, frame.isna().sum())

Training: PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
Transported     0
GroupNumber     0
GroupPerson     0
GroupTotal      0
dtype: int64

Testing: PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Name            0
GroupNumber     0
GroupPerson     0
GroupTotal      0
dtype: int64


In [14]:
## Recoding the categorical and boolean variables as integers.
## The same coding is applied to the training and testing frames.
for frame in (train, test):

    ## Changing HomePlanet to numeric: Earth -> 0, Europa -> 1, anything else -> 2
    planet = frame['HomePlanet']
    frame['HomePlanet'] = np.where(planet == 'Earth', 0,
                                   np.where(planet == 'Europa', 1, 2))

    ## Changing Destination to numeric: TRAPPIST-1e -> 0, 55 Cancri e -> 1, anything else -> 2
    destination = frame['Destination']
    frame['Destination'] = np.where(destination == 'TRAPPIST-1e', 0,
                                    np.where(destination == '55 Cancri e', 1, 2))

    ## Changing CryoSleep and VIP to numeric: False -> 0, anything else -> 1
    for flag in ('CryoSleep', 'VIP'):
        frame[flag] = np.where(frame[flag] == False, 0, 1)

## Changing Transported to numeric (the target only exists in the training set)
train['Transported'] = np.where(train['Transported'] == False, 0, 1)

In [15]:
## Creating First and Last Name variables

## The same steps are applied to the training and testing frames in turn
for frame in (train, test):

    ## Splitting the name on its first space into two columns
    name_parts = frame['Name'].str.split(' ', n = 1, expand = True)

    ## Text before the first space is the first name, the remainder is the last name
    frame['First'] = name_parts[0]
    frame['Last'] = name_parts[1]

In [16]:
## Previewing the first rows of the training data after cleaning and encoding
train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,GroupNumber,GroupPerson,GroupTotal,First,Last
0,0001_01,1,0,B/0/P,0,39.0,0,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,1,1,1,Maham,Ofracculy
1,0002_01,0,0,F/0/S,0,24.0,0,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,2,1,1,Juanna,Vines
2,0003_01,1,0,A/0/S,0,58.0,1,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,3,1,2,Altark,Susent
3,0003_02,1,0,A/0/S,0,33.0,0,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,3,2,2,Solam,Susent
4,0004_01,0,0,F/1/S,0,16.0,0,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,4,1,1,Willy,Santantines


In [None]:
## Extracting deck, num, and side from the Cabin variable
## Cabin values look like deck/num/side (e.g. B/0/P), so splitting on '/' gives the three parts.
## Both frames go through the same vectorized split; the earlier row-by-row loop never
## wrote the training values, leaving Deck, Num and Side as empty strings there.

cabin_part_names = ['Deck', 'Num', 'Side']

for frame in (train, test):

    ## Series of lists, one list of parts per passenger
    cabin_parts = frame['Cabin'].str.split('/')

    ## Defining new variables; a cabin with fewer than three parts yields NaN for the
    ## missing part instead of raising an IndexError
    for position, part_name in enumerate(cabin_part_names):
        frame[part_name] = cabin_parts.str[position]

#check value counts and create dummy/numeric variables

In [None]:
## Extracting group number and travel number from PassengerID

## PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp, where
## gggg indicates the group the passenger is travelling with and pp is their number
## within the group. People in a group are often family members, but not always.

In [None]:
## Removing Cabin and Name from both data sets
unused_columns = ['Cabin', 'Name']
train, test = train.drop(columns = unused_columns), test.drop(columns = unused_columns)

In [None]:
## Creating dummy variables for Destination and HomePlanet
## get_dummies replaces each listed column with one indicator column per value,
## named <variable>_<value> (e.g. HomePlanet_0 for the value coded as 0).
## Pasted HomePlanet value counts, kept for reference: Earth 4602, Europa 2131, Mars (count missing)
dummy_columns = ['Destination', 'HomePlanet']

train = pd.get_dummies(train, columns = dummy_columns)
test = pd.get_dummies(test, columns = dummy_columns)

## Creating dummy variable for Transported
## Transported was already changed to 0/1 in the "Changing Transported to numeric" step,
## so no extra dummy column is needed for it.



In [None]:
## Previewing the first rows of the training data
train.head()