In [91]:
from pathlib import Path

import numpy as np
import pandas as pd
# Folder that holds train.csv and test.csv. It is defined once so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = Path("C:\\Users\\joshn\\Documents\\Coding\\Spaceship Titanic")

train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

# Stack train and test so features are engineered on both at once.
# ignore_index=True gives every row a unique label; without it the two frames'
# own row labels are both kept, so labels repeat and label-based lookups
# (.loc, index alignment) become ambiguous.
data = pd.concat([train, test], sort=False, ignore_index=True)

In [92]:
# GROUP / FAMILY FEATURES

# Break PassengerId on "_" and Name on " " up front; both split frames are
# kept as notebook-level variables.
id_split = data['PassengerId'].str.split('_', expand=True)
name_split = data['Name'].str.split(' ', expand=True)

# Group: the part of PassengerId before the underscore
data['Group'] = id_split[0]
# Surname: the second whitespace-separated token of Name
data['Surname'] = name_split[1]
# Family: passengers sharing both a group and a surname, keyed "<Group>_<Surname>"
data['Family'] = data['Group'].astype(str) + "_" + data['Surname']
# GroupSize: number of rows that share the passenger's Group
data['GroupSize'] = data.groupby('Group')['Group'].transform('size')
# Expenses?: True when the row's summed spending is non-zero
# (sum skips missing amounts, so a row of only zeros/NaNs is False)
data['Expenses?'] = (
    data[['VRDeck', 'ShoppingMall', 'Spa', 'RoomService', 'FoodCourt']]
    .sum(axis=1)
    .ne(0)
)

data.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Group,Surname,Family,GroupSize,Expenses?
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,1,Ofracculy,0001_Ofracculy,1,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,2,Vines,0002_Vines,1,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,3,Susent,0003_Susent,2,True
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,3,Susent,0003_Susent,2,True
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,4,Santantines,0004_Santantines,1,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True,5,Hinetthews,0005_Hinetthews,1,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True,6,Jacostaffey,0006_Jacostaffey,2,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True,6,Jacostaffey,0006_Jacostaffey,2,False
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True,7,Beston,0007_Beston,1,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True,8,Flatic,0008_Flatic,3,False


In [93]:
def calculate_consistency(data, group_column, check_columns):
    """Measure how often every member of a group shares one value of a column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``group_column`` and every column in ``check_columns``.
    group_column : str
        Column whose values define the groups.
    check_columns : iterable of str
        Columns to test for within-group agreement.

    Returns
    -------
    dict
        Maps each checked column to the fraction of groups that contain exactly
        one distinct non-missing value of that column. Missing values are not
        counted as distinct values, so a group whose values are all missing
        has zero distinct values and is scored as not consistent.
    """
    by_group = data.groupby(group_column)
    return {
        column: by_group[column].nunique().eq(1).mean()
        for column in check_columns
    }

# Measure within-Group, within-Family and within-Cabin agreement for the same
# set of columns; the column list is shared by all three runs.
consistency_columns = ['VIP', 'HomePlanet', 'CryoSleep', 'Cabin', 'Family', 'Surname', 'Destination', 'Group']

group_consistency, family_consistency, cabin_consistency = (
    calculate_consistency(data, grouping, consistency_columns)
    for grouping in ('Group', 'Family', 'Cabin')
)

group_consistency, family_consistency, cabin_consistency



({'VIP': 0.9658405172413793,
  'HomePlanet': 0.9831896551724137,
  'CryoSleep': 0.850646551724138,
  'Cabin': 0.9101293103448276,
  'Family': 0.9438577586206897,
  'Surname': 0.9438577586206897,
  'Destination': 0.868103448275862,
  'Group': 1.0},
 {'VIP': 0.9677013422818792,
  'HomePlanet': 0.9826971476510067,
  'CryoSleep': 0.8644085570469798,
  'Cabin': 0.917890100671141,
  'Family': 1.0,
  'Surname': 1.0,
  'Destination': 0.8800335570469798,
  'Group': 1.0},
 {'VIP': 0.9688549618320611,
  'HomePlanet': 0.9815776081424936,
  'CryoSleep': 0.8867175572519084,
  'Cabin': 1.0,
  'Family': 0.9488040712468193,
  'Surname': 0.9488040712468193,
  'Destination': 0.890178117048346,
  'Group': 1.0})

In [101]:
def test_assumptions(data, age_col, cryo_col):
    # Check if under 13s have non-zero spending
    under_13_spending = data[(data[age_col] < 13) & (data[['VRDeck','ShoppingMall','Spa','RoomService','FoodCourt']] > 0).any(axis=1)]
    
    # Check if passengers in cryosleep have non-zero spending
    cryosleep_spending = data[(data[cryo_col] == True) & (data[['VRDeck','ShoppingMall','Spa','RoomService','FoodCourt']] > 0).any(axis=1)]

    # Check if any VIPs are in cryosleep
    vip_in_cryosleep = data[(data['VIP'] == True) & (data[cryo_col] == True)]

    #Check if non-vip single travellers without expenses are in cryosleep
    single_travellers = data[(data['GroupSize'] == 1) & (data['VIP'] == False) & (data[['VRDeck','ShoppingMall','Spa','RoomService','FoodCourt']] == 0).all(axis=1) & (data[cryo_col] == False)]

    #Check if single travellers without expenses not in cryosleep are VIPs
    no_expensesvip = data[(data['GroupSize'] == 1) & (data[cryo_col] == False) & (data['Expenses?']==True)]

    
    expenseless_vips = data[(data['VIP'] == False) & (data[cryo_col] == False)& (data['GroupSize'] == 1) & (data['Expenses?']==True)]
    
    return {
        'Under 13 Spending': under_13_spending.shape[0],
        'Cryosleep Spending': cryosleep_spending.shape[0],
        'VIPs in Cryosleep': vip_in_cryosleep.shape[0],
        'Single Travellers in Cryosleep': single_travellers.shape[0],
        'SingleVIPS without Expenses': no_expensesvip.shape[0],
        'non VIPs without Expenses': expenseless_vips.shape[0]
        
    }

# Run the sanity checks on the combined frame and display the counts
assumption_test_results = test_assumptions(
    data,
    age_col='Age',
    cryo_col='CryoSleep',
)
assumption_test_results


{'Under 13 Spending': 0,
 'Cryosleep Spending': 0,
 'VIPs in Cryosleep': 28,
 'Single Travellers in Cryosleep': 109,
 'SingleVIPS without Expenses': 4620,
 'non VIPs without Expenses': 4418}