# Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression

## Load data 

In [2]:
# Read the training and test splits, in that order, from the local data/ folder.
df_train, df_test = (
  pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

## Total spending across RoomService, FoodCourt, ShoppingMall, Spa, and VRDeck

### Fill missing values in CryoSleep, Destination, and the spending columns

In [3]:
# Names of the spending columns; the spending helpers in this cell read this list.
exp_features = [
  "RoomService",
  "FoodCourt",
  "ShoppingMall",
  "Spa",
  "VRDeck",
]

def split_passenger_id(df: pd.DataFrame):
  """Derive group features from ``PassengerId`` and attach them to ``df``.

  The text before the first underscore of ``PassengerId`` is parsed as an
  integer group number.

  Columns written (in place, on the frame that was passed in):
    * ``Group``      - integer group number.
    * ``Group_size`` - number of rows in ``df`` that share the group number.
    * ``Solo``       - 1 when ``Group_size`` equals 1, otherwise 0.

  Args:
    df: Frame with a string ``PassengerId`` column.

  Returns:
    The same ``df`` object with the three columns added.
  """
  df['Group'] = df['PassengerId'].apply(lambda pid: int(pid.split('_')[0]))
  # Count every group once and look the counts up, rather than rebuilding
  # value_counts() for each row.
  group_counts = df['Group'].value_counts()
  df['Group_size'] = df['Group'].map(group_counts)
  df['Solo'] = (df['Group_size'] == 1).astype(int)
  return df

def split_cabin(df: pd.DataFrame):
  """Split ``Cabin`` into ``Cabin_Deck`` / ``Cabin_Side`` and impute the gaps.

  ``Cabin`` is split on ``/``; the first piece becomes ``Cabin_Deck`` and the
  last piece becomes ``Cabin_Side``. Missing cabins are imputed in stages:

  Cabin_Deck
    1. Most frequent deck among rows with ``Group_size > 1`` in the same
       ``Group``.
    2. Most frequent deck among rows with the same
       (``HomePlanet``, ``Destination``, ``Solo``) combination.
    3. The placeholder ``'Z'``.

  Cabin_Side
    1. Most frequent side among rows with ``Group_size > 1`` in the same
       ``Group``.
    2. Most frequent side among rows with ``Group_size > 1`` that carry the
       same ``Surname``.
    3. The placeholder ``'Z'``.

  Rows are matched by index label, so the frame does not need a default
  ``RangeIndex``.

  Args:
    df: Frame containing ``Cabin``, ``Group``, ``Group_size``, ``Solo``,
      ``Surname``, ``HomePlanet`` and ``Destination``.

  Returns:
    A new frame without the ``Cabin`` column and with fully populated
    ``Cabin_Deck`` and ``Cabin_Side`` columns. Note that the ``Cabin`` fill
    and the raw (un-imputed) deck/side columns are also written to the frame
    that was passed in, because they happen before ``drop`` creates the copy.
  """

  def _majority(rows, key, column):
    """Map each ``key`` value to its most frequent ``column`` value (or None)."""
    pair_counts = rows.groupby([key, column])[column].size()
    if pair_counts.empty:
      return None
    # Computed once per lookup instead of once per row being filled.
    return pair_counts.unstack(fill_value=0).idxmax(axis=1)

  def _fill(frame, column, key, lookup):
    """Fill missing ``column`` values in ``frame`` from a ``key`` lookup."""
    if lookup is None:
      return
    targets = frame[column].isna() & frame[key].isin(lookup.index)
    # Label-based selection on both sides keeps rows aligned for any index.
    frame.loc[targets, column] = frame.loc[targets, key].map(lookup)

  def _fill_with_mode(values):
    """Fill gaps with the group's first mode; leave all-missing groups as is."""
    modes = values.mode()
    if modes.empty:
      return values
    return values.fillna(modes.iloc[0])

  # 'Z' marks a missing cabin while splitting and is turned back into NaN
  # right after, so the imputation stages below can detect the gaps.
  df['Cabin'] = df['Cabin'].fillna('Z/9999/Z')
  cabin_parts = df['Cabin'].astype(str).str.split('/')
  df['Cabin_Deck'] = cabin_parts.str[0]
  df['Cabin_Side'] = cabin_parts.str[-1]
  df.loc[df['Cabin_Deck'] == 'Z', 'Cabin_Deck'] = np.nan
  df.loc[df['Cabin_Side'] == 'Z', 'Cabin_Side'] = np.nan

  df = df.drop(['Cabin'], axis=1)

  # Both group lookups are built from the un-imputed values.
  shared_groups = df[df['Group_size'] > 1]
  deck_by_group = _majority(shared_groups, 'Group', 'Cabin_Deck')
  side_by_group = _majority(shared_groups, 'Group', 'Cabin_Side')

  # Cabin_Deck
  _fill(df, 'Cabin_Deck', 'Group', deck_by_group)

  deck_missing = df['Cabin_Deck'].isna()
  deck_by_profile = df.groupby(['HomePlanet', 'Destination', 'Solo'])['Cabin_Deck'].transform(_fill_with_mode)
  missing_labels = df.index[deck_missing.to_numpy()]
  # reindex() yields NaN for any label the transform did not return.
  df.loc[missing_labels, 'Cabin_Deck'] = deck_by_profile.reindex(missing_labels)

  df.loc[df['Cabin_Deck'].isna(), 'Cabin_Deck'] = 'Z'

  # Cabin_Side
  _fill(df, 'Cabin_Side', 'Group', side_by_group)

  # The surname lookup is built after the group fill, so it sees those values.
  side_by_surname = _majority(df[df['Group_size'] > 1], 'Surname', 'Cabin_Side')
  _fill(df, 'Cabin_Side', 'Surname', side_by_surname)

  df.loc[df['Cabin_Side'].isna(), 'Cabin_Side'] = 'Z'
  return df

def split_name(df: pd.DataFrame):
  """Extract ``Surname`` and ``Family_size`` from ``Name``.

  The last whitespace-separated token of ``Name`` is taken as the surname.
  ``Family_size`` is the number of rows in ``df`` sharing that surname.

  Rows without a name get ``NaN`` for both ``Surname`` and ``Family_size``.
  The unknown rows are identified directly rather than through the size of
  their placeholder bucket, so the result does not depend on how many
  unnamed rows the frame happens to contain. Any surname shared by more
  than 100 rows also gets ``NaN`` as its ``Family_size`` (threshold kept
  from the previous implementation).

  Args:
    df: Frame with a ``Name`` column. The ``Name`` fill and the new columns
      are first written to this frame.

  Returns:
    A new frame without the ``Name`` column.
  """
  df['Name'] = df['Name'].fillna('Unknown Unknown')

  df['Surname'] = df['Name'].str.split().str[-1]
  # One value_counts() pass instead of one per row.
  surname_counts = df['Surname'].value_counts()
  family_size = df['Surname'].map(surname_counts)

  unknown = df['Surname'] == 'Unknown'
  df.loc[unknown, 'Surname'] = np.nan
  # mask() blanks the selected rows without an in-place int -> NaN assignment.
  df['Family_size'] = family_size.mask(unknown | (family_size > 100))

  df = df.drop(['Name'], axis=1)
  return df

def add_total_spending(data: pd.DataFrame):
  """Attach ``TotalSpending`` and ``NoSpending`` to ``data`` in place.

  ``TotalSpending`` is the row-wise sum of the columns named in
  ``exp_features``; ``NoSpending`` is 1 where that sum equals 0, else 0.

  Returns:
    The same ``data`` object.
  """
  row_total = data[exp_features].sum(axis=1)
  data["TotalSpending"] = row_total
  data['NoSpending'] = data['TotalSpending'].eq(0).astype(int)
  return data

def add_missing_cryo(df: pd.DataFrame):
  """Fill missing ``CryoSleep`` values from ``TotalSpending``, in place.

  A missing value becomes ``False`` when ``TotalSpending`` is above zero
  and ``True`` when ``TotalSpending`` equals zero.

  Returns:
    The same ``df`` object.
  """
  cryo_unknown = df['CryoSleep'].isna()
  has_spent = df['TotalSpending'] > 0
  spent_nothing = df['TotalSpending'] == 0

  df.loc[cryo_unknown & has_spent, 'CryoSleep'] = False
  df.loc[cryo_unknown & spent_nothing, 'CryoSleep'] = True
  return df

def add_missing_spending(df: pd.DataFrame):
  """Fill gaps in every ``exp_features`` column, in place.

  For each spending column, a missing value becomes 0 when ``CryoSleep`` is
  True, and the column median when ``CryoSleep`` is False. The median is
  taken after the zero fill. Rows whose ``CryoSleep`` is itself missing are
  not touched.

  Returns:
    The same ``df`` object.
  """
  in_cryo = df['CryoSleep'] == True
  awake = df['CryoSleep'] == False

  for spend_col in exp_features:
    gap = df[spend_col].isna()
    df.loc[gap & in_cryo, spend_col] = 0
    # Evaluated only now so the zeros written just above are included.
    column_median = df[spend_col].median()
    df.loc[gap & awake, spend_col] = column_median
  return df

def add_missing_destination(df: pd.DataFrame):
  """Replace missing ``Destination`` values with ``'TRAPPIST-1e'``, in place.

  Returns:
    The same ``df`` object.
  """
  no_destination = df['Destination'].isna()
  df.loc[no_destination, 'Destination'] = 'TRAPPIST-1e'
  return df

def add_missing_homeplanet(df: pd.DataFrame):
  """Impute missing ``HomePlanet`` values in place, in four stages.

  1. Most frequent known planet within the same ``Group``.
  2. Hard-coded deck rules: decks A, B, C, T -> ``'Europa'``; deck G ->
     ``'Earth'``.
  3. Most frequent known planet among rows with the same ``Surname``
     (computed after stages 1 and 2, so it sees their results).
  4. Whatever is still missing becomes ``'Mars'`` on deck D and ``'Earth'``
     everywhere else.

  Rows are matched by index label, so the frame does not need a default
  ``RangeIndex``.

  Args:
    df: Frame containing ``HomePlanet``, ``Group``, ``Surname`` and
      ``Cabin_Deck``.

  Returns:
    The same ``df`` object.
  """

  def _fill_from_majority(key):
    """Fill missing planets with the most frequent planet for ``key``."""
    pair_counts = df.groupby([key, 'HomePlanet'])['HomePlanet'].size()
    if pair_counts.empty:
      return
    # Built once per stage instead of once per row being filled.
    planet_by_key = pair_counts.unstack(fill_value=0).idxmax(axis=1)
    targets = df['HomePlanet'].isna() & df[key].isin(planet_by_key.index)
    # Label-based selection on both sides keeps rows aligned for any index.
    df.loc[targets, 'HomePlanet'] = df.loc[targets, key].map(planet_by_key)

  _fill_from_majority('Group')

  df.loc[(df['HomePlanet'].isna()) & (df['Cabin_Deck'].isin(['A', 'B', 'C', 'T'])), 'HomePlanet'] = 'Europa'
  df.loc[(df['HomePlanet'].isna()) & (df['Cabin_Deck'].isin(['G'])), 'HomePlanet'] = 'Earth'

  _fill_from_majority('Surname')

  on_deck_d = df['Cabin_Deck'] == 'D'
  still_missing = df['HomePlanet'].isna()
  df.loc[still_missing & ~on_deck_d, 'HomePlanet'] = 'Earth'
  df.loc[still_missing & on_deck_d, 'HomePlanet'] = 'Mars'
  return df

def add_group_size(df: pd.DataFrame):
  """Fill missing surnames from travel groups and recompute ``Family_size``.

  A missing ``Surname`` is replaced by the most frequent surname among rows
  with ``Group_size > 1`` in the same ``Group``. ``Family_size`` is then
  recomputed as the number of rows sharing each surname.

  Rows whose surname is still unknown (missing, or the literal placeholder
  ``'Unknown'``) end up with ``Surname = NaN`` and ``Family_size = 0``.
  Unknown rows are identified directly, so the result does not depend on
  how many of them the frame contains. Any surname shared by more than 100
  rows also gets ``Family_size = 0`` (threshold kept from the previous
  implementation).

  Rows are matched by index label, so the frame does not need a default
  ``RangeIndex``.

  Args:
    df: Frame containing ``Group``, ``Group_size`` and ``Surname``.

  Returns:
    The same ``df`` object.
  """
  shared_groups = df[df['Group_size'] > 1]
  pair_counts = shared_groups.groupby(['Group', 'Surname'])['Surname'].size()

  if not pair_counts.empty:
    # Built once instead of once per row being filled.
    surname_by_group = pair_counts.unstack(fill_value=0).idxmax(axis=1)
    targets = df['Surname'].isna() & df['Group'].isin(surname_by_group.index)
    df.loc[targets, 'Surname'] = df.loc[targets, 'Group'].map(surname_by_group)

  unknown = df['Surname'].isna() | (df['Surname'] == 'Unknown')
  df.loc[unknown, 'Surname'] = np.nan

  # value_counts() skips NaN, so unknown rows map to NaN and become 0 below.
  surname_counts = df['Surname'].value_counts()
  family_size = df['Surname'].map(surname_counts).fillna(0).astype(int)
  df['Family_size'] = family_size.mask(family_size > 100, 0)
  return df

def add_missing_age(df: pd.DataFrame):
  """Fill missing ``Age`` values with a group median, in place.

  Groups are defined by (``HomePlanet``, ``NoSpending``, ``Solo``,
  ``Cabin_Deck``). A missing age is replaced by the median age of its group.
  When the group has no known age, the value stays missing.

  Args:
    df: Frame containing ``Age`` and the four grouping columns.

  Returns:
    The same ``df`` object.
  """
  group_keys = ['HomePlanet', 'NoSpending', 'Solo', 'Cabin_Deck']
  # A single built-in median transform; it is index-aligned with df.
  group_median = df.groupby(group_keys)['Age'].transform('median')

  age_missing = df['Age'].isna()
  df.loc[age_missing, 'Age'] = group_median[age_missing]
  return df

def add_missing_vip(df: pd.DataFrame):
  """Replace missing ``VIP`` values with ``False``, in place.

  Returns:
    The same ``df`` object.
  """
  vip_unknown = df['VIP'].isna()
  df.loc[vip_unknown, 'VIP'] = False
  return df

## Clean data

In [None]:
def clean(data: pd.DataFrame):
  """Run the full feature-engineering and imputation pipeline on a raw frame.

  The steps run in this order: passenger-id split, name split, cabin split,
  surname/family-size fill, spending totals, then the CryoSleep, spending,
  Destination, HomePlanet and VIP imputations. The order matters: later
  helpers read columns written by earlier ones.

  The pipeline works on a copy, so the frame passed in is left untouched and
  the function can be re-run on the same raw frame.

  NOTE: ``add_missing_age`` is not part of the pipeline, so missing ``Age``
  values are passed through unchanged.

  Args:
    data: Raw passenger frame as loaded from CSV.

  Returns:
    A new frame without the ``Surname``, ``Group``, ``Group_size`` and
    ``PassengerId`` helper columns.
  """
  pipeline = (
    split_passenger_id,
    split_name,
    split_cabin,
    add_group_size,
    add_total_spending,
    add_missing_cryo,
    add_missing_spending,
    add_missing_destination,
    add_missing_homeplanet,
    add_missing_vip,
  )

  # The helpers write columns in place; copy first to protect the caller's frame.
  cleaned = data.copy()
  for step in pipeline:
    cleaned = step(cleaned)

  return cleaned.drop(["Surname", "Group", "Group_size", "PassengerId"], axis=1)

# Run the cleaning pipeline on the training frame first, then on the test frame.
df_train_cleaned, df_test_cleaned = (
  clean(raw_frame) for raw_frame in (df_train, df_test)
)

## Training model

In [None]:
y = df_train_cleaned['Transported'].astype('bool')

# Every model input is listed exactly once. Listing a column twice would put
# duplicate columns into the design matrix.
features = [
  "CryoSleep", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck",
  "TotalSpending", "HomePlanet", "Destination", "Age", "Cabin_Deck",
  "Family_size", "Solo", "Cabin_Side", "NoSpending", "VIP",
]

X = pd.get_dummies(df_train_cleaned[features])
# The test frame is encoded on its own, so a category seen in only one split
# would produce different columns. Force the training layout: columns missing
# from the test split are added as all-zero, extra ones are dropped.
X_test = pd.get_dummies(df_test_cleaned[features]).reindex(columns=X.columns, fill_value=0)

# NOTE(review): Age is not imputed by clean(); whether the estimator accepts
# NaN inputs depends on the installed scikit-learn version - confirm.
model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=42)
model.fit(X, y)
predictions = model.predict(X_test)

# PassengerId is read from the raw test frame; the cleaned frame no longer has it.
output = pd.DataFrame({'PassengerId': df_test['PassengerId'], 'Transported': predictions})
output.to_csv('submission.csv', index=False)