# Imports

In [265]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import seaborn as sns

import scipy.stats as stats

from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

pd.set_option('display.max_columns', None)

# Load Data

- Load Train
- Load Test
- Combine Train and Test

In [266]:
train = pd.read_csv('../../data/original_data/train.csv')
test = pd.read_csv('../../data/original_data/test.csv')

len_train = len(train)
len_test = len(test)

train.shape, test.shape

((8693, 14), (4277, 13))

In [267]:
df = pd.concat([train, test])

## Confirm Concatenated Shape

In [268]:
df.shape[0] == test.shape[0] + train.shape[0]

True

In [269]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


# Feature Engineering

Early Feature Engineering to include:

- LastName
- SoloPassenger
- GroupSize
- CabinDeck
- CabinNum
- CabinSide

In [270]:
# Extract the Last Name to be used later
df['LastName'] = df['Name'].apply(lambda x: x.split(' ')[1] if not pd.isna(x) else np.nan)

# Create Passenger Group from gggg of PassengerId
df['PassengerGroup'] = df['PassengerId'].apply(lambda x: x.split('_')[0])

# Take Group size by number of passengers in group
group_size = df.groupby('PassengerGroup').size().reset_index(name='GroupSize')
df = df.merge(group_size, on='PassengerGroup')

# Determine if the passenger is riding solo
df['SoloPassenger'] = df['GroupSize'].apply(lambda x: 1 if x == 1 else 0)

# Split Cabin into three separate columns
df[['CabinDeck', 'CabinNum', 'CabinSide']] = df['Cabin'].str.split('/', expand=True)

# Null Values

## HomePlanet

- Check for passengers in same PassengerGroup
- Check for passengers with same surname
- Use mode to imput

In [271]:
display(f'The number of missing HomePlanet Values before imputation by PassengerGroup is: {df['HomePlanet'].isna().sum()}')

df['HomePlanet'] = df.groupby('PassengerGroup')['HomePlanet'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

display(f'The number of missing HomePlanet Values after imputation by PassengerGroup is: {df['HomePlanet'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

def impute_home_planet(row, df):
    if pd.isna(row['HomePlanet']):
        last_name = row['LastName']
        same_last_name = df[(df['LastName'] == last_name) & (df['HomePlanet'].notna())]
        if not same_last_name.empty:
            return same_last_name['HomePlanet'].values[0]
    return row['HomePlanet']


df['HomePlanet'] = df.apply(lambda row: impute_home_planet(row, df), axis=1)

display(f'The number of missing HomePlanet Values after imputation by HomePlanet and LastNAme is: {df['HomePlanet'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

df['HomePlanet'] = df['HomePlanet'].fillna(df['HomePlanet'].mode()[0])

display(f'The number of missing HomePlanet Values after imputation by mode is: {df['HomePlanet'].isna().sum()}')

'The number of missing HomePlanet Values before imputation by PassengerGroup is: 288'

  df['HomePlanet'] = df.groupby('PassengerGroup')['HomePlanet'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))


'The number of missing HomePlanet Values after imputation by PassengerGroup is: 157'

'The number of missing HomePlanet Values after imputation by HomePlanet and LastNAme is: 13'

'The number of missing HomePlanet Values after imputation by mode is: 0'

## Destination

- Check for passengers in same PassengerGroup
- Check for passengers with same surname
- Use mode to imput

In [272]:
display(f'The number of missing Destination Values before imputation by PassengerGroup is: {df['Destination'].isna().sum()}')

df['Destination'] = df.groupby('PassengerGroup')['Destination'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

display(f'The number of missing Destination Values after imputation by PassengerGroup is: {df['Destination'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

def impute_destination(row, df):
    if pd.isna(row['Destination']):
        last_name = row['LastName']
        same_last_name = df[(df['LastName'] == last_name) & (df['Destination'].notna())]
        if not same_last_name.empty:
            return same_last_name['Destination'].values[0]
    return row['Destination']


df['Destination'] = df.apply(lambda row: impute_destination(row, df), axis=1)

display(f'The number of missing Destination Values after imputation by Destination and LastName is: {df['Destination'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

df['Destination'] = df['Destination'].fillna(df['Destination'].mode()[0])

display(f'The number of missing Destination Values after imputation by mode is: {df['Destination'].isna().sum()}')

'The number of missing Destination Values before imputation by PassengerGroup is: 274'

  df['Destination'] = df.groupby('PassengerGroup')['Destination'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))


'The number of missing Destination Values after imputation by PassengerGroup is: 154'

'The number of missing Destination Values after imputation by Destination and LastName is: 6'

'The number of missing Destination Values after imputation by mode is: 0'

## Cabin

### Deck

In [273]:
display(f'The number of missing CabinDeck Values before imputation is: {df['CabinDeck'].isna().sum()}')

df['CabinDeck'] = df.groupby('PassengerGroup')['CabinDeck'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

display(f'The number of missing CabinDeck Values after imputation by PassengerGroup is: {df['CabinDeck'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

def impute_deck(row, df):
    if pd.isna(row['CabinDeck']):
        last_name = row['LastName']
        same_last_name = df[(df['LastName'] == last_name) & (df['CabinDeck'].notna())]
        if not same_last_name.empty:
            return same_last_name['CabinDeck'].values[0]
    return row['CabinDeck']


df['CabinDeck'] = df.apply(lambda row: impute_deck(row, df), axis=1)

missing_decks_after_first_imputation = df['CabinDeck'].isna().sum()

display(f'The number of missing CabinDeck values after imputation by LastName is: {missing_decks_after_first_imputation}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

if missing_decks_after_first_imputation > 0:
    mode_deck = df['CabinDeck'].mode()[0]
    df['CabinDeck'].fillna(mode_deck, inplace=True)

missing_decks_after_mode_imputation = df['CabinDeck'].isna().sum()
display(f'The number of missing CabinDeck values after imputation by mode is: {missing_decks_after_mode_imputation}')

'The number of missing CabinDeck Values before imputation is: 299'

  df['CabinDeck'] = df.groupby('PassengerGroup')['CabinDeck'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))


'The number of missing CabinDeck Values after imputation by PassengerGroup is: 162'

'The number of missing CabinDeck values after imputation by LastName is: 5'

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['CabinDeck'].fillna(mode_deck, inplace=True)


'The number of missing CabinDeck values after imputation by mode is: 0'

### Side

In [274]:
display(f'The number of missing CabinSide Values before imputation by PassengerGroup is: {df['CabinSide'].isna().sum()}')

df['CabinSide'] = df.groupby('PassengerGroup')['CabinSide'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

display(f'The number of missing CabinSide Values after imputation by PassengerGroup is: {df['CabinSide'].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

def impute_side(row, df):
    if pd.isna(row['CabinSide']):
        last_name = row['LastName']
        same_last_name = df[(df['LastName'] == last_name) & (df['CabinSide'].notna())]
        if not same_last_name.empty:
            return same_last_name['CabinSide'].values[0]
    return row['CabinSide']

df['CabinSide'] = df.apply(lambda row: impute_side(row, df), axis=1)
missing_sides_after_first_imputation = df['CabinSide'].isna().sum()

display(f'The number of missing CabinSide values after imputation imputation by LastName is: {missing_sides_after_first_imputation}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

if missing_sides_after_first_imputation > 0:
    mode_sides = df['CabinSide'].mode()[0]
    df['CabinSide'] = df['CabinSide'].fillna(mode_sides)

missing_sides_after_mode_imputation = df['CabinSide'].isna().sum()
display(f'The number of missing CabinSide values after imputation by mode is: {missing_sides_after_mode_imputation}')

'The number of missing CabinSide Values before imputation by PassengerGroup is: 299'

  df['CabinSide'] = df.groupby('PassengerGroup')['CabinSide'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))


'The number of missing CabinSide Values after imputation by PassengerGroup is: 162'

'The number of missing CabinSide values after imputation imputation by LastName is: 5'

'The number of missing CabinSide values after imputation by mode is: 0'

### Num

In [275]:
print(f'The number of missing CabinNum values before imputation is: {df["CabinNum"].isna().sum()}')

df['CabinNum'] = df.groupby('PassengerGroup')['CabinNum'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

print(f'The number of missing CabinNum values after imputation by PassengrGroupis: {df["CabinNum"].isna().sum()}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------

def impute_num(row, df):
    if pd.isna(row['CabinNum']):
        last_name = row['LastName']
        same_last_name = df[(df['LastName'] == last_name) & (df['CabinNum'].notna())]
        if not same_last_name.empty:
            return same_last_name['CabinNum'].values[0]
    return row['CabinNum']

df['CabinNum'] = df.apply(lambda row: impute_num(row, df), axis=1)

missing_num_after_first_imputation = df['CabinNum'].isna().sum()

print(f'The number of missing CabinNum values after imputation by LastName is: {missing_num_after_first_imputation}')
# -----------------------------------------------------------------------------------------------------------------------------------------------------
if missing_num_after_first_imputation > 0:
    mode_num = df['CabinNum'].mode()[0]
    df['CabinNum'] = df['CabinNum'].fillna(mode_num)

missing_num_after_mode_imputation = df['CabinNum'].isna().sum()

print(f'The number of missing CabinNum values after imputation by mode is: {missing_num_after_mode_imputation}')
df['CabinNum'] = df['CabinNum'].astype(int)

The number of missing CabinNum values before imputation is: 299


  df['CabinNum'] = df.groupby('PassengerGroup')['CabinNum'].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))


The number of missing CabinNum values after imputation by PassengrGroupis: 162
The number of missing CabinNum values after imputation by LastName is: 5
The number of missing CabinNum values after imputation by mode is: 0


## RF Imputation

- CryoSleep
- Age
- RoomService
- FoodCourt
- ShoppingMall
- VRDeck
- Spa

In [280]:
df['CryoSleep'] = df['CryoSleep'].astype('float')

rf_regressor = RandomForestRegressor(n_estimators=100, random_state=0)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=0)

imputer = IterativeImputer(estimator=rf_regressor, random_state=0)

numerical_cols = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "VRDeck", "Spa"]

df[numerical_cols] = imputer.fit_transform(df[numerical_cols])

cryo_df = pd.DataFrame(df['CryoSleep'], columns=['CryoSleep'])

imputer_categorical = IterativeImputer(estimator=rf_classifier, random_state=0)
cryo_df_imputed = imputer_categorical.fit_transform(cryo_df)

cryo_df_imputed_rounded = np.round(cryo_df_imputed)
df['CryoSleep'] = np.where(df['CryoSleep'].isna(), np.nan, cryo_df_imputed_rounded)



In [282]:
df_VIP = df[df['VIP'] == True]
df_non = df[df['VIP'] == False]

average_spent_VIP = round(np.mean(df_VIP['RoomService'] + df_VIP['FoodCourt'] + df_VIP['ShoppingMall'] + df_VIP['Spa'] + df_VIP['VRDeck']), 2)
average_spent_non = round(np.mean(df_non['RoomService'] + df_non['FoodCourt'] + df_non['ShoppingMall'] + df_non['Spa'] + df_non['VRDeck']), 2)

display(f"VIPs spent {average_spent_VIP}. On the other hand, non-VIPs spent {average_spent_non}.")

'VIPs spent 4649.59. On the other hand, non-VIPs spent 1391.3.'

In [283]:
std_spent_VIP = round(np.std(df_VIP['RoomService'] + df_VIP['FoodCourt'] + df_VIP['ShoppingMall'] + df_VIP['Spa'] + df_VIP['VRDeck']))
se = std_spent_VIP / df_VIP.shape[0]
confidence_level = 0.95
z_value = stats.norm.ppf((1 + confidence_level) / 2)
margin_of_error = round(z_value * se, 2)

display(f"The Margin of Error is {margin_of_error}")
display(f"The lowerbound is {average_spent_VIP - margin_of_error}")

'The Margin of Error is 39.21'

'The lowerbound is 4610.38'

In [284]:
mv_vip_raw = df['VIP'].isna().sum()

df['Expenditure'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)
df.loc[df['VIP'].isna(), 'VIP'] = df['Expenditure'] >= 4772.18

mv_vip_after = df['VIP'].isna().sum()

display('The number of missing values in Destination before: ', mv_vip_raw)
display('The number of missing values in Destination after cleaning: ', mv_vip_after)

'The number of missing values in Destination before: '

np.int64(296)

'The number of missing values in Destination after cleaning: '

np.int64(0)

In [285]:
df.isna().sum()

PassengerId          0
HomePlanet           0
CryoSleep            0
Cabin              299
Destination          0
Age                  0
VIP                  0
RoomService          0
FoodCourt            0
ShoppingMall         0
Spa                  0
VRDeck               0
Name               294
Transported       4277
LastName           294
PassengerGroup       0
GroupSize            0
SoloPassenger        0
CabinDeck            0
CabinNum             0
CabinSide            0
Expenditure          0
dtype: int64

# Data Transformation

## CabinNum

In [286]:
def cabin_num_categorise(cabin_num):
    if cabin_num <= 300:
        return 0
    elif cabin_num <= 600:
        return 1
    elif cabin_num <= 900:
        return 2
    elif cabin_num <= 1200:
        return 3
    elif cabin_num <= 1500:
        return 4
    elif cabin_num <= 1800:
        return 5
    else:
        return 6

df['CabinNumGroup'] = df['CabinNum'].apply(lambda x: cabin_num_categorise(x))

## Age

In [287]:
def age_categorise(age):
    if age <= 12:
        return 0
    elif age <= 17:
        return 1
    elif age <= 25:
        return 2
    elif age <= 30:
        return 3
    elif age <= 50:
        return 4
    else:
        return 5
    
df['AgeGroup'] = df['Age'].apply(lambda x: age_categorise(x))

In [288]:
df = df.drop(columns=['Cabin', 'Name', 'LastName', 'PassengerGroup', 'CabinNum', 'Age'])
df.head(3)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupSize,SoloPassenger,CabinDeck,CabinSide,Expenditure,CabinNumGroup,AgeGroup
0,0001_01,Europa,0.0,TRAPPIST-1e,False,0.0,0.0,0.0,0.0,0.0,False,1,1,B,P,0.0,0,4
1,0002_01,Earth,0.0,TRAPPIST-1e,False,109.0,9.0,25.0,549.0,44.0,True,1,1,F,S,736.0,0,2
2,0003_01,Europa,0.0,TRAPPIST-1e,True,43.0,3576.0,0.0,6715.0,49.0,False,2,0,A,S,10383.0,0,5


## Expenditures

In [289]:
numerical_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Expenditure']
df[numerical_cols] = df[numerical_cols].apply(pd.to_numeric)

# Calculate skewness before log transformation
skewness_before = df[numerical_cols].apply(lambda x: stats.skew(x.dropna()))
display("Skewness before log transformation:\n", skewness_before)

'Skewness before log transformation:\n'

RoomService      6.141872
FoodCourt        7.078508
ShoppingMall    11.019204
Spa              7.661886
VRDeck           8.101704
Expenditure      4.499639
dtype: float64

In [290]:
df = df.copy()
df[numerical_cols] = df[numerical_cols].apply(lambda x: np.log1p(x))

skewness_after = df[numerical_cols].apply(lambda x: stats.skew(x.dropna()))
print("Skewness after log transformation:\n", skewness_after)

Skewness after log transformation:
 RoomService     1.066458
FoodCourt       1.067053
ShoppingMall    1.142056
Spa             1.054067
VRDeck          1.145908
Expenditure    -0.262186
dtype: float64


# Encode and Categorise

In [291]:
catcols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP', 'CabinDeck', 'SoloPassenger', 'CabinSide', 'CabinNumGroup', 'AgeGroup']

ohe = OneHotEncoder(categories='auto')

array_hot_encoded = ohe.fit_transform(df[catcols]).toarray()

# Get the feature names for the one-hot encoded columns
feature_names = ohe.get_feature_names_out(catcols)

# Create a DataFrame from the one-hot encoded array
data_hot_encoded = pd.DataFrame(array_hot_encoded, index=df.index, columns=feature_names)

# Drop the original categorical columns
data_other_cols = df.drop(columns=catcols)

# Concatenate the one-hot encoded columns with the rest of the DataFrame
df = pd.concat([data_hot_encoded, data_other_cols], axis=1)

In [292]:
df.head()

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_0.0,CryoSleep_1.0,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,SoloPassenger_0,SoloPassenger_1,CabinSide_P,CabinSide_S,CabinNumGroup_0,CabinNumGroup_1,CabinNumGroup_2,CabinNumGroup_3,CabinNumGroup_4,CabinNumGroup_5,CabinNumGroup_6,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4,AgeGroup_5,PassengerId,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,GroupSize,Expenditure
0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0001_01,0.0,0.0,0.0,0.0,0.0,False,1,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0002_01,4.70048,2.302585,3.258097,6.309918,3.806662,True,1,6.602588
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0003_01,3.78419,8.18228,0.0,8.812248,3.912023,False,2,9.248021
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0003_02,0.0,7.157735,5.918894,8.110728,5.267858,False,2,8.551981
4,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0004_01,5.717028,4.26268,5.023881,6.338594,1.098612,True,1,6.995766


# Divide Into Train and Test

In [293]:
train_preprocessed = df[:len_train].copy()
test_preprocessed = df[len_train:].copy()

train_preprocessed.reset_index(drop=True, inplace=True)
test_preprocessed.reset_index(drop=True, inplace=True)

test_preprocessed.drop(columns=['Transported'], inplace=True)

len(test) == len(test_preprocessed)
test_preprocessed.head(3)

Unnamed: 0,HomePlanet_Earth,HomePlanet_Europa,HomePlanet_Mars,CryoSleep_0.0,CryoSleep_1.0,Destination_55 Cancri e,Destination_PSO J318.5-22,Destination_TRAPPIST-1e,VIP_False,VIP_True,CabinDeck_A,CabinDeck_B,CabinDeck_C,CabinDeck_D,CabinDeck_E,CabinDeck_F,CabinDeck_G,CabinDeck_T,SoloPassenger_0,SoloPassenger_1,CabinSide_P,CabinSide_S,CabinNumGroup_0,CabinNumGroup_1,CabinNumGroup_2,CabinNumGroup_3,CabinNumGroup_4,CabinNumGroup_5,CabinNumGroup_6,AgeGroup_0,AgeGroup_1,AgeGroup_2,AgeGroup_3,AgeGroup_4,AgeGroup_5,PassengerId,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,GroupSize,Expenditure
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0013_01,0.0,0.0,0.0,0.0,0.0,1,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0018_01,0.0,2.302585,0.0,7.94591,0.0,1,7.949091
2,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0019_01,0.0,0.0,0.0,0.0,0.0,1,0.0


# Save New DFs

In [294]:
train_preprocessed.to_csv('../../data/preproc_data/train_2_1.csv', index=False)
test_preprocessed.to_csv('../../data/preproc_data/test_2_1.csv', index=False)