In [383]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

pd.set_option('future.no_silent_downcasting', True)

In [384]:
df_train = pd.read_csv("train.csv")
df_train_y = df_train["Transported"]
df_train.drop(["Transported"], axis=1, inplace=True)
df_test = pd.read_csv("test.csv")
df_test_y = pd.read_csv("sample_submission.csv")

In [385]:
def get_info(df):
    print("Total rows:", len(df_train))
    for col in df:
        d = df[col]
        c = d.unique()
        print(f"{col}: nan={d.isna().sum()}, mode={d.mode()[0]}, unique_c={len(c)}, unique={c}")

In [386]:
df_train.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines


In [387]:
get_info(df_train)

Total rows: 8693
PassengerId: nan=0, mode=0001_01, unique_c=8693, unique=['0001_01' '0002_01' '0003_01' ... '9279_01' '9280_01' '9280_02']
HomePlanet: nan=201, mode=Earth, unique_c=4, unique=['Europa' 'Earth' 'Mars' nan]
CryoSleep: nan=217, mode=False, unique_c=3, unique=[False True nan]
Cabin: nan=199, mode=G/734/S, unique_c=6561, unique=['B/0/P' 'F/0/S' 'A/0/S' ... 'G/1499/S' 'G/1500/S' 'E/608/S']
Destination: nan=182, mode=TRAPPIST-1e, unique_c=4, unique=['TRAPPIST-1e' 'PSO J318.5-22' '55 Cancri e' nan]
Age: nan=179, mode=24.0, unique_c=81, unique=[39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
VIP: nan=203, mode=False, unique_c=3, unique=[False True nan]
RoomService: nan=181, mode=0.0, unique_c=1274, uniq

In [388]:
def make1(df):
    df = df.copy()
    df["Spent"] = df[["RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]].sum(axis=1)

    df = df.join(
        df["PassengerId"].str.split("_", expand=True).rename(
            columns={0:"GroupID", 1:"PassID"}
        )
    )

    df["GroupID"] = df["GroupID"].astype(int)

    df = df.join(
        df["Cabin"].str.split("/", expand=True).rename(
            columns={0:"CabinDeck", 1:"CabinNum", 2:"CabinSide"}
        )
    )

    # df = df.join(
    #     df["Name"].str.split(expand=True).rename(
    #         columns={0:"FirstName", 1:"LastName"}
    #     )
    # )

    df = df.drop(["PassengerId", "PassID", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck", "Cabin", "CabinNum", "Name", "Destination"], axis=1)

    return df

df_train_1 = make1(df_train)
df_train_1.head()

Unnamed: 0,HomePlanet,CryoSleep,Age,VIP,Spent,GroupID,CabinDeck,CabinSide
0,Europa,False,39.0,False,0.0,1,B,P
1,Earth,False,24.0,False,736.0,2,F,S
2,Europa,False,58.0,True,10383.0,3,A,S
3,Europa,False,33.0,False,5176.0,3,A,S
4,Earth,False,16.0,False,1091.0,4,F,S


In [389]:
get_info(df_train_1)

Total rows: 8693
HomePlanet: nan=201, mode=Earth, unique_c=4, unique=['Europa' 'Earth' 'Mars' nan]
CryoSleep: nan=217, mode=False, unique_c=3, unique=[False True nan]
Age: nan=179, mode=24.0, unique_c=81, unique=[39. 24. 58. 33. 16. 44. 26. 28. 35. 14. 34. 45. 32. 48. 31. 27.  0.  1.
 49. 29. 10.  7. 21. 62. 15. 43. 47.  2. 20. 23. 30. 17. 55.  4. 19. 56.
 nan 25. 38. 36. 22. 18. 42. 37. 13.  8. 40.  3. 54.  9.  6. 64. 67. 61.
 50. 41. 57. 11. 52. 51. 46. 60. 63. 59.  5. 79. 68. 74. 12. 53. 65. 71.
 75. 70. 76. 78. 73. 66. 69. 72. 77.]
VIP: nan=203, mode=False, unique_c=3, unique=[False True nan]
Spent: nan=0, mode=0.0, unique_c=2336, unique=[    0.   736. 10383. ...  8803.  8536.  4637.]
GroupID: nan=0, mode=984, unique_c=6217, unique=[   1    2    3 ... 9278 9279 9280]
CabinDeck: nan=199, mode=F, unique_c=9, unique=['B' 'F' 'A' 'G' nan 'E' 'D' 'C' 'T']
CabinSide: nan=199, mode=S, unique_c=3, unique=['P' 'S' nan]


In [None]:
def make2(df):
    df = df.copy()
    groups = df["GroupID"].value_counts()

    for g, count in groups.items():
        g_ind = df["GroupID"] == g
        group = df[g_ind]

        check = group.isna().any().any()
        if (not check):
            continue

        for col in ["HomePlanet", "CryoSleep", "VIP", "CabinDeck", "CabinSide"]:
            if ( (isna := group[col].isna()).sum()):
                mode = group[col].mode()
                if count > 1 and any(mode):
                    mode = mode[0]
                else:
                    mode = df[col].mode()[0]

                df.loc[g_ind & isna, col] = mode

        for col in ["Age", "Spent"]:
            if ( (isna := group[col].isna()).sum()):
                mean = group[col].mean()
                if np.isnan(mean):
                    mean = df[col].mean()
                df.loc[g_ind & isna, col] = mean

                # if not df.loc[g_ind & isna, col].any():
                #     print(group)
                #     print(df.loc[g_ind & isna, col], mean, not mean, mean == np.nan, np.isnan(mean))
            
        # print(df[g_ind])

    return df

df_train_2 = make2(df_train_1)

In [419]:
get_info(df_train_2)

Total rows: 8693
HomePlanet: nan=0, mode=Earth, unique_c=3, unique=['Europa' 'Earth' 'Mars']
CryoSleep: nan=0, mode=False, unique_c=2, unique=[False True]
Age: nan=0, mode=24.0, unique_c=110, unique=[39.         24.         58.         33.         16.         44.
 26.         28.         35.         14.         34.         45.
 32.         48.         31.         27.          0.          1.
 49.         29.         10.          7.         21.         62.
 15.         43.         47.          2.         20.         23.
 30.         17.         55.          4.         19.         56.
 28.80281942 25.         38.         36.         22.         18.
 42.         37.         13.          8.         40.          3.
 54.          9.          6.         64.         67.         61.
 50.         41.         57.         11.         52.         51.
 46.         60.         63.         36.33333333 59.          5.
 28.80281942 79.         68.         74.         12.         53.
 38.25       65.     

In [392]:
df = df_train_2.copy()

def make3(df):
    df = df.copy()
    df = df.join(
            [
                pd.get_dummies(df["HomePlanet"]).astype(int),
                pd.get_dummies(df["CabinDeck"]).astype(int),
                pd.get_dummies(df["CabinSide"]).astype(int)
            ]
        )

    df = df.drop(["HomePlanet", "CabinDeck", "CabinSide"], axis=1)

    df[["CryoSleep", "VIP"]] = df[["CryoSleep", "VIP"]].astype(int)

    return df

df_train_3 = make3(df_train_2)
df_train_3

Unnamed: 0,CryoSleep,Age,VIP,Spent,GroupID,Earth,Europa,Mars,A,B,C,D,E,F,G,T,P,S
0,0,39.0,0,0.0,1,0,1,0,0,1,0,0,0,0,0,0,1,0
1,0,24.0,0,736.0,2,1,0,0,0,0,0,0,0,1,0,0,0,1
2,0,58.0,1,10383.0,3,0,1,0,1,0,0,0,0,0,0,0,0,1
3,0,33.0,0,5176.0,3,0,1,0,1,0,0,0,0,0,0,0,0,1
4,0,16.0,0,1091.0,4,1,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,0,41.0,1,8536.0,9276,0,1,0,1,0,0,0,0,0,0,0,1,0
8689,1,18.0,0,0.0,9278,1,0,0,0,0,0,0,0,0,1,0,0,1
8690,0,26.0,0,1873.0,9279,1,0,0,0,0,0,0,0,0,1,0,0,1
8691,0,32.0,0,4637.0,9280,0,1,0,0,0,0,0,1,0,0,0,0,1


In [393]:
from sklearn.ensemble import RandomForestClassifier

rndf = RandomForestClassifier(max_depth=20)

rndf.fit(df_train_3, df_train_y)

In [394]:
rndf.score(df_train_3, df_train_y)

0.9887265616012884

In [395]:
from sklearn.ensemble import GradientBoostingClassifier

xgb = GradientBoostingClassifier(n_estimators=500, learning_rate=1.0)
xgb.fit(df_train_3, df_train_y)

In [396]:
xgb.score(df_train_3, df_train_y)

0.9654894742896584

In [420]:
df_test_3 = make3(make2(make1(df_test)))
df_test_3

Unnamed: 0,CryoSleep,Age,VIP,Spent,GroupID,Earth,Europa,Mars,A,B,C,D,E,F,G,T,P,S
0,1,27.000000,0,0.0,13,1,0,0,0,0,0,0,0,0,1,0,0,1
1,0,19.000000,0,2832.0,18,1,0,0,0,0,0,0,0,1,0,0,0,1
2,1,31.000000,0,0.0,19,0,1,0,0,0,1,0,0,0,0,0,0,1
3,0,38.000000,0,7418.0,21,0,1,0,0,0,1,0,0,0,0,0,0,1
4,0,20.000000,0,645.0,23,1,0,0,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,1,34.000000,0,0.0,9266,1,0,0,0,0,0,0,0,0,1,0,0,1
4273,0,42.000000,0,1018.0,9269,1,0,0,0,0,0,0,0,1,0,0,0,1
4274,1,28.621492,0,0.0,9271,0,0,1,0,0,0,1,0,0,0,0,1,0
4275,0,28.621492,0,3203.0,9273,0,1,0,0,0,0,1,0,0,0,0,1,0


In [421]:
get_info( df_test_3 )

Total rows: 8693
CryoSleep: nan=0, mode=0, unique_c=2, unique=[1 0]
Age: nan=0, mode=18.0, unique_c=93, unique=[27.         19.         31.         38.         20.         21.
 23.         24.         45.         44.         46.         29.
 40.         30.         14.         66.         36.         18.
 26.         48.          6.         33.          1.         17.
 34.          5.         22.         16.          7.          2.
  0.         56.         39.         28.62149216 35.          8.
  3.          9.         25.         42.         50.         41.
 32.         49.         55.         60.         47.         58.
 28.         12.         52.         15.         29.42857143 43.
 54.         70.         37.         59.         61.         63.
 39.33333333 53.          4.         10.         65.         13.
 51.         79.         19.5        28.62149216 74.         67.
 69.         57.         72.         75.         11.         16.5
 73.         71.         28.61312348 64.   

In [422]:
pred_f = rndf.predict(df_test_3)
pred_x = xgb.predict(df_test_3)

accuracy_score(pred_f, pred_x)

0.8036006546644845