In [30]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [31]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
df_sub = pd.read_csv('sample_submission.csv')

In [32]:
df_train = df_train.drop(['Cabin', 'Name', 'PassengerId'], axis=1)
df_test = df_test.drop(['Cabin', 'Name', 'PassengerId'], axis=1)

In [33]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    8492 non-null   object 
 1   CryoSleep     8476 non-null   object 
 2   Destination   8511 non-null   object 
 3   Age           8514 non-null   float64
 4   VIP           8490 non-null   object 
 5   RoomService   8512 non-null   float64
 6   FoodCourt     8510 non-null   float64
 7   ShoppingMall  8485 non-null   float64
 8   Spa           8510 non-null   float64
 9   VRDeck        8505 non-null   float64
 10  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(4)
memory usage: 687.8+ KB


In [34]:
df_train['HomePlanet'].value_counts()
df_train['HomePlanet'].isnull().sum()

201

In [35]:
df_train['HomePlanet'].loc[:5000] = df_train['HomePlanet'].loc[:5000].fillna('Earth')
df_train['HomePlanet'].loc[5000:7000] = df_train['HomePlanet'].loc[5000:7000].fillna('Europa')
df_train['HomePlanet']= df_train['HomePlanet'].fillna('Mars')

df_test['HomePlanet'].loc[:5000] = df_test['HomePlanet'].loc[:5000].fillna('Earth')
df_test['HomePlanet'].loc[5000:7000] = df_test['HomePlanet'].loc[5000:7000].fillna('Europa')
df_test['HomePlanet']= df_test['HomePlanet'].fillna('Mars')

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_train['HomePlanet'].loc[:5000] = df_train['HomePlanet'].loc[:5000].fillna('Earth')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-

In [36]:
df_train['CryoSleep'].value_counts()
# df_train['CryoSleep'].isnull().sum()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

In [37]:
df_train['CryoSleep'].loc[:6000].isnull().sum()
df_test['CryoSleep'].loc[:6000].isnull().sum()

93

In [38]:
df_train['CryoSleep'].loc[:6000] = df_train['CryoSleep'].loc[:6000].fillna(False)
df_train['CryoSleep'] = df_train['CryoSleep'].fillna(True)

df_test['CryoSleep'].loc[:6000] = df_test['CryoSleep'].loc[:6000].fillna(False)
df_test['CryoSleep'] = df_test['CryoSleep'].fillna(True)

  df_train['CryoSleep'].loc[:6000] = df_train['CryoSleep'].loc[:6000].fillna(False)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_train['CryoSleep'].loc[:6000] = df_train['CryoSleep'].loc[:6000].fillna(False)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: 

In [39]:
df_train['Destination'].value_counts()

Destination
TRAPPIST-1e      5915
55 Cancri e      1800
PSO J318.5-22     796
Name: count, dtype: int64

In [40]:
df_train['Destination'] = df_train['Destination'].fillna('TRAPPIST-1e')
df_test['Destination'] = df_test['Destination'].fillna('TRAPPIST-1e')

In [41]:
df_train['Age'].value_counts()

Age
24.0    324
18.0    320
21.0    311
19.0    293
23.0    292
       ... 
72.0      4
78.0      3
79.0      3
76.0      2
77.0      2
Name: count, Length: 80, dtype: int64

In [42]:
col = ['Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for i in col:
    df_train[i].fillna(df_train[i].median(), inplace=True)

for i in col:
    df_test[i].fillna(df_test[i].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[i].fillna(df_train[i].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train[i].fillna(df_train[i].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we ar

In [43]:
label = LabelEncoder()

for i in df_train.columns:
    if df_train[i].dtype == object:
        df_train[i] = label.fit_transform(df_train[i])

for i in df_test.columns:
    if df_test[i].dtype == object:
        df_test[i] = label.fit_transform(df_test[i])

In [44]:
df_test.isnull().sum()

HomePlanet      0
CryoSleep       0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
dtype: int64

In [45]:
df_train.corr()['Transported']

HomePlanet      0.111603
CryoSleep       0.455376
Destination    -0.108152
Age            -0.074233
VIP            -0.037261
RoomService    -0.241124
FoodCourt       0.045583
ShoppingMall    0.009391
Spa            -0.218545
VRDeck         -0.204874
Transported     1.000000
Name: Transported, dtype: float64

In [46]:
df_train = df_train.drop('ShoppingMall', axis=1)
df_test = df_test.drop('ShoppingMall', axis=1)

In [47]:
X = df_train.drop('Transported', axis=1)
y = df_train['Transported']

### RandomForest

In [48]:
model = RandomForestClassifier(random_state=42)

In [49]:
param = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [50]:
fool_model = GridSearchCV(model, param_grid=param, cv=5, verbose=2, scoring='accuracy')

In [51]:
fool_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [54]:
y_pred = fool_model.predict(df_test)
df_sub['Transported'] = y_pred

In [55]:
df_sub.to_csv('SpaceshipRandomForest.csv', index=False, index_label=False)

# LogisticRegression

In [48]:
model = LogisticRegression()

In [49]:
param = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2']
}

In [50]:
fool_model = GridSearchCV(model, param_grid=param, cv=5, verbose=2, scoring='accuracy')

In [51]:
fool_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [54]:
y_pred = fool_model.predict(df_test)
df_sub['Transported'] = y_pred

In [55]:
df_sub.to_csv('SpaceshipLogisticRegression.csv', index=False, index_label=False)

# GradientBoost

In [48]:
model = GradientBoostingClassifier()

In [49]:
param = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

In [50]:
fool_model = GridSearchCV(model, param_grid=param, cv=5, verbose=2, scoring='accuracy')

In [51]:
fool_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [54]:
y_pred = fool_model.predict(df_test)
df_sub['Transported'] = y_pred

In [55]:
df_sub.to_csv('SpaceshipGradientBoostingClassifier.csv', index=False, index_label=False)

# AdaBoost

In [48]:
model = AdaBoostClassifier()

In [49]:
param = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.05, 0.1, 0.2]
}

In [50]:
fool_model = GridSearchCV(model, param_grid=param, cv=5, verbose=2, scoring='accuracy')

In [51]:
fool_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [54]:
y_pred = fool_model.predict(df_test)
df_sub['Transported'] = y_pred

In [55]:
df_sub.to_csv('SpaceshipAdaBoostClassifier.csv', index=False, index_label=False)

# CatBoost

In [48]:
model = CatBoostClassifier()

In [49]:
param = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.05, 0.1, 0.2],
    'depth': [4, 6, 8],
}

In [50]:
fool_model = GridSearchCV(model, param_grid=param, cv=5, verbose=2, scoring='accuracy')

In [51]:
fool_model.fit(X, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.2s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, min_samples_leaf

In [54]:
y_pred = fool_model.predict(df_test)
df_sub['Transported'] = y_pred

In [55]:
df_sub.to_csv('SpaceshipCatBoost.csv', index=False, index_label=False)