In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
# Read train.csv from the working directory and preview the first rows.
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Count the missing values in each column.
df.isna().sum()

PassengerId       0
HomePlanet      201
CryoSleep       217
Cabin           199
Destination     182
Age             179
VIP             203
RoomService     181
FoodCourt       183
ShoppingMall    208
Spa             183
VRDeck          188
Name            200
Transported       0
dtype: int64

In [None]:
# (rows, columns) of the frame as loaded.
df.shape

(8693, 14)

In [None]:
# Keep only the rows that have a value in every column, then report the new size.
df_nonan = df.dropna()
df_nonan.shape

(6606, 14)

In [None]:
# Preview the rows that survived the missing-value filter.
df_nonan.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [None]:
# Distinct HomePlanet values among the complete rows.
df_nonan['HomePlanet'].unique()

array(['Europa', 'Earth', 'Mars'], dtype=object)

In [None]:
# Distinct Destination values among the complete rows.
df_nonan['Destination'].unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e'], dtype=object)

### Label encoder

In [None]:
# Work on a copy of the complete rows without the PassengerId and Name columns.
labelencoder = LabelEncoder()
df_le = df_nonan.copy().drop(columns=['PassengerId', 'Name'])

# Replace each of these columns with integer codes. The one shared encoder is
# refit for every column, so afterwards it only holds the classes of the last one.
for column in ('HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP'):
    df_le[column] = labelencoder.fit_transform(df_le[column])

In [None]:
# Summarise the dtypes and non-null counts of the encoded frame.
df_le.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6606 entries, 0 to 8692
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   HomePlanet    6606 non-null   int64  
 1   CryoSleep     6606 non-null   int64  
 2   Cabin         6606 non-null   int64  
 3   Destination   6606 non-null   int64  
 4   Age           6606 non-null   float64
 5   VIP           6606 non-null   int64  
 6   RoomService   6606 non-null   float64
 7   FoodCourt     6606 non-null   float64
 8   ShoppingMall  6606 non-null   float64
 9   Spa           6606 non-null   float64
 10  VRDeck        6606 non-null   float64
 11  Transported   6606 non-null   bool   
dtypes: bool(1), float64(6), int64(5)
memory usage: 625.8 KB


In [None]:
# Display the label-encoded frame.
df_le

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,1,0,137,2,39.0,0,0.0,0.0,0.0,0.0,0.0,False
1,0,0,1823,2,24.0,0,109.0,9.0,25.0,549.0,44.0,True
2,1,0,1,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0,False
3,1,0,1,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0,False
4,0,0,1825,2,16.0,0,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,1,0,134,0,41.0,1,0.0,6819.0,0.0,1643.0,74.0,False
8689,0,1,4293,1,18.0,0,0.0,0.0,0.0,0.0,0.0,False
8690,0,0,4298,2,26.0,0,0.0,0.0,1872.0,1.0,0.0,True
8691,1,0,1778,0,32.0,0,0.0,1049.0,0.0,353.0,3235.0,False


In [None]:
# Features are every column except the last one; the target is the last column.
df_le_X = df_le[df_le.columns[:-1]].copy()
df_le_y = df_le[df_le.columns[-1]].copy()

In [None]:
# Second feature set: identical to df_le_X but without the encoded Cabin column.
df_le_X_nof = df_le_X.copy().drop(columns='Cabin')
df_le_y_nof = df_le_y.copy()

In [None]:
# Preview the feature frame that excludes Cabin.
df_le_X_nof.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,1,0,2,39.0,0,0.0,0.0,0.0,0.0,0.0
1,0,0,2,24.0,0,109.0,9.0,25.0,549.0,44.0
2,1,0,2,58.0,1,43.0,3576.0,0.0,6715.0,49.0
3,1,0,2,33.0,0,0.0,1283.0,371.0,3329.0,193.0
4,0,0,2,16.0,0,303.0,70.0,151.0,565.0,2.0


In [None]:
# Read test.csv from the working directory.
df_test = pd.read_csv('test.csv')

In [None]:
# Frequency of each HomePlanet value in test.csv (missing values are not counted).
df_test['HomePlanet'].value_counts()

Earth     2263
Europa    1002
Mars       925
Name: HomePlanet, dtype: int64

In [None]:
# Frequency of each Destination value in test.csv (missing values are not counted).
df_test['Destination'].value_counts()

TRAPPIST-1e      2956
55 Cancri e       841
PSO J318.5-22     388
Name: Destination, dtype: int64

In [None]:
# Re-read the test file so this cell always starts from the raw data.
df_test = pd.read_csv('test.csv')

# Missing categorical/boolean entries get fixed fill values; missing numeric
# entries get the mean of the matching training feature in df_le_X.
values = {'Destination': 'TRAPPIST-1e', 'HomePlanet': 'Earth', 'CryoSleep': False, 'VIP': False}
values.update({
    numeric_column: df_le_X[numeric_column].mean()
    for numeric_column in ('Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck')
})
df_test = df_test.fillna(value=values)
df_test = df_test.drop(columns=['Cabin', 'PassengerId', 'Name'])

# Encode the string columns with an encoder fitted on the complete training rows.
# CryoSleep and VIP are not label-encoded here and keep their True/False values.
for column in ('HomePlanet', 'Destination'):
    df_test[column] = labelencoder.fit(df_nonan[column]).transform(df_test[column])
df_test.head()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
0,0,True,2,27.0,False,0.0,0.0,0.0,0.0,0.0
1,0,False,2,19.0,False,0.0,9.0,0.0,2823.0,0.0
2,1,True,0,31.0,False,0.0,0.0,0.0,0.0,0.0
3,1,False,2,38.0,False,0.0,6652.0,0.0,181.0,585.0
4,0,False,2,20.0,False,10.0,0.0,635.0,0.0,0.0


In [None]:
# Show which planet name maps to which integer code (the position in classes_).
# Fit on the raw strings in df_nonan: df_le['HomePlanet'] already holds the
# encoded integers, so fitting on it would only echo back [0, 1, 2].
labelencoder.fit(df_nonan['HomePlanet']).classes_

array([0, 1, 2])

In [None]:
# Distinct CryoSleep values left in the prepared test frame.
df_test['CryoSleep'].unique()

array([ True, False])

In [None]:
# Re-create the encoder and rebuild df_le with the same steps as the
# "Label encoder" section: drop PassengerId and Name, then integer-encode columns.
labelencoder = LabelEncoder()
df_le = df_nonan.copy().drop(columns=['PassengerId', 'Name'])

# The one shared encoder is refit per column, so it ends up fitted on 'VIP'.
for column in ('HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP'):
    df_le[column] = labelencoder.fit_transform(df_le[column])

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression on all encoded features (Cabin code included).
# The accuracy below is measured on the same rows the model was fitted on.
clf_LR_le = LogisticRegression(random_state=0, max_iter=500, C=5)
clf_LR_le.fit(df_le_X, df_le_y)
clf_LR_le.score(df_le_X, df_le_y)

0.7838328792007266

In [None]:
# Logistic regression on the features without Cabin, using a much smaller C.
# The accuracy below is measured on the same rows the model was fitted on.
clf_LR_le_nof = LogisticRegression(random_state=0, max_iter=500, C=0.005)
clf_LR_le_nof.fit(df_le_X_nof, df_le_y_nof)
clf_LR_le_nof.score(df_le_X_nof, df_le_y_nof)

0.7909476233726915

In [None]:
# Hold out 40% of the Cabin-free rows for evaluation; the seed is fixed.
X_train, X_test, y_train, y_test = train_test_split(
    df_le_X_nof,
    df_le_y_nof,
    test_size=0.4,
    random_state=0,
)

In [None]:
# Fit the logistic regression on the training split and score it on the held-out split.
clf_LR_le_splited = LogisticRegression(random_state=0, max_iter=500, C=0.005)
clf_LR_le_splited.fit(X_train, y_train)
clf_LR_le_splited.score(X_test, y_test)

0.7854710556186152

In [None]:
# Show the classes learned from the raw CryoSleep column.
# Note: this refits the shared `labelencoder` object.
labelencoder.fit(df_nonan['CryoSleep']).classes_

array([False, True], dtype=object)

In [None]:
# Predict for the prepared test rows with the logistic regression that was
# fitted on the training split (X_train) only.
res_LR = clf_LR_le_splited.predict(df_test)

### Solution LR

In [None]:
# Read sample_submission.csv from the working directory and preview it.
ans = pd.read_csv('sample_submission.csv')
ans.head()

Unnamed: 0,PassengerId,Transported
0,0013_01,False
1,0018_01,False
2,0019_01,False
3,0021_01,False
4,0023_01,False


In [None]:
# Number of predictions in res_LR (noted value: 4277).
len(res_LR)

4277

In [None]:
#ans['Transported'] = res_LR

In [None]:
#ans.to_csv('/content/result0.csv', index=False)

### Trees

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree (max depth 20) on the Cabin-free features.
# The accuracy below is measured on the same rows the tree was fitted on.
clf_tree_le_nof = DecisionTreeClassifier(random_state=0, max_depth=20)
clf_tree_le_nof.fit(df_le_X_nof, df_le_y_nof)
clf_tree_le_nof.score(df_le_X_nof, df_le_y_nof)

0.9220405691795338

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(df_le_X_nof, df_le_y_nof, test_size=0.4, random_state=0)

In [None]:
# Shallower decision tree (max depth 10) fitted on the training split and
# scored on the held-out split.
clf_tree_le_splited = DecisionTreeClassifier(random_state=0, max_depth=10)
clf_tree_le_splited.fit(X_train, y_train)
clf_tree_le_splited.score(X_test, y_test)

0.7794173287930383

In [None]:
# Random forest on the Cabin-free features.
# The accuracy below is measured on the same rows the forest was fitted on.
clf_forest_le_nof = RandomForestClassifier(random_state=0, max_depth=7, min_samples_split=7)
clf_forest_le_nof.fit(df_le_X_nof, df_le_y_nof)
clf_forest_le_nof.score(df_le_X_nof, df_le_y_nof)

0.815622161671208

In [None]:
# Same random forest settings, fitted on the training split and scored on the
# held-out split.
clf_forest_le_splited = RandomForestClassifier(random_state=0, max_depth=7, min_samples_split=7)
clf_forest_le_splited.fit(X_train, y_train)
clf_forest_le_splited.score(X_test, y_test)

0.7937949300037835

In [None]:
# Predict for the prepared test rows with the random forest that was fitted on
# all complete training rows (Cabin excluded).
res_forest = clf_forest_le_nof.predict(df_test)

### Solution forest

In [None]:
# The predictions follow the row order of test.csv, so confirm the submission
# template lists the same passengers in the same order before pasting them in.
test_ids = pd.read_csv('test.csv', usecols=['PassengerId'])['PassengerId']
assert ans['PassengerId'].equals(test_ids), 'sample_submission.csv rows do not match test.csv'

ans['Transported'] = res_forest
# Write to the working directory (where the input CSVs are read from) rather
# than the Colab-only absolute '/content' path.
ans.to_csv('result1_forest.csv', index=False)