In [1]:
import numpy as np
import pandas as pd

In [2]:
def status(feature):
    print('Processing', feature, ': ok')

In [3]:
def get_combined_data(train_path, test_path):
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)
    
    targets = train.Transported
    train.drop(['Transported'], 1, inplace=True)
    
    combined = train.append(test)
    combined.reset_index(inplace=True)
    combined.drop(["PassengerId", "RoomService","FoodCourt", 
                   "ShoppingMall", "Spa","VRDeck", "Name", 
                   "VIP","HomePlanet", "Destination", "Age"], inplace=True, axis=1)
    combined['Cabin'] = combined['Cabin'].astype('string')
    combined = combined.dropna(subset=['Cabin'])
    
    return combined, targets

In [4]:
combined, targets = get_combined_data('train.csv', 'test.csv')

  train.drop(['Transported'], 1, inplace=True)
  combined = train.append(test)


In [5]:
deck_dict = {
    'A':'A',
    'B':'B',
    'C':'C',
    'D':'D',
    'E':'E',
    'F':'F',
    'G':'G',
    'T':'T'}

def get_deck(combined, deck_dict):
    combined['Deck'] = combined['Cabin'].map(lambda cabin:cabin[0])
    combined['Deck'] = combined.Deck.map(deck_dict)
    status('Deck')
    return combined

In [6]:
side_dict = {'P': 'P','S': 'S'}

def get_side(combined, side_dict):
    combined['Side'] = combined['Cabin'].map(lambda cabin:cabin.split('/')[2])
    combined['Side'] = combined.Side.map(side_dict)
    combined.drop('Cabin', axis=1, inplace=True)
    status('Side')
    return combined

In [7]:
combined = get_deck(combined, deck_dict)

Processing Deck : ok


In [8]:
combined = get_side(combined, side_dict)

Processing Side : ok


In [9]:
def process_deck(combined):
    
    dummies = pd.get_dummies(combined['Deck'], prefix='Deck')
    combined = pd.concat([combined, dummies], axis=1)
    
    combined.drop('Deck', axis=1, inplace=True)
    
    status('Deck')
    return combined

In [10]:
def process_side(combined):
    
    dummies = pd.get_dummies(combined['Side'], prefix='Side')
    combined = pd.concat([combined, dummies], axis=1)
    
    combined.drop('Side', axis=1, inplace=True)
    
    status('Side')
    return combined

In [11]:
def process_sleep(combined):
    
    dummies = pd.get_dummies(combined['CryoSleep'], prefix='CryoSleep')
    combined = pd.concat([combined, dummies], axis=1)
    
    combined.drop('CryoSleep', axis=1, inplace=True)
    combined.drop('index', axis=1, inplace=True)
    
    status('CryoSleep')
    return combined

In [12]:
combined = process_deck(combined)

Processing Deck : ok


In [13]:
combined = process_sleep(combined)

Processing CryoSleep : ok


In [14]:
combined = process_side(combined)

Processing Side : ok


In [15]:
combined.describe()

Unnamed: 0,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,CryoSleep_False,CryoSleep_True,Side_P,Side_S
count,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0,12671.0
mean,0.027938,0.090048,0.08697,0.056823,0.104412,0.334543,0.298398,0.000868,0.624181,0.352143,0.496409,0.503591
std,0.164801,0.286262,0.281803,0.231513,0.305806,0.471849,0.457573,0.029452,0.484353,0.477657,0.500007,0.500007
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
75%,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [16]:
combined.head()

Unnamed: 0,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T,CryoSleep_False,CryoSleep_True,Side_P,Side_S
0,0,1,0,0,0,0,0,0,1,0,1,0
1,0,0,0,0,0,1,0,0,1,0,0,1
2,1,0,0,0,0,0,0,0,1,0,0,1
3,1,0,0,0,0,0,0,0,1,0,0,1
4,0,0,0,0,0,1,0,0,1,0,0,1


In [17]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [18]:
combined.columns

Index(['Deck_A', 'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G',
       'Deck_T', 'CryoSleep_False', 'CryoSleep_True', 'Side_P', 'Side_S'],
      dtype='object')

In [44]:
def recover_train_test_target(combined):
    
    columns = ['CryoSleep_False', 'CryoSleep_True', 'Side_P', 'Side_S', 'Deck_A',
               'Deck_B', 'Deck_C', 'Deck_D', 'Deck_E', 'Deck_F', 'Deck_G', 'Deck_T']
    combined = combined[columns]
    
    targets = pd.read_csv('train.csv', usecols=['Transported'])['Transported'].values
    train = combined.iloc[:8693]
    test = combined.iloc[8693:]
    
    return train, test, targets

In [45]:
train, test, targets = recover_train_test_target(combined)

In [46]:
train.describe()

Unnamed: 0,CryoSleep_False,CryoSleep_True,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
count,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0,8693.0
mean,0.628092,0.347291,0.497872,0.502128,0.030254,0.092028,0.087312,0.056597,0.103532,0.329345,0.300357,0.000575
std,0.483342,0.476137,0.500024,0.500024,0.171296,0.289082,0.282307,0.231085,0.304669,0.470003,0.458439,0.023977
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [47]:
test.describe()

Unnamed: 0,CryoSleep_False,CryoSleep_True,Side_P,Side_S,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_T
count,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0,3978.0
mean,0.615636,0.362745,0.493213,0.506787,0.022876,0.085721,0.086224,0.057315,0.106335,0.345902,0.294118,0.001508
std,0.486506,0.480853,0.500017,0.500017,0.149526,0.279988,0.28073,0.232473,0.308304,0.475721,0.455702,0.038812
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [48]:
XtrainV, XtestV, ytrainV, ytestV = train_test_split(train, targets, test_size = 0.30)
XtrainV.shape, ytrainV.shape, XtestV.shape, ytestV.shape  

((6085, 12), (6085,), (2608, 12), (2608,))

In [49]:
%%time
from sklearn.naive_bayes import GaussianNB

nbc = GaussianNB()
nbc.fit(XtrainV, ytrainV)
print("Точность для обучающей выборки:", nbc.score(XtrainV, ytrainV))
print("Точность для тестовой выборки:", nbc.score(XtestV, ytestV))

Точность для обучающей выборки: 0.5033689400164338
Точность для тестовой выборки: 0.5061349693251533
CPU times: total: 15.6 ms
Wall time: 186 ms


In [51]:
%%time
from sklearn.svm import SVC

svclassifier = SVC(kernel='rbf')
svclassifier.fit(XtrainV, ytrainV)
print("Точность для обучающей выборки:", svclassifier.score(XtrainV, ytrainV))
print("Точность для тестовой выборки:", svclassifier.score(XtestV, ytestV))

Точность для обучающей выборки: 0.5283483976992605
Точность для тестовой выборки: 0.5153374233128835
CPU times: total: 11.8 s
Wall time: 12.1 s


In [57]:
%%time
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(XtrainV, ytrainV)
print("Точность для обучающей выборки:", lr.score(XtrainV, ytrainV))
print("Точность для тестовой выборки:", lr.score(XtestV, ytestV))

Точность для обучающей выборки: 0.5173377156943303
Точность для тестовой выборки: 0.5
CPU times: total: 625 ms
Wall time: 578 ms


In [66]:
%%time
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=80, max_depth=30, min_samples_leaf=15, min_samples_split=15)
rf.fit(XtrainV, ytrainV)
print("Точность для обучающей выборки:", rf.score(XtrainV, ytrainV))
print("Точность для тестовой выборки:", rf.score(XtestV, ytestV))

Точность для обучающей выборки: 0.5273623664749384
Точность для тестовой выборки: 0.5122699386503068
CPU times: total: 406 ms
Wall time: 386 ms
