In [118]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split

In [32]:
# Read the training CSV from the local data folder and preview its first rows.
train_df = pd.read_csv(filepath_or_buffer='./data/train.csv')
train_df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [40]:
# Build a new frame without PassengerId and Name, then list every remaining
# column that contains at least one missing value.
new_train_df = train_df.drop(columns=['PassengerId', 'Name'])

has_missing = new_train_df.isna().any()
nan_col_list = has_missing[has_missing].index.tolist()
print(nan_col_list)

['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']


In [137]:
def bool_to_int_rows(row):
    """Recode the truthiness of a row's ``CryoSleep`` and ``VIP`` entries as 1/0.

    Written to be used as the callable of a row-wise ``DataFrame.apply``.

    Parameters
    ----------
    row
        Record exposing ``CryoSleep`` and ``VIP`` as readable and writable
        attributes (for example a pandas Series holding one DataFrame row).

    Returns
    -------
    The same ``row`` object, modified in place: each of the two fields is set
    to 1 when its previous value was truthy and to 0 otherwise.
    """
    # Only these two flags are recoded; ``Transported`` is not touched here.
    row.CryoSleep = 1 if row.CryoSleep else 0
    row.VIP = 1 if row.VIP else 0
    return row
        

In [112]:
def prev_data_preprocessing(data):
    """Turn a raw passenger table into a feature table ready for modelling.

    Steps, in order:

    1. Drop the ``PassengerId``, ``Cabin`` and ``Name`` columns.
    2. Fill missing values: most frequent value for ``HomePlanet``,
       ``CryoSleep``, ``Destination`` and ``VIP``; column mean for ``Age``,
       ``RoomService``, ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``.
    3. Recode ``CryoSleep`` and ``VIP`` as 1 (truthy) / 0 (falsy).
    4. One-hot encode every column that still has ``object`` dtype.

    Any other column of ``data`` is passed through unchanged.

    Note: the imputers and the encoder are created and fitted inside this
    function on every call, so the fill values and the set of one-hot columns
    are derived from whichever frame is passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain all the columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The processed table, carrying the same index as ``data``.
    """
    most_frequent_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    mean_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    flag_cols = ['CryoSleep', 'VIP']

    current_data = data.drop(['PassengerId', 'Cabin', 'Name'], axis=1)

    # Replace all missing values ------------------------------
    for cols, strategy in ((most_frequent_cols, 'most_frequent'), (mean_cols, 'mean')):
        imputer = SimpleImputer(strategy=strategy)
        # The imputer returns a bare array. Rebuild the frame with the input's
        # own index so the column assignment below lines rows up correctly
        # even when ``data`` is not indexed 0..n-1.
        current_data[cols] = pd.DataFrame(
            imputer.fit_transform(current_data[cols]),
            index=current_data.index,
            columns=cols,
        )
    # ---------------------------------------------------------

    # Vectorized truthiness -> 1/0 conversion of the two flag columns.
    current_data[flag_cols] = current_data[flag_cols].astype(bool).astype('int64')

    my_OH_encoder = OneHotEncoder(
        sparse_output=False, handle_unknown='ignore'
    ).set_output(transform='pandas')

    obj_df = current_data.select_dtypes(include='object')
    # With pandas output enabled the encoder already returns a DataFrame that
    # keeps ``obj_df``'s index, so it can be concatenated directly.
    new_obj_df = my_OH_encoder.fit_transform(obj_df)

    current_data = current_data.drop(obj_df.columns, axis=1)
    return pd.concat([current_data, new_obj_df], axis=1)

In [158]:
# Preprocess the training table and separate the features from the target.
train_data = prev_data_preprocessing(train_df)

X = train_data.drop(columns=['Transported'])
# Encode the target by truthiness as 1/0 (vectorized, no per-element lambda).
y = train_data['Transported'].astype(bool).astype(int)

# Hold out 15% of the rows for validation; the seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.15, random_state=2)

my_model = LogisticRegression(max_iter=1000)
my_model.fit(X_train, y_train)

preds = my_model.predict(X_valid)
# accuracy_score takes the true labels first and the predictions second.
print(accuracy_score(y_valid, preds))

0.7921779141104295


In [155]:
# Predict on the test file and write the submission CSV.
test_data = pd.read_csv('./data/test.csv')

X_test = prev_data_preprocessing(test_data)
# The preprocessing fits its one-hot encoder on the frame it receives, so the
# test features are not guaranteed to have the same columns, in the same
# order, as the ones the model was trained on. Align them explicitly; a
# one-hot column that is absent from the test file is all zeros.
X_test = X_test.reindex(columns=my_model.feature_names_in_, fill_value=0)

answer_predicts = pd.Series(my_model.predict(X_test))
output = pd.DataFrame({
    "PassengerId": test_data['PassengerId'],
    'Transported': answer_predicts.astype(bool),  # 1/0 predictions -> True/False
})
output.to_csv('./output/FirstSubmission.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
output.head()