In [9]:
import pandas as pd
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

# Цели
### Разбить тренировочные данные на train + valid
### Посмотреть среднеквадратическую ошибку
### Пересмотреть препроцессинг
### Использовать другую модель?
### Попробовать кросс-валидацию?

In [3]:
def changeSexToBinary(row):
    """Encode the ``Sex`` entry of one row as an integer flag.

    ``'male'`` is replaced by 1 and ``'female'`` by 0; any other value is
    left as it is. The row is modified in place and the same object is
    returned, so the function can be used as a row-wise ``apply`` callback.
    """
    # Labels are tested in the same order as before: 'male' first, then 'female'.
    for label, code in (('male', 1), ('female', 0)):
        if row.Sex == label:
            row.Sex = code
            break
    return row


def data_preprocessing(data):
    """Turn a raw passenger table into a fully numeric feature frame.

    Steps, in order:
      1. drop the ``Name``, ``Ticket`` and ``Cabin`` columns;
      2. split off the ``Survived`` column as the target when it is present;
      3. fill missing values in every remaining column with that column's
         most frequent value;
      4. one-hot encode ``Embarked``;
      5. encode ``Sex`` as 1 for ``'male'`` and 0 for ``'female'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw table. It must contain ``Name``, ``Ticket``, ``Cabin``, ``Sex``
        and ``Embarked``; ``Survived`` is optional.

    Returns
    -------
    (X, y) when ``data`` has a ``Survived`` column, otherwise just X.
        X keeps the row index of ``data`` so it stays aligned with y. Its
        column labels are strings; the one-hot columns are named by their
        position ("0", "1", ...).

    Notes
    -----
    The imputer and the encoder are fitted on whatever frame is passed in,
    so every call learns its own fill values and its own category set.
    NOTE(review): when train and test frames are processed by separate calls
    their one-hot columns only line up if both contain the same ``Embarked``
    categories -- confirm for the data in use.
    """
    # Drop the columns that are not used as features.
    current_data = data.drop(['Name', 'Ticket', 'Cabin'], axis=1)

    # Categorical columns that are expanded into one-hot indicator columns.
    object_cols = ['Embarked']

    has_target = 'Survived' in current_data.columns
    if has_target:
        X = current_data.drop(['Survived'], axis=1)
        y = current_data['Survived']
    else:
        X = current_data
        y = None

    imputer = SimpleImputer(strategy='most_frequent')
    OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # fit_transform returns a bare array: restore both the column labels and
    # the original row index, otherwise X would silently stop lining up with
    # y whenever `data` does not carry a default 0..n-1 index.
    imputed_X = pd.DataFrame(
        imputer.fit_transform(X), columns=X.columns, index=X.index
    )

    OH_cols = pd.DataFrame(
        OH_encoder.fit_transform(imputed_X[object_cols]), index=imputed_X.index
    )
    num_X = imputed_X.drop(object_cols, axis=1)

    imputed_OH_X = pd.concat([num_X, OH_cols], axis=1)

    # Vectorised Sex encoding (values other than 'male'/'female' pass through).
    sex_codes = {'male': 1, 'female': 0}
    imputed_OH_X['Sex'] = imputed_OH_X['Sex'].map(lambda value: sex_codes.get(value, value))

    # The imputer output is object-typed; let pandas recover numeric dtypes.
    imputed_OH_X = imputed_OH_X.infer_objects()

    # The one-hot columns are labelled with integers; make every label a string.
    imputed_OH_X.columns = imputed_OH_X.columns.astype(str)

    if has_target:
        return imputed_OH_X, y
    return imputed_OH_X

In [6]:
# Load the training set. data_preprocessing returns a (features, target) pair
# when the frame has a 'Survived' column, which the unpacking below relies on.
train_data = pd.read_csv('./data/train.csv')

X_train, y_train = data_preprocessing(train_data)

# Fit a logistic-regression classifier (iteration cap 1000) on the whole
# processed training set; no rows are held out for validation here.
my_model = LogisticRegression(max_iter=1000)
my_model.fit(X_train, y_train)
# Preview the first rows of the processed feature frame.
X_train.head()


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,0,1,2
0,1,3,1,22.0,1,0,7.25,0.0,0.0,1.0
1,2,1,0,38.0,1,0,71.2833,1.0,0.0,0.0
2,3,3,0,26.0,0,0,7.925,0.0,0.0,1.0
3,4,1,0,35.0,1,0,53.1,0.0,0.0,1.0
4,5,3,1,35.0,0,0,8.05,0.0,0.0,1.0


In [8]:
# Load the test set. data_preprocessing returns only the feature frame when
# there is no 'Survived' column (assumed for test.csv, since the result is
# not unpacked).
# NOTE(review): data_preprocessing fits its imputer and encoder anew on this
# frame, so the fill values and one-hot categories come from the test data.
test_data = pd.read_csv('./data/test.csv')
X_test = data_preprocessing(test_data)

# Predict with the previously fitted my_model and write the predictions next
# to the original PassengerId values, without the DataFrame index.
predicts = my_model.predict(X_test)
outp = pd.DataFrame({"PassengerId": test_data['PassengerId'], "Survived": predicts})
outp.to_csv('./submissions/Subm.csv', index=False)