In [1]:
import pandas as pd


from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer

# Цели
### Pipeline
### Пересмотреть препроцессинг (может быть, стоит разделить возраст людей на группы?)
### Попробовать кросс-валидацию?

In [2]:
def changeSexToBinary(row):
    """Recode the ``Sex`` field of one row in place: 'male' -> 1, 'female' -> 0.

    Any other value is left as it is. The same (mutated) row object is
    returned, which lets the function be handed to ``DataFrame.apply`` with
    ``axis='columns'``.
    """
    sex_codes = (('male', 1), ('female', 0))
    for label, code in sex_codes:
        if row.Sex == label:
            row.Sex = code
            break  # a row matches at most one label; do not re-test the new value
    return row


def data_preprocessing(data):
    """Turn a raw passenger table into a feature frame for the models below.

    Steps, in order:
      1. drop the ``Name``, ``Ticket`` and ``Cabin`` columns;
      2. split off ``Survived`` as the target when that column is present;
      3. fill missing values in every remaining column using a
         ``SimpleImputer`` with the ``most_frequent`` strategy;
      4. one-hot encode ``Embarked``;
      5. recode ``Sex`` row by row through ``changeSexToBinary``;
      6. cast all column labels to ``str``.

    Returns ``(features, target)`` when ``data`` contains ``Survived``,
    otherwise only ``features``.

    NOTE(review): the imputer and the encoder are created and fitted anew on
    every call, so a train frame and a test frame are each processed with
    their own statistics/categories - confirm this is intended.
    """
    # Columns that are one-hot encoded instead of being passed through.
    categorical_cols = ['Embarked']

    # Discard the columns that are not used as features.
    trimmed = data.drop(columns=['Name', 'Ticket', 'Cabin'])

    has_target = 'Survived' in trimmed.columns
    if has_target:
        y = trimmed['Survived']
        X = trimmed.drop(columns=['Survived'])
    else:
        X = trimmed

    mode_imputer = SimpleImputer(strategy='most_frequent')
    one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    # The imputer output carries no labels, so the original ones are put back.
    filled = pd.DataFrame(mode_imputer.fit_transform(X), columns=X.columns)

    # Align the encoded block with the imputed frame before concatenating.
    encoded = pd.DataFrame(
        one_hot.fit_transform(filled[categorical_cols]),
        index=filled.index,
    )

    features = pd.concat([filled.drop(columns=categorical_cols), encoded], axis=1)
    # NOTE(review): kept as a row-wise apply on purpose; the resulting column
    # dtypes may depend on it - verify before swapping in a vectorised mapping.
    features = features.apply(changeSexToBinary, axis='columns')

    # The one-hot columns get integer labels; make every label a string.
    features.columns = features.columns.astype(str)

    return (features, y) if has_target else features

In [38]:
# Baseline: logistic regression fitted on an 80/20 split of the training file,
# scored by accuracy on the held-out 20%.
train_data = pd.read_csv('./data/train.csv')

X, y = data_preprocessing(train_data)

# A fixed random_state keeps the split the same between runs.
# X_train / X_valid / y_train / y_valid are also read by testXGBClass below.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

my_model = LogisticRegression(max_iter=10000)
my_model.fit(X_train, y_train)

preds = my_model.predict(X_valid)
# accuracy_score expects (y_true, y_pred); the value is the same either way
# for accuracy, but the documented order is used so that swapping in another
# metric later does not silently invert it.
print(accuracy_score(y_valid, preds))


# test_data = pd.read_csv('./data/test.csv')
# X_test = data_preprocessing(test_data)

# predicts = my_model.predict(X_test)
# outp = pd.DataFrame({"PassengerId": test_data['PassengerId'], "Survived": predicts})
# outp.to_csv('./submissions/SubmissionLogFinal6.csv', index=False)



# my_model = XGBClassifier(max_depth = 2, learning_rate = 0.1, early_stopping_rounds = 5)
# my_model.fit(X_train, y_train,
#             eval_set = [(X_valid, y_valid)],
#             verbose=False);

# curr_preds = my_model.predict(X_valid)
# print(accuracy_score(curr_preds, y_valid))

0.8044692737430168


In [4]:
def testXGBClass(max_depth:int, learning_rate: float, early_stopping_rounds: int)->float:
    """Fit one XGBClassifier configuration and return its hold-out accuracy.

    Relies on the notebook-level ``X_train``, ``y_train``, ``X_valid`` and
    ``y_valid`` produced by the training cell. The validation set is split
    in half with a fixed ``random_state``: the first half is passed as
    ``eval_set`` (used for early stopping), the second half is used only for
    the returned score.

    Args:
        max_depth: forwarded to ``XGBClassifier(max_depth=...)``.
        learning_rate: forwarded to ``XGBClassifier(learning_rate=...)``.
        early_stopping_rounds: forwarded to
            ``XGBClassifier(early_stopping_rounds=...)``.

    Returns:
        Accuracy of the fitted model on the second half of the validation set.
    """
    X_valid_1, X_valid_2, y_valid_1, y_valid_2 = train_test_split(
        X_valid, y_valid, test_size=0.5, random_state=1
    )

    candidate = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        early_stopping_rounds=early_stopping_rounds,
    )
    candidate.fit(
        X_train, y_train,
        eval_set=[(X_valid_1, y_valid_1)],
        verbose=False,
    )

    curr_preds = candidate.predict(X_valid_2)
    # Documented argument order is (y_true, y_pred).
    return accuracy_score(y_valid_2, curr_preds)
    

In [5]:
# Exhaustive search over XGBoost settings. Every combination is scored with
# testXGBClass; the first combination reaching the best score is kept.
max_depth_list = list(range(2, 25))
learning_rate_list = [0.03, 0.05, 0.07, 0.1, 0.12, 0.15, 0.18, 0.2, 0.22, 0.25, 0.27, 0.3]
early_stopping_rounds_list = list(range(3, 15, 2))

max_accuracy = 0
ans_list = []

# Flattened grid, enumerated in the same order as three nested loops
# (depth outermost, early-stopping rounds innermost).
param_grid = [
    (depth, rate, rounds)
    for depth in max_depth_list
    for rate in learning_rate_list
    for rounds in early_stopping_rounds_list
]

for curr_depth, curr_rate, curr_rounds in param_grid:
    curr_acc_score = testXGBClass(curr_depth, curr_rate, curr_rounds)
    # Strict comparison: ties keep the earlier combination.
    if curr_acc_score > max_accuracy:
        max_accuracy, ans_list = curr_acc_score, [curr_depth, curr_rate, curr_rounds]

print(max_accuracy, ans_list)

KeyboardInterrupt: 

In [None]:
# my_model = XGBClassifier(max_depth = 3, learning_rate = 0.3, early_stopping_rounds = 5)
# my_model.fit(X_train, y_train, 
#             eval_set=[(X_valid, y_valid)],
#             verbose=False)

# Score the competition test file with the notebook-level `my_model` and
# write the submission CSV.
# NOTE(review): the only uncommented notebook-level assignment of `my_model`
# is the LogisticRegression in the training cell (the one inside testXGBClass
# is local), yet the output file is named "...XGB..." - confirm which model
# is meant to produce this submission.
test_data = pd.read_csv('./data/test.csv')
X_test = data_preprocessing(test_data)

predicts = my_model.predict(X_test)

# Two columns only: the passenger id taken from the raw file and the prediction.
outp = test_data[['PassengerId']].assign(Survived=predicts)
outp.to_csv('./submissions/SubmissionXGBFinal.csv', index=False)