In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
import xgboost as xgb
from tqdm import tqdm

In [2]:
# Input locations: both data files are looked up relative to `folder`.
folder = "./"
train_file_path, test_file_path = (folder + name for name in ("train.csv", "test.csv"))

In [3]:
def load_data(filename, sep=',', max_chain_len=1000):
    '''
        Load a dataset from a delimited text file and split it into model inputs.

        Rows containing any missing value are dropped first; then rows whose
        'chainlen' exceeds max_chain_len are dropped. The 'DSSR' column is
        removed when present. Every remaining column except 'Id', 'index',
        'pdb_chain' and 'mg' is treated as a feature.

        NOTE: the row filters apply to every file, including files that are
        only used for prediction, so the output may have fewer rows than the file.

        Parameters:
         filename - path of the file to read
         sep - field separator passed to pandas
         max_chain_len - largest 'chainlen' value kept; None disables this filter

        Returns a dict with keys:
         data - the filtered DataFrame
         features - list of feature column names used for training/prediction
         x - feature matrix as np.array
         y - target column 'mg' as np.array, or None when the file has no such column
         chains - 'pdb_chain' column as np.array, or None when the file has no such column
    '''
    data = pd.read_table(filename, sep=sep).dropna()

    if max_chain_len is not None:
        data = data[~(data['chainlen'] > max_chain_len)]

    # Non-mutating drop: avoids modifying a filtered slice in place.
    data = data.drop(columns=['DSSR'], errors='ignore')

    non_features = ('Id', 'index', 'pdb_chain', 'mg')
    features = [column for column in data.columns if column not in non_features]
    x = np.array(data[features])

    # Optional columns are checked explicitly instead of relying on a bare
    # `except`, so unrelated errors are no longer silently swallowed.
    chains = np.array(data['pdb_chain']) if 'pdb_chain' in data.columns else None
    y = np.array(data['mg']) if 'mg' in data.columns else None

    print('Data loaded')
    return {'data': data, 'features': features, 'x': x, 'y': y, 'chains': chains}

In [4]:
# The training file is read as tab-separated even though its extension is .csv.
train_data = load_data(train_file_path, sep='\t')

Data loaded


In [5]:
# Target array (the 'mg' column) returned by load_data for the training set.
y = train_data['y']

In [6]:
# Number of samples labelled 0 divided by the number labelled 1.
ratio = np.count_nonzero(y == 0) / np.count_nonzero(y == 1)

In [7]:
def stratified_chain_kfold(data, n_splits=2, shuffle=True, random_state=42):
    """
    Split the samples into train/test folds so that all objects (atoms) of one
    chain always land on the same side of a split.

    The unique chain labels are partitioned with KFold; each fold's row indices
    are then the rows belonging to the selected chains, grouped chain by chain
    (chains in sorted-label order, rows in ascending order within a chain).
    Note: despite the name, the target is not used, so folds are chain-grouped
    rather than stratified by class.

    Parameters:
     data - dict with a 'chains' entry holding one chain label per row
     n_splits, shuffle - forwarded to KFold
     random_state - forwarded to KFold only when shuffle is True

    Returns a generator of (train_indices, test_indices) integer arrays.
    """
    chain_labels = np.asarray(data['chains'])
    chains, chain_ids = np.unique(chain_labels, return_inverse=True)
    chain_ids = chain_ids.ravel()

    # Group row positions by chain once (stable sort keeps rows ascending within
    # a chain) instead of rescanning the whole label array for every chain.
    rows_sorted = np.argsort(chain_ids, kind='stable')
    counts = np.bincount(chain_ids, minlength=len(chains))
    starts = np.concatenate(([0], np.cumsum(counts)))
    rows_by_chain = [rows_sorted[starts[i]:starts[i + 1]] for i in range(len(chains))]

    def _rows_for(chain_positions):
        # Concatenate the row groups of the selected chains, in the given order.
        parts = [rows_by_chain[i] for i in chain_positions]
        if not parts:
            return np.array([], dtype=int)
        return np.concatenate(parts)

    # scikit-learn rejects a non-None random_state when shuffle is False, so the
    # seed is only passed through when it actually has an effect.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)
    for train_kf, test_kf in kf.split(chains):
        yield (_rows_for(train_kf), _rows_for(test_kf))

In [8]:
def cross_val_score(estimator, data, scoring, n_splits=2, shuffle=True, random_state=42):
    """
    Evaluate an estimator with chain-aware K-fold cross-validation.

    For every fold produced by stratified_chain_kfold the estimator is fitted
    on the training rows and scored on the held-out rows; each fold's score is
    printed as soon as it is available.

    Parameters:
     estimator - object exposing fit(X, y) and predict(X); it is refitted in
                 place on every fold
     data - dict with 'x' (feature matrix), 'y' (targets) and the entries the
            fold splitter needs
     scoring - callable invoked as scoring(y_true, y_pred)
     n_splits, shuffle, random_state - forwarded to stratified_chain_kfold

    Returns an np.array with one score per fold.
    """
    folds = stratified_chain_kfold(data, n_splits=n_splits, shuffle=shuffle,
                                   random_state=random_state)
    result = []
    # The splitter is a generator without a length, so tqdm is told the number
    # of folds explicitly; otherwise it can only show a bare iteration counter.
    for train, test in tqdm(folds, total=n_splits):
        X_train, y_train = data['x'][train], data['y'][train]
        X_test, y_test = data['x'][test], data['y'][test]
        estimator.fit(X_train, y_train)
        result.append(scoring(y_test, estimator.predict(X_test)))
        print(result[-1])
    return np.array(result)

In [None]:
# Gradient-boosted classifier; scale_pos_weight receives the class-0 / class-1
# count ratio computed earlier to counter class imbalance.
clf = xgb.XGBClassifier(random_state=42, scale_pos_weight=ratio, learning_rate=0.05)
# Chain-grouped cross-validation scored with F1; the returned per-fold scores
# are the cell's displayed value.
cross_val_score(clf, train_data, f1_score)

0it [00:00, ?it/s]

In [None]:
# Refit the classifier on the full training set before predicting.
clf.fit(train_data['x'], train_data['y'])

In [None]:
# The test file is read as comma-separated (the training file was read with tabs).
# NOTE(review): load_data drops rows with missing values and rows with
# chainlen > 1000, so test_data may hold fewer rows than test.csv — confirm the
# prediction count still matches the submission template.
test_data = load_data(test_file_path, sep=',')

In [None]:
# Predicted labels for the test feature matrix.
y_answer = clf.predict(test_data['x'])

In [None]:
# Use the sample submission as a template and overwrite its "mg" column with
# the predictions.
# NOTE(review): the assignment needs y_answer to have exactly one value per
# template row; presumably the template lists every test Id — verify.
df_answer = pd.read_csv("sample_submission.csv")
df_answer["mg"] = y_answer

In [None]:
# Preview the last rows of the submission frame.
df_answer.tail()

In [None]:
# Write the submission file without the DataFrame index column.
df_answer.to_csv("to_submit.csv", index=False)