In [1]:
!pip install tqdm
!pip install kaggle



In [0]:
# from googleapiclient.discovery import build
# import io, os
# from googleapiclient.http import MediaIoBaseDownload
# from google.colab import auth

# auth.authenticate_user()

# drive_service = build('drive', 'v3')
# results = drive_service.files().list(
#         q="name = 'kaggle.json'", fields="files(id)").execute()
# kaggle_api_key = results.get('files', [])

# filename = "/content/.kaggle/kaggle.json"
# os.makedirs(os.path.dirname(filename), exist_ok=True)

# request = drive_service.files().get_media(fileId=kaggle_api_key[0]['id'])
# fh = io.FileIO(filename, 'wb')
# downloader = MediaIoBaseDownload(fh, request)
# done = False
# while done is False:
#     status, done = downloader.next_chunk()
#     print("Download %d%%." % int(status.progress() * 100))
# os.chmod(filename, 0o600)

In [0]:
# !kaggle competitions download -c ml-mipt-spring2018-1

In [0]:
import pandas as pd
import numpy as np
from copy import deepcopy
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
import xgboost as xgb
from tqdm import tqdm

In [0]:
# Directory holding the Kaggle competition files, relative to the working
# directory; the trailing slash lets file names be appended directly.
folder = ".kaggle/competitions/ml-mipt-spring2018-1/"
train_file_path, test_file_path = (
    folder + file_name for file_name in ("train.csv", "test.csv")
)

In [0]:
def load_data(filename, sep=','):
    '''
    Load a delimited dataset and split it into the pieces used for modelling.

    Cleaning steps, applied in this order:
      1. rows containing any missing value are dropped;
      2. rows whose ``chainlen`` is greater than 1000 are dropped;
      3. the ``DSSR`` column is removed when present.
    Every remaining column except ``Id``, ``index``, ``pdb_chain`` and ``mg``
    is treated as a feature.

    Parameters:
        filename - path of the file to read
        sep - field separator (',' by default)

    Returns a dict with the keys:
        data - the cleaned DataFrame
        features - list of feature column names, in file order
        x - feature matrix as np.array
        y - the ``mg`` column as np.array, or None when the file has no such
            column (e.g. the test set)
        chains - the ``pdb_chain`` column as np.array, or None when absent

    Raises:
        KeyError - if the file has no ``chainlen`` column
    '''
    data = pd.read_table(filename, sep=sep).dropna()

    # NaNs are already gone, so this keeps exactly the rows with chainlen <= 1000.
    data = data[~(data['chainlen'] > 1000)]

    if 'DSSR' in data.columns:
        # Non-inplace drop: `data` is a filtered selection, and mutating it in
        # place can trigger pandas' SettingWithCopyWarning.
        data = data.drop('DSSR', axis=1)

    non_feature_columns = ('Id', 'index', 'pdb_chain', 'mg')
    features = [column for column in data.columns
                if column not in non_feature_columns]
    x = np.array(data[features])

    # Optional columns are looked up explicitly instead of via a bare `except`,
    # so unrelated errors are no longer silently turned into None.
    chains = np.array(data['pdb_chain']) if 'pdb_chain' in data.columns else None
    y = np.array(data['mg']) if 'mg' in data.columns else None

    print('Data loaded')
    return {'data': data, 'features': features, 'x': x, 'y': y, 'chains': chains}

In [7]:
# Load the training set; this file is read with a tab separator.
train_data = load_data(train_file_path, sep='\t')

Data loaded


In [0]:
# Target labels of the training set (the 'y' entry returned by load_data).
y = train_data['y']

In [9]:
# Class imbalance: number of label-0 rows per label-1 row.
# Raises ZeroDivisionError if y contains no rows equal to 1.
ratio = len(y[y == 0]) / len(y[y == 1])
print(ratio)

6.369751454257007


In [0]:
def stratified_chain_kfold(data, n_splits=2, shuffle=True, random_state=42):
    """
    Split the rows into K train/test folds so that all rows (atoms) belonging
    to one chain always land on the same side of the split.

    The unique chain labels are split with KFold, and each fold of chains is
    then expanded back to row indices.

    Parameters:
        data - mapping whose 'chains' entry holds one chain label per row
        n_splits - number of folds
        shuffle - whether KFold shuffles the chains before splitting
        random_state - seed for the shuffle; ignored when shuffle is False

    Yields:
        (train, test) tuples of integer row-index arrays. Within each array
        the rows are grouped by chain (in the order KFold returned the chains)
        and ascending inside each chain.
    """
    chain_labels = np.asarray(data['chains'])
    chains, chain_of_row = np.unique(chain_labels, return_inverse=True)
    chain_of_row = chain_of_row.ravel()

    # Group the row indices by chain once, instead of rescanning every row for
    # every chain of every fold. A stable sort keeps rows ascending per chain.
    rows_sorted = np.argsort(chain_of_row, kind='stable')
    chain_sizes = np.bincount(chain_of_row, minlength=len(chains))
    rows_by_chain = np.split(rows_sorted, np.cumsum(chain_sizes)[:-1])

    def rows_for(chain_ids):
        # The leading empty array keeps the result a valid int array even
        # when no chain is selected.
        parts = [np.array([], dtype=int)]
        parts.extend(rows_by_chain[chain_id] for chain_id in chain_ids)
        return np.concatenate(parts)

    # scikit-learn raises ValueError when random_state is set while
    # shuffle=False, so the seed is only forwarded when it has an effect.
    kf = KFold(n_splits=n_splits, shuffle=shuffle,
               random_state=random_state if shuffle else None)
    for train_kf, test_kf in kf.split(chains):
        yield (rows_for(train_kf), rows_for(test_kf))

In [0]:
def cross_val_score(estimator, data, scoring):
    """
    Score an estimator on the chain-aware folds from stratified_chain_kfold.

    For every fold the estimator is refitted on the training rows and scored
    on the held-out rows; each fold score is printed as it is produced.

    Parameters:
        estimator - object exposing fit(X, y) and predict(X)
        data - mapping with 'x' (features), 'y' (labels) and the entries
               stratified_chain_kfold needs
        scoring - callable scoring(y_true, y_pred) returning the fold score

    Returns:
        np.array with one score per fold, in fold order.
    """
    features, labels = data['x'], data['y']
    fold_scores = []
    for train_idx, test_idx in stratified_chain_kfold(data):
        estimator.fit(features[train_idx], labels[train_idx])
        predicted = estimator.predict(features[test_idx])
        fold_score = scoring(labels[test_idx], predicted)
        fold_scores.append(fold_score)
        print("Cross_val_score", fold_score)
    return np.array(fold_scores)

In [0]:
class MyAnotherClassifier:
    """
    Two-stage classifier built on xgboost.

    Stage 1 fits a deep XGBClassifier (max_depth=9) on all columns and keeps
    the columns whose importance exceeds the uniform share 1 / n_features.
    Stage 2 fits a class-weighted XGBClassifier with five times as many trees
    on the kept columns only; predictions come from stage 2.

    Parameters:
        n_estimators - number of trees in stage 1 (stage 2 uses 5 * n_estimators)
        random_state - seed passed to both models and to np.random.seed
        scale_pos_weight - positive-class weight for stage 2; when None
            (default) it is computed inside fit() from the labels being
            fitted, as count(y == 0) / count(y == 1)

    Attributes set by fit():
        base_algo - the fitted stage-1 model
        useful_features - indices of the columns passed on to stage 2
        final_algo - the fitted stage-2 model
    """

    def __init__(self, n_estimators=100, random_state=42, scale_pos_weight=None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.scale_pos_weight = scale_pos_weight

    def fit(self, X, y):
        """Fit both stages on feature matrix X and labels y; returns self."""
        X = np.asarray(X)
        y = np.asarray(y)

        # Side effect: this reseeds NumPy's global random generator.
        np.random.seed(seed=self.random_state)

        self.base_algo = xgb.XGBClassifier(random_state=self.random_state,
                                           n_estimators=self.n_estimators,
                                           max_depth=9,
                                           n_jobs=-1)
        self.base_algo.fit(X, y)
        importances = self.base_algo.feature_importances_
        self.useful_features = np.argwhere(importances > 1 / X.shape[1]).flatten()
        if len(self.useful_features) == 0:
            # No column beats the uniform share (e.g. all importances equal):
            # keep every column rather than fit stage 2 on an empty matrix.
            self.useful_features = np.arange(X.shape[1])
        print("Len of useful_features", len(self.useful_features))

        if self.scale_pos_weight is None:
            # Derive the class weight from the labels actually being fitted,
            # so the estimator does not depend on a notebook-level global and
            # cross-validation folds do not see the labels of held-out rows.
            positives = np.count_nonzero(y == 1)
            negatives = np.count_nonzero(y == 0)
            pos_weight = negatives / positives if positives else 1.0
        else:
            pos_weight = self.scale_pos_weight

        self.final_algo = xgb.XGBClassifier(random_state=self.random_state,
                                            scale_pos_weight=pos_weight,
                                            n_jobs=-1,
                                            n_estimators=5 * self.n_estimators)
        self.final_algo.fit(X[:, self.useful_features], y)
        return self

    def predict(self, X):
        """Predict labels for X using the stage-2 model on the selected columns."""
        X = np.asarray(X)
        return self.final_algo.predict(X[:, self.useful_features])

In [0]:
import warnings
# Suppress every warning from here on, regardless of category or module.
warnings.filterwarnings('ignore')

In [0]:
# clf = xgb.XGBClassifier(random_state=42, scale_pos_weight=6, n_jobs=-1)
# print(cross_val_score(clf, train_data, f1_score))
# clf = xgb.XGBClassifier(random_state=42, scale_pos_weight=6, n_jobs=-1, learning_rate=0.15)
# print(cross_val_score(clf, train_data, f1_score))

In [0]:
# %%time
# Create the classifier with n_estimators=50; it is fitted in a later cell.
clf = MyAnotherClassifier(n_estimators=50)
# print(cross_val_score(clf, train_data, f1_score))

In [16]:
%%time
# Fit on the whole training set (all rows returned by load_data).
clf.fit(train_data['x'], train_data['y'])

Len of useful_features 131
CPU times: user 6min 23s, sys: 711 ms, total: 6min 24s
Wall time: 6min 25s


<__main__.MyAnotherClassifier at 0x7fa9b3cc4908>

In [17]:
# Load the test set; unlike the training file it is read with a comma separator.
# NOTE(review): load_data also drops rows with missing values and rows with
# chainlen > 1000, so fewer rows than the file contains may come back.
test_data = load_data(test_file_path, sep=',')

Data loaded


In [0]:
# Predicted labels for the rows of the loaded test set.
y_answer = clf.predict(test_data['x'])

In [19]:
# Sanity check: 0-to-1 ratio among the predictions next to the training ratio.
# Raises ZeroDivisionError if no row was predicted as 1.
len(y_answer[y_answer == 0]) / len(y_answer[y_answer == 1]), ratio 

(5.1010558069381595, 6.369751454257007)

In [0]:
# Use the sample submission as a template and overwrite its 'mg' column.
# NOTE(review): predictions are assigned by position, which assumes the sample
# submission rows are in the same order as the loaded test rows and that
# load_data dropped none of them — confirm.
df_answer = pd.read_csv(folder + "sample_submission.csv")
df_answer["mg"] = y_answer

In [21]:
# Show the last rows of the submission frame.
df_answer.tail()

Unnamed: 0,Id,mg
4040,4040,0
4041,4041,0
4042,4042,0
4043,4043,0
4044,4044,0


In [0]:
# Write the submission to the working directory; index=False leaves the
# DataFrame index out of the file.
df_answer.to_csv("to_submit.csv", index=False)

In [0]:
# Uses the google.colab package to download the submission file,
# so this cell only works when the notebook runs on Colab.
from google.colab import files
files.download("to_submit.csv")