In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import os
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from tqdm.notebook import tqdm
tqdm.pandas()

In [3]:
# Directory the train/test CSV files are read from (relative to the working directory).
data_dir = './kaggle/input'

# Resolve both file paths first, then load each table into its own DataFrame.
train_csv_path = os.path.join(data_dir, 'train.csv')
test_csv_path = os.path.join(data_dir, 'test.csv')
df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)

In [5]:
# Unordered categorical dtype over the four letters A/T/C/G.
# NOTE(review): `cat` is not referenced by any cell visible in this notebook.
cat = pd.CategoricalDtype(['A', 'T', 'C', 'G'])

# The same four letters as a plain list (note: the order differs from `cat`).
letters = list('ATGC')

# Longest substring length for which add_cols builds per-position columns.
max_length = 1


def add_cols(data: pd.DataFrame) -> pd.DataFrame:
    """Append one categorical column per (substring length, start position) of ``GuideSeq``.

    For every length ``l`` in ``1..max_length`` and every start ``pos`` at which a
    substring of that length fits, a column named ``pos_{l}_{i0:i1:...}`` is added
    holding ``GuideSeq[pos:pos + l]`` for each row.

    Every column of a given length uses the SAME fixed category set: all strings
    of that length over ``letters``, in sorted order. Letting pandas infer the
    categories per column (``astype('category')``) makes the integer codes depend
    on which letters happen to occur in that column of that frame, so a train
    frame and a test frame could encode the same letter differently. Sorting the
    alphabet reproduces the order pandas infers when every letter is present.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain a string column ``GuideSeq``. The sequence length is taken
        from the first row, so an empty frame raises ``IndexError``.

    Returns
    -------
    pd.DataFrame
        A new frame: the original columns followed by the generated ones.
        Substrings that are not in the fixed category set (unexpected letters,
        or rows shorter than the first one) become missing values.

    Notes
    -----
    Reads the notebook-level globals ``max_length`` and ``letters``.
    """
    seq_len = len(data['GuideSeq'].iloc[0])
    alphabet = sorted(letters)

    # Collect the new columns and concatenate once; concatenating inside the
    # loop re-copies the growing frame for every single column.
    new_columns = []
    for l in tqdm(range(1, max_length + 1)):
        # One shared dtype per substring length keeps category codes identical
        # across positions and across data sets.
        kmer_dtype = pd.CategoricalDtype(
            categories=[''.join(kmer) for kmer in product(alphabet, repeat=l)]
        )
        for pos in tqdm(range(0, seq_len - l + 1)):
            indices = ':'.join(str(i) for i in range(pos, l + pos))
            col_name = f'pos_{l}_{indices}'
            col_data = data["GuideSeq"].str.slice(start=pos, stop=pos + l)
            new_columns.append(col_data.rename(col_name).astype(kmer_dtype))

    return pd.concat([data, *new_columns], axis=1)


# Expand the train table first, then the test table, with the same add_cols transform.
data_expanded, test_data_expanded = map(add_cols, (df_train, df_test))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/23 [00:00<?, ?it/s]

In [8]:

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint

# Offset added to `std` before inverting it, so a zero value cannot divide by zero.
epsilon = 0.01


def get_model(X, y, std, inverse_std_weights=False, n_iter=5000):
    """Tune an XGBoost regressor with a randomized search and return the best model.

    A single ``RandomizedSearchCV`` is run over ``param_distributions`` using a
    shuffled 5-fold ``KFold`` (seed 42). The search refits the best parameter
    set on all of ``X``/``y`` with the same sample weights, so
    ``best_estimator_`` is returned directly without another ``fit`` call.

    Previously two identical searches were run back to back and the first one
    (the only user of the ``1 / (std + epsilon)`` weights) was overwritten, so
    its compute was wasted. The weights that actually shaped the returned model
    were the raw ``std`` values; that stays the default here, and the inverse
    weighting is available through ``inverse_std_weights``.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix (categorical columns are allowed: the regressor is
        created with ``enable_categorical=True``).
    y : array-like
        Regression target, one value per row of ``X``.
    std : array-like
        Per-row values the sample weights are derived from.
    inverse_std_weights : bool, default False
        ``False``: weights are ``std`` as given. ``True``: weights are
        ``1 / (std + epsilon)``.
    n_iter : int, default 5000
        Number of parameter settings the search samples.

    Returns
    -------
    The fitted best estimator found by the search.

    Notes
    -----
    Prints the mean squared error of the returned model on the data it was
    trained on. This is an in-sample figure, not a held-out test score.
    """
    if inverse_std_weights:
        sample_weights = 1 / (std + epsilon)
    else:
        sample_weights = std

    # scipy's uniform(loc, scale) samples from [loc, loc + scale];
    # randint(low, high) samples integers from [low, high).
    # NOTE(review): uniform(0, 1) can draw subsample values very close to 0 -
    # confirm this range is intended.
    param_distributions = {
        'n_estimators': randint(50, 2000),
        'max_depth': randint(3, 12),
        'learning_rate': uniform(0.01, 0.3),
        'subsample': uniform(0, 1),
        'reg_alpha': uniform(0, 15.0),
        'reg_lambda': uniform(0, 15.0)
    }

    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', enable_categorical=True)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_distributions,
        n_iter=n_iter,
        cv=kf,
        random_state=42,
        n_jobs=-1,
        # Fail loudly if any candidate cannot be fitted instead of scoring it NaN.
        error_score="raise"
    )

    random_search.fit(X, y, sample_weight=sample_weights)

    # Already refitted on the full data by the search (refit defaults to True).
    best_model = random_search.best_estimator_

    y_pred = best_model.predict(X)
    mse = mean_squared_error(y, y_pred)
    # Computed on the training data, so label it as such.
    print(f"Train Mean Squared Error: {mse}")
    return best_model

# Target columns; one model is tuned per target.
features = [
    'Fraction_Insertions',
    'Avg_Deletion_Length',
    'Fraction_Frameshifts',
    'Indel_Diversity',
]

# Inputs shared by every target: the columns whose name starts with 'pos_1',
# and the 'Indel_Diversity' column, which is passed as get_model's `std` argument.
train_inputs = data_expanded.filter(regex='^pos_1')
train_std = data_expanded['Indel_Diversity']

models = {}
for feature in features:
    target_values = data_expanded[feature]
    models[feature] = get_model(train_inputs, target_values, train_std)


Test Mean Squared Error: 0.00350700079400971




Test Mean Squared Error: 6.149020498259376




Test Mean Squared Error: 0.01004317876587475




Test Mean Squared Error: 0.0004897668182273838


In [10]:
# One row per fitted model: the target name followed by that model's get_params() values.
x = pd.DataFrame(
    [
        {'name': target_name, **models[target_name].get_params()}
        for target_name in features
    ]
)
x

Unnamed: 0,name,objective,base_score,booster,callbacks,colsample_bylevel,colsample_bynode,colsample_bytree,device,early_stopping_rounds,...,num_parallel_tree,random_state,reg_alpha,reg_lambda,sampling_method,scale_pos_weight,subsample,tree_method,validate_parameters,verbosity
0,Fraction_Insertions,reg:squarederror,,,,,,,,,...,,,1.463963,10.918536,,,0.968941,,,
1,Avg_Deletion_Length,reg:squarederror,,,,,,,,,...,,,4.509835,8.671126,,,0.375938,,,
2,Fraction_Frameshifts,reg:squarederror,,,,,,,,,...,,,2.571513,6.803685,,,0.454872,,,
3,Indel_Diversity,reg:squarederror,,,,,,,,,...,,,0.236144,0.257247,,,0.916603,,,


In [9]:

# Build the submission table: predict every target for the test rows and write them to CSV.
output_path = './kaggle/output/out.csv'

t = test_data_expanded.filter(regex='^pos_|Id').copy()

# Select the model inputs once, with the same '^pos_1' pattern the models were
# trained on; '^pos_' would pick up extra columns as soon as max_length > 1.
test_inputs = test_data_expanded.filter(regex='^pos_1')

# 'Indel_Diversity' goes in front, the remaining targets are appended at the end.
t.insert(0, 'Indel_Diversity', models['Indel_Diversity'].predict(test_inputs))
for target in ['Fraction_Insertions', 'Avg_Deletion_Length', 'Fraction_Frameshifts']:
    t[target] = models[target].predict(test_inputs)

# to_csv does not create missing directories, so make sure the output folder exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
t[['Id', 'Fraction_Insertions', 'Avg_Deletion_Length', 'Indel_Diversity', 'Fraction_Frameshifts']].to_csv(output_path, index=False)



Unnamed: 0,Indel_Diversity,Id,pos_1_0,pos_1_1,pos_1_2,pos_1_3,pos_1_4,pos_1_5,pos_1_6,pos_1_7,...,pos_1_16,pos_1_17,pos_1_18,pos_1_19,pos_1_20,pos_1_21,pos_1_22,Fraction_Insertions,Avg_Deletion_Length,Fraction_Frameshifts
0,4.061373,0,T,G,T,G,C,A,A,T,...,C,T,A,A,G,G,G,0.172309,7.136198,0.685497
1,3.049605,1,T,G,T,C,T,G,G,C,...,T,A,C,A,G,G,G,0.344549,6.434199,0.763296
2,3.660302,2,A,C,T,G,A,G,A,G,...,A,A,A,G,T,G,G,0.221601,6.731800,0.695369
3,3.589038,3,G,T,T,C,T,G,C,A,...,T,T,C,A,C,G,G,0.284132,7.139051,0.752764
4,3.870360,4,A,C,T,G,G,A,T,G,...,T,G,G,T,G,G,G,0.183288,8.072345,0.640315
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299,4.222140,299,T,A,T,G,A,A,G,C,...,G,A,G,T,C,G,G,0.083080,6.868105,0.662464
300,3.870834,300,A,T,G,T,T,G,G,A,...,C,G,G,T,G,G,G,0.211601,7.704510,0.660129
301,3.601058,301,G,A,C,C,A,G,G,G,...,T,T,A,A,A,G,G,0.178281,7.387087,0.722990
302,4.042663,302,A,G,T,A,G,A,C,T,...,A,G,G,A,G,G,G,0.165649,8.882483,0.706374
