In [1]:
%load_ext autoreload
%autoreload 2

In [4]:
import numpy as np
import pandas as pd
import os
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import product
from tqdm.notebook import tqdm
tqdm.pandas()

In [5]:
data_dir = './kaggle/input'
df_train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
df_test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

In [6]:
df_train

Unnamed: 0,Id,GuideSeq,Fraction_Insertions,Avg_Deletion_Length,Indel_Diversity,Fraction_Frameshifts
0,0,CTGCAGGGCTAGTTTCCTATAGG,0.069572,4.301844,3.536538,0.807375
1,1,GAGATGCGGACCACCCAGCTGGG,0.287647,10.814444,3.871165,0.665696
2,2,GCAAACGGAAGTGCAATTGTCGG,0.137004,9.888889,3.931298,0.684823
3,3,GTCATCGCTGAGTTGAGGAAGGG,0.093889,4.527812,3.523067,0.753003
4,4,ATATGATTATCCCTGCACAAGGG,0.526525,6.415644,2.828101,0.887214
...,...,...,...,...,...,...
1060,1060,ATGCCCGACCAAAGACAACCAGG,0.093236,3.641787,3.170302,0.684801
1061,1061,CACGCTGTCATCCACCAGGTAGG,0.069054,7.661831,4.284711,0.639433
1062,1062,GGGCTCCAGAGTCTGATACAGGG,0.421439,7.364856,3.175859,0.832975
1063,1063,AAAGACTTCGGTCCTCTAGTAGG,0.206277,7.907886,3.782588,0.528375


In [50]:
cat = pd.CategoricalDtype(categories=['A', 'T', 'C', 'G'])

def add_cols(df: pd.DataFrame):
    split_columns = df['GuideSeq'].apply(lambda x: pd.Series(list(x)))
    split_columns.columns = [f'pos_{i+1}' for i in range(split_columns.shape[1])]
    exp = pd.concat([df, split_columns], axis=1)
    for col in exp.columns:
        if col.startswith('pos_'):
            exp[col] = exp[col].astype(cat)
    return exp

data_expanded = add_cols(df_train)
test_data_expanded = add_cols(df_test)


In [51]:

import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import mean_squared_error
from scipy.stats import uniform, randint

# Select columns that start with 'pos_' for training
X = data_expanded.filter(regex='^pos_')

def get_model(param):
    y = data_expanded[param]

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    param_distributions = {
        'n_estimators': randint(50, 100),  # Reduced range for efficiency
        'max_depth': randint(3, 7),  # Limited depth to prevent overfitting
        'learning_rate': uniform(0.01, 0.1),  # Stable range for learning rate
        'subsample': uniform(0, 1),  # Corrected range [0, 1] for subsample
        'colsample_bytree': uniform(0, 1),  # Corrected range [0, 1] for colsample_bytree
        'reg_alpha': uniform(0, 1.0),  # Regularization parameters
        'reg_lambda': uniform(1.0, 3.0)
    }

    # Initialize the XGBoost model with enable_categorical=True
    xgb_model = xgb.XGBRegressor(objective='reg:squarederror', enable_categorical=True)
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    random_search = RandomizedSearchCV(
        estimator=xgb_model,
        param_distributions=param_distributions,
        n_iter=50,
        scoring='neg_mean_squared_error',
        cv=kf,  # Use the KFold object here
        random_state=42,
        n_jobs=-1,
        error_score="raise"
    )

    # Fit the randomized search on the training data
    random_search.fit(X_train, y_train)

    # Display the best parameters and best score
    # print("Best Parameters:", random_search.best_params_)
    # print("Best Score (MSE):", -random_search.best_score_)

    # Train the best model on the training data
    best_model = random_search.best_estimator_
    best_model.fit(X_train, y_train)

    # Predict on the test set and evaluate
    y_pred = best_model.predict(X_test)
    mean_squared_error(y_test, y_pred)
    # print(f"Test Mean Squared Error: {mse}")
    return best_model

features = ['Fraction_Insertions', 'Avg_Deletion_Length', 'Indel_Diversity', 'Fraction_Frameshifts']
models = {}
for feature in features:
    y_train = data_expanded[feature]
    models[feature] = get_model(feature)
    print(f"Trained model for {feature} with best parameters: {models[feature].get_params()}")



Trained model for Fraction_Insertions with best parameters: {'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': np.float64(0.659984046034179), 'device': None, 'early_stopping_rounds': None, 'enable_categorical': True, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': np.float64(0.09172222002012158), 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 3, 'max_leaves': None, 'min_child_weight': None, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': 98, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': None, 'reg_alpha': np.float64(0.24185229090045168), 'reg_lambda': np.float64(1.2793083034176975), 'sampling_method': None, 'scale_pos_weight': None, 'subsample': np.float64(0.89

In [56]:

X_test = test_data_expanded.filter(regex='^pos_')

predictions = {}
for feature, model in models.items():
    df_test[feature] = model.predict(X_test)


Unnamed: 0,Id,GuideSeq,Fraction_Insertions,Avg_Deletion_Length,Indel_Diversity,Fraction_Frameshifts
0,0,TGTGCAATATCTGGTACTAAGGG,0.179900,7.735808,3.892301,0.720439
1,1,TGTCTGGCCAGCAGAATACAGGG,0.342629,6.418310,3.319548,0.779486
2,2,ACTGAGAGTGGATCCGAAAGTGG,0.259423,6.334791,3.462435,0.700852
3,3,GTTCTGCACCAGCACATTCACGG,0.276138,7.440110,3.633102,0.763192
4,4,ACTGGATGGACAAGACTGGTGGG,0.193415,7.369278,3.914264,0.674920
...,...,...,...,...,...,...
299,299,TATGAAGCATGGGTCTGAGTCGG,0.086350,6.339442,4.184672,0.658556
300,300,ATGTTGGACAGCTGCCCGGTGGG,0.164188,6.913840,3.575523,0.675335
301,301,GACCAGGGCCACATCTTTAAAGG,0.174302,7.113712,3.775306,0.725926
302,302,AGTAGACTGACCTGGTAGGAGGG,0.240255,8.443419,4.086989,0.701058


In [62]:
df_test.drop(columns=['GuideSeq']).to_csv('./kaggle/output/out.csv', index=False)