In [None]:
from itertools import permutations
from pathlib import Path
from shutil import unpack_archive

import numpy as np
import pandas as pd
from numpy.random import permutation
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [None]:
# Directory holding the three input CSV files. Edit this single constant to run
# the notebook on another machine instead of touching every read_csv call.
DATA_DIR = Path(r'/Users/sabbrielle/Documents/Lambda')

# Every file is loaded with its `sequence_id` column as the row index.
train_values = pd.read_csv(DATA_DIR / 'train_values.csv', index_col='sequence_id')
train_labels = pd.read_csv(DATA_DIR / 'train_labels.csv', index_col='sequence_id')
test_values = pd.read_csv(DATA_DIR / 'test_values.csv', index_col='sequence_id')

In [None]:
# Number of characters in each training sequence (vectorised string accessor
# rather than a Python-level apply(len)).
sequence_lengths = train_values.sequence.str.len()
# Per-column value counts for every column after the first one.
# `pd.value_counts` is deprecated in recent pandas; `Series.value_counts` is the
# supported equivalent. Kept as the last expression so the cell displays it.
train_values.iloc[:, 1:].apply(pd.Series.value_counts)

In [None]:
# Column means of everything after the first column, ordered ascending.
sorted_binary_features = (
    train_values
    .iloc[:, 1:]
    .mean()
    .sort_values()
)
# For each row of the label frame, the name of the column holding the row
# maximum, stored as a one-column DataFrame named `lab_id`.
lab_ids = train_labels.idxmax(axis=1).to_frame(name='lab_id')

In [None]:
# Every distinct character that occurs in any training sequence.
bases = set(''.join(train_values.sequence.values))
# Permute the bases in sorted order: iterating a set of strings directly can
# yield a different order in each interpreter session, which would reorder the
# generated feature columns from run to run.
# NOTE(review): permutations() never repeats a base inside a subsequence (e.g.
# no 'AAAA'); itertools.product would be needed to cover every 4-mer -- confirm
# which is intended.
subsequences = [''.join(ordering) for ordering in permutations(sorted(bases), r=4)]

In [None]:
def get_ngram_features(data, subsequences):
    """Count how often each subsequence occurs in every sequence.

    Each subsequence is passed to ``Series.str.count``, so it is interpreted as
    a regular-expression pattern and matches are counted without overlap.

    Args:
        data (DataFrame): The data you want to create features from. Must include a "sequence" column.
        subsequences (iterable of str): The subsequences to count.

    Returns:
        DataFrame: Indexed like ``data``, with one column per distinct
            subsequence, in order of first appearance.
    """
    sequences = data.sequence

    # Compute every column first and assemble the frame in a single step.
    # Inserting columns one at a time fragments the DataFrame and makes pandas
    # emit a PerformanceWarning once there are many columns.
    counts = {subseq: sequences.str.count(subseq) for subseq in subsequences}

    # pd.concat refuses an empty collection; keep returning an empty, correctly
    # indexed frame when there is nothing to count.
    if not counts:
        return pd.DataFrame(index=data.index)

    return pd.concat(counts, axis=1)

In [None]:
# One count column per candidate subsequence, computed on the training sequences.
ngram_features = get_ngram_features(
    data=train_values,
    subsequences=subsequences,
)

In [None]:
# Combine the n-gram counts with every original column except the raw sequence
# text; rows are matched on the shared index.
non_sequence_columns = train_values.drop(columns='sequence')
all_features = ngram_features.join(non_sequence_columns)

In [None]:
def top10_accuracy_scorer(estimator, X, y):
    """A custom scorer that evaluates a model on whether the correct label is in
    the top 10 most probable predictions.

    When the estimator knows fewer than 10 classes, all of them are considered.

    Args:
        estimator (sklearn estimator): The fitted model to evaluate. Must expose
            ``predict_proba`` and ``classes_``.
        X (array-like): The data to score.
        y (array-like): The ground truth labels, one per row of ``X``. May be a
            numpy array, a list or a pandas Series.

    Returns:
        float: Accuracy of the model as defined by the proportion of predictions
               in which the correct label was in the top 10. Higher is better.
    """
    # predicted probability of every known class for each row of X
    probas = np.asarray(estimator.predict_proba(X))
    classes = np.asarray(estimator.classes_)

    # column vector of true labels; np.asarray lets lists and pandas objects
    # through, which do not all provide .reshape themselves
    y_true = np.asarray(y).reshape(-1, 1)

    # never ask for more candidates than there are classes, otherwise
    # argpartition raises on an out-of-bounds kth
    top_k = min(10, probas.shape[1])

    # indices of the top_k most probable classes per row; these are the last
    # top_k entries after partitioning. A partition is enough because only
    # membership in the top group matters, not the order inside it.
    top10_idx = np.argpartition(probas, -top_k, axis=1)[:, -top_k:]

    # translate the column indices into class names
    top10_preds = classes[top10_idx]

    # a row is a hit when its true label appears among its top predictions
    mask = top10_preds == y_true

    # proportion of hits
    top_10_accuracy = mask.any(axis=1).mean()

    return top_10_accuracy

In [None]:
# Feature matrix: the combined feature frame, used as-is.
X = all_features
# Target vector: the one-column label frame flattened to a 1-D array.
y = lab_ids.to_numpy().ravel()

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Three-component PCA with whitening enabled.
pca_whiten = PCA(n_components=3, random_state=42, whiten=True)

# Work on copies so the original splits are left untouched.
X_train_pca_2 = X_train.copy()
X_val_pca_2 = X_val.copy()
# Fit on the training split only, then apply the same fitted projection to the
# validation split. This must use `pca_whiten`, the estimator created above; the
# previous code called an undefined `pca`, which fails on a fresh kernel (or
# silently reuses whatever stale estimator happens to carry that name).
princ_comp_train_whitened = pca_whiten.fit_transform(X_train_pca_2)
princ_comp_val_whitened = pca_whiten.transform(X_val_pca_2)

In [None]:
# Wrap the projected training components in a frame that shares X_train's
# index, so the components can later be lined up with the original features.
pca_train_whitened = pd.DataFrame(
    princ_comp_train_whitened,
    index=X_train.index,
    columns=['PC1', 'PC2', 'PC3'],
)

# Same wrapping for the validation components, indexed like X_val.
pca_val_whitened = pd.DataFrame(
    princ_comp_val_whitened,
    index=X_val.index,
    columns=['PC1', 'PC2', 'PC3'],
)

In [None]:
# Append the three principal-component columns to the right of the original
# feature columns, for the training and validation splits alike.
X_train_w_pca_whitened = pd.concat(
    (X_train, pca_train_whitened),
    axis='columns',
)
X_val_w_pca_whitened = pd.concat(
    (X_val, pca_val_whitened),
    axis='columns',
)

In [None]:
# Gradient-boosted classifier trained on the original features plus the three
# whitened principal components. n_jobs=-1 asks for all available cores.
# NOTE(review): max_delta_step=1 is presumably there to steady training on
# imbalanced classes -- confirm the intent.
# NOTE(review): y_train holds string lab ids; some XGBoost releases require
# integer-encoded class labels -- confirm against the installed version.
model_xgb_4_w_pca = XGBClassifier(
    random_state=42,
    verbosity=1,
    n_jobs=-1,
    max_delta_step=1,
)

# Left as the last expression so the cell displays the fitted estimator.
model_xgb_4_w_pca.fit(X_train_w_pca_whitened, y_train)

In [None]:
# Report the top-10 accuracy (see top10_accuracy_scorer) on the data the model
# was fitted on and on the held-out validation split.
print(
    'Training accuracy using XGBoost and 3 (whitened) PCA:',
    top10_accuracy_scorer(model_xgb_4_w_pca, X_train_w_pca_whitened, y_train),
)
print(
    'Validation accuracy using XGBoost and 3 (whitened) PCA:',
    top10_accuracy_scorer(model_xgb_4_w_pca, X_val_w_pca_whitened, y_val),
)