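This notebook performs ANOVA-based feature selection on the 20% data split from the 60-20-20 train-ANOVA-test split obtained from the xgb notebook. The 9959301 k-mer features are read in chunks of size 30000, and ANOVA is used to get the top 10 features in each chunk; the notebook ends with the selected top 3320 features for gradient-boosted model and SVM training. Note: the code cells below currently read chunks of 40000 rows and rank the features of each chunk by random-forest SHAP values rather than by ANOVA, so the chunk size and feature count quoted above should be re-checked against the code.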

In [1]:
import pandas as pd
import pyarrow.parquet as pq
import pyarrow as pa
import numpy as np
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    cross_val_score,
    cross_validate,
    train_test_split,
)
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import shap
# import eli5

In [1]:
# Scratch cell: prints the literal 1 and does not touch any analysis state.
print(1)

1


In [3]:
# Stream the feature table instead of loading it in one go: with `chunksize`
# set, read_csv returns an iterator that yields DataFrames of up to 40000 rows.
# The iterator can be consumed only once, so re-run this cell before
# re-running the loop that walks over `chunks`.
chunks = pd.read_csv(
    "ANOVA_before_transpose.csv",
    chunksize=40000,
)

In [4]:
# Load the species labels, flip the table so that each genome becomes a row,
# and give the single resulting column (label 0) the name 'species'.
df_species = (
    pd.read_parquet('final_df_species.parquet')
    .T
    .rename(columns={0: 'species'})
)

In [5]:
df_species

Unnamed: 0,species
GCF_900446045.1_58932_D01.genomic.fna.gz,Brucella abortus
GCF_005938105.1_ASM593810v1.genomic.fna.gz,Brucella haematophila
GCF_000413955.1_Bruc_abor_biovar_1_B10-0091_V1.genomic.fna.gz,Brucella abortus
GCF_022024335.1_ASM2202433v1.genomic.fna.gz,Brucella pseudogrignonensis
GCF_002191835.1_ASM219183v1.genomic.fna.gz,Brucella melitensis
...,...
GCF_000698245.1_ASM69824v1.genomic.fna.gz,Brucella suis
GCF_003993895.1_ASM399389v1.genomic.fna.gz,Brucella melitensis
GCF_023651035.1_ASM2365103v1.genomic.fna.gz,Brucella abortus
GCF_017177355.1_ASM1717735v1.genomic.fna.gz,Brucella melitensis


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

In [7]:
class MultiColumnLabelEncoder:
    """Integer-encode one or more DataFrame columns with ``LabelEncoder``.

    A fresh ``LabelEncoder`` is fitted for every column each time
    ``transform`` is called; nothing is learned in ``fit`` and no mapping is
    stored on the instance.
    """

    def __init__(self,columns = None):
        # Names of the columns to encode; None means "encode every column".
        self.columns = columns

    def fit(self,X,y=None):
        """Do nothing and return ``self`` (present to offer a fit/transform interface)."""
        return self

    def transform(self,X):
        '''
        Return a copy of X in which the columns listed in self.columns are
        replaced by their LabelEncoder() integer codes. If no columns were
        specified, every column of X is encoded. X itself is not modified.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; iterating over
            # the column labels works on every pandas version and avoids
            # assigning into the frame while iterating over its items.
            for colname in list(output.columns):
                output[colname] = LabelEncoder().fit_transform(output[colname])
        return output

    def fit_transform(self,X,y=None):
        """Shorthand for ``fit(X, y)`` followed by ``transform(X)``."""
        return self.fit(X,y).transform(X)

In [8]:
# Integer-coded copy of the species labels. Note that the per-chunk loop
# joins the string-labelled `df_species`, not this encoded frame.
df_species_enc = (
    MultiColumnLabelEncoder(columns=['species'])
    .fit_transform(df_species)
)
df_species_enc

Unnamed: 0,species
GCF_900446045.1_58932_D01.genomic.fna.gz,0
GCF_005938105.1_ASM593810v1.genomic.fna.gz,10
GCF_000413955.1_Bruc_abor_biovar_1_B10-0091_V1.genomic.fna.gz,0
GCF_022024335.1_ASM2202433v1.genomic.fna.gz,23
GCF_002191835.1_ASM219183v1.genomic.fna.gz,14
...,...
GCF_000698245.1_ASM69824v1.genomic.fna.gz,26
GCF_003993895.1_ASM399389v1.genomic.fna.gz,14
GCF_023651035.1_ASM2365103v1.genomic.fna.gz,0
GCF_017177355.1_ASM1717735v1.genomic.fna.gz,14


In [9]:
# Accumulator for the feature names picked from each chunk; re-run this cell
# before re-running the chunk loop so that results are not appended twice.
selected_features = list()

In [10]:
# Number of features kept from every chunk.
TOP_K_PER_CHUNK = 10


def _mean_abs_shap(values):
    """Collapse a raw SHAP value array into one non-negative score per feature.

    ``values`` must have the features on axis 1: ``(n_samples, n_features)``
    or, for multi-class models, ``(n_samples, n_features, n_classes)``.
    Absolute values are averaged over every axis except the feature axis, so
    positive and negative contributions cannot cancel each other out.
    """
    magnitudes = np.abs(np.asarray(values))
    other_axes = tuple(ax for ax in range(magnitudes.ndim) if ax != 1)
    return magnitudes.mean(axis=other_axes)


# For every chunk: fit a random forest on the chunk's features, rank the
# features by mean |SHAP| and remember the best TOP_K_PER_CHUNK names.
# (An earlier ANOVA variant based on SelectKBest(f_classif, k=10) was left
# here as commented-out code; it referenced names that are not defined in
# this notebook and has been dropped.)
for chunk in chunks:
    # The first CSV column is used as the index; presumably it holds the
    # feature (k-mer) identifiers, so that the transpose below yields one row
    # per genome -- TODO confirm against the CSV layout.
    column_name = chunk.columns[0]
    chunk = chunk.set_index(column_name)
    chunk.index.name = None
    X = chunk.T

    # Attach the species label to every sample via the shared index.
    data = X.join(df_species)
    x_SHAP = data.drop(columns=["species"])
    y_SHAP = data["species"]

    rf_model = RandomForestClassifier(random_state=123, n_jobs=-1).fit(x_SHAP,y_SHAP)
    explainer = shap.TreeExplainer(rf_model)
    shap_values = explainer(x_SHAP)

    # Raw SHAP array of the last processed chunk, kept for inspection in a
    # later cell.
    checker = shap_values.values

    # Rank by mean absolute SHAP value. The previous code discarded the
    # result of np.abs() and ranked on a signed, still per-class mean.
    feature_importance = pd.DataFrame({
        'feature': x_SHAP.columns,
        'importance': _mean_abs_shap(checker),
    })
    # A stable sort keeps ties in column order, so reruns pick the same features.
    feature_importance = feature_importance.sort_values(
        'importance', ascending=False, kind='mergesort'
    )
    top_features = feature_importance.head(TOP_K_PER_CHUNK)['feature'].values

    selected_features.extend(top_features)

In [11]:
checker

array([[[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0.

In [12]:
# print(selected_features)

In [13]:
# final_selected_features