In [7]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leash-BELKA/sample_submission.csv
/kaggle/input/leash-BELKA/train.parquet
/kaggle/input/leash-BELKA/test.parquet
/kaggle/input/leash-BELKA/train.csv
/kaggle/input/leash-BELKA/test.csv


In [8]:
pip install pandas scikit-learn numpy tqdm pyarrow rdkit


Note: you may need to restart the kernel to use updated packages.


# Incremental Ensemble Model Training

This script trains an ensemble model on a large Parquet dataset using incremental learning techniques:

## Libraries:
- Pandas, Scikit-learn (SGDClassifier, GaussianNB, VotingClassifier), RDKit, PyArrow, tqdm.
  
## Functions:
- Converts SMILES to feature vectors.
- Reads Parquet data in chunks.
- Samples, preprocesses, and encodes categorical variables.
- Fits SGD and Gaussian naive Bayes base models and combines them in a soft-voting ensemble.
  


Efficiently handles large datasets while training complex models, ensuring memory and processing efficiency.


In [None]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import numpy as np
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
from rdkit import Chem
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

# Function to convert SMILES to feature vector
def smiles_to_features(smiles):
    """Convert a SMILES string into a fingerprint feature vector.

    Args:
        smiles: SMILES representation of a molecule.

    Returns:
        A 1-D NumPy float array holding the bits of the RDKit topological
        fingerprint (``Chem.RDKFingerprint``), or ``None`` when ``smiles`` is
        not a string or RDKit cannot parse it.
    """
    # Missing values (None / NaN) are not valid input for MolFromSmiles;
    # report them the same way as an unparsable string.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # RDKit topological (path-based) fingerprint -- note this is NOT a Morgan
    # fingerprint.
    fp = Chem.RDKFingerprint(mol)
    # ConvertToNumpyArray is expected to resize the destination array to the
    # fingerprint length, so the initial shape is only a placeholder.
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp, arr)

    return arr

# Function to preprocess data
def preprocess_data(data):
    """Build the feature matrix and label vector for a training frame.

    Args:
        data: DataFrame with a ``molecule_smiles`` column and a ``binds``
            label column.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays. Rows whose SMILES cannot be turned
        into features are dropped from BOTH arrays, so ``X[i]`` always
        corresponds to ``y[i]``.
    """
    X = []
    y = []

    # Iterate the two needed columns directly; iterrows() would build a full
    # Series object for every row.
    pairs = zip(data['molecule_smiles'], data['binds'])
    for smiles, label in tqdm(pairs, total=len(data)):
        features = smiles_to_features(smiles)
        if features is None:
            # Skip the label as well, otherwise X and y drift out of alignment.
            continue
        X.append(features)
        y.append(label)

    return np.array(X), np.array(y)

# Function to read data in chunks using PyArrow
def read_parquet_in_chunks(file_path, chunk_size):
    """Yield the contents of a Parquet file one row group at a time.

    Args:
        file_path: Path of the Parquet file to read.
        chunk_size: Accepted for signature compatibility but not used here;
            every yielded frame is one whole row group, whatever its size.

    Yields:
        A pandas DataFrame per row group, in file order.
    """
    source = pq.ParquetFile(file_path)
    for group_index in range(source.num_row_groups):
        yield source.read_row_group(group_index).to_pandas()

# Function to sample and preprocess data chunks
def sample_and_preprocess(file_path, sample_size, chunk_size):
    """Draw up to ``sample_size`` rows from a Parquet file, chunk by chunk.

    Chunks are consumed in file order; each contributes a random sample
    (fixed seed 42) until the requested number of rows has been collected.
    Despite the name, no feature preprocessing happens here -- the raw rows
    are returned.

    Args:
        file_path: Path of the Parquet file.
        sample_size: Total number of rows wanted.
        chunk_size: Forwarded to ``read_parquet_in_chunks``.

    Returns:
        A DataFrame with the sampled rows, or ``None`` if nothing was sampled.
    """
    sampled_chunks = []
    remaining_sample_size = sample_size

    for chunk in read_parquet_in_chunks(file_path, chunk_size):
        if remaining_sample_size <= 0:
            break
        if len(chunk) == 0:
            # An empty chunk has nothing to sample (and previously caused a
            # division by zero when computing the sampling fraction).
            continue
        # Ask for an exact row count instead of a fraction to avoid
        # floating-point rounding of the sample size.
        n_rows = min(int(remaining_sample_size), len(chunk))
        sampled_chunk = chunk.sample(n=n_rows, random_state=42)
        sampled_chunks.append(sampled_chunk)
        remaining_sample_size -= len(sampled_chunk)

    if not sampled_chunks:
        return None
    return pd.concat(sampled_chunks)

# Function to train ensemble model in chunks
def train_ensemble_in_chunks(file_path, sample_size, chunk_size):
    """Train a soft-voting SGD + Gaussian naive Bayes ensemble on a data sample.

    A sample of the Parquet file is drawn, converted to fingerprint features,
    split 80/20 into train and hold-out sets, and used to fit a
    ``VotingClassifier``. Training and hold-out accuracy are printed.

    Args:
        file_path: Path of the training Parquet file.
        sample_size: Number of rows to sample for training.
        chunk_size: Forwarded to ``sample_and_preprocess``.

    Returns:
        The fitted ``VotingClassifier``, or ``None`` when no data was sampled.

    Raises:
        ValueError: If the sample contains fewer than two classes, or if the
            feature matrix and label vector have different lengths.
    """
    sampled_df = sample_and_preprocess(file_path, sample_size, chunk_size)
    if sampled_df is None:
        print("No data sampled.")
        return

    # Check class distribution
    class_distribution = sampled_df['binds'].value_counts()
    print("Class distribution before training:", class_distribution)

    # Ensure there are at least two classes
    if len(class_distribution) < 2:
        raise ValueError("The number of classes has to be greater than one; got {} class".format(len(class_distribution)))

    # Encode categorical variable.
    # NOTE: the encoded protein_name is not part of the feature matrix built
    # below (features come from molecule_smiles only).
    le = LabelEncoder()
    sampled_df['protein_name'] = le.fit_transform(sampled_df['protein_name'])

    # Preprocess train data
    X, y = preprocess_data(sampled_df)
    if len(X) != len(y):
        raise ValueError(
            "Feature/label length mismatch: {} feature rows vs {} labels".format(len(X), len(y))
        )

    # Split data into train and test sets. The positive class is rare, so keep
    # the class ratio identical in both splits whenever a stratified split is
    # feasible (every class needs >= 2 rows and each split needs room for
    # every class).
    test_fraction = 0.2
    _, label_counts = np.unique(y, return_counts=True)
    n_test_rows = int(np.ceil(test_fraction * len(y)))
    can_stratify = (
        len(label_counts) > 1
        and label_counts.min() >= 2
        and n_test_rows >= len(label_counts)
        and len(y) - n_test_rows >= len(label_counts)
    )
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_fraction,
        random_state=42,
        stratify=y if can_stratify else None,
    )

    # Base models. 'log_loss' is the current name of the logistic loss; the
    # old alias 'log' was deprecated and later removed from scikit-learn.
    sgd_model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3, random_state=42)
    nb_model = GaussianNB()

    # No separate partial_fit on the base models: VotingClassifier.fit clones
    # each estimator and fits the clone from scratch, so any prior fitting of
    # sgd_model / nb_model would be thrown away.
    ensemble_model = VotingClassifier(estimators=[
        ('sgd', sgd_model),
        ('nb', nb_model)
    ], voting='soft')

    # Fit ensemble model on the training data
    ensemble_model.fit(X_train, y_train)

    # Training accuracy
    y_train_pred = ensemble_model.predict(X_train)
    train_accuracy = accuracy_score(y_train, y_train_pred)
    print("Training accuracy:", train_accuracy)

    # Hold-out accuracy
    y_test_pred = ensemble_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print("Test accuracy:", test_accuracy)

    return ensemble_model

# Example usage for training ensemble model in chunks
# Training configuration: source file, rows to sample, and chunk size.
train_file_path = '/kaggle/input/leash-BELKA/train.parquet'
sample_size_train = 100_000
chunk_size_train = 5_000

# Fit the ensemble; the result is reused by the evaluation cell further down.
best_ensemble_model = train_ensemble_in_chunks(
    file_path=train_file_path,
    sample_size=sample_size_train,
    chunk_size=chunk_size_train,
)


Class distribution before training: binds
0    99754
1      246
Name: count, dtype: int64


  0%|          | 0/100000 [00:00<?, ?it/s]

# Overview of Evaluating Ensemble Model on Test Data in Chunks

## Problem Description
- Evaluates an ensemble model's performance on test data stored in a Parquet file (`test.parquet`).
- Data is processed in chunks due to its size, ensuring efficient evaluation and prediction.

## Data Preparation
- Reads test data in chunks using PyArrow for efficient memory usage.
- Samples a specified number of data points (`sample_size_test`) from the test dataset.

## Preprocessing
- Converts SMILES representations of molecules into feature vectors using RDKit topological fingerprints (`Chem.RDKFingerprint`).
- Encodes categorical variable (`protein_name`) using `LabelEncoder` for compatibility with machine learning models.

## Model Evaluation
- Evaluates the pre-trained ensemble model on preprocessed test data chunks.
- Predicts binding probabilities using the ensemble model and outputs the results.

## Output
- Saves the predictions to a CSV file named `test_predictions.csv` for further analysis.
- Displays the first few rows of the prediction results to verify output correctness.





In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import pyarrow.parquet as pq
from rdkit import Chem
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

# Function to convert SMILES to feature vector
def smiles_to_features(smiles):
    """Convert a SMILES string into a fingerprint feature vector.

    Args:
        smiles: SMILES representation of a molecule.

    Returns:
        A 1-D NumPy float array holding the bits of the RDKit topological
        fingerprint (``Chem.RDKFingerprint``), or ``None`` when ``smiles`` is
        not a string or RDKit cannot parse it.
    """
    # Missing values (None / NaN) are not valid input for MolFromSmiles;
    # report them the same way as an unparsable string.
    if not isinstance(smiles, str):
        return None

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    # RDKit topological (path-based) fingerprint -- note this is NOT a Morgan
    # fingerprint.
    fp = Chem.RDKFingerprint(mol)
    # ConvertToNumpyArray is expected to resize the destination array to the
    # fingerprint length, so the initial shape is only a placeholder.
    arr = np.zeros((1,))
    Chem.DataStructs.ConvertToNumpyArray(fp, arr)

    return arr

# Function to preprocess data
def preprocess_data(data):
    """Build the feature matrix for an unlabeled frame.

    Args:
        data: DataFrame with a ``molecule_smiles`` column.

    Returns:
        NumPy array with one feature row per molecule that could be
        converted. Rows for which ``smiles_to_features`` returns ``None`` are
        skipped, so the result can be SHORTER than ``data``; callers that
        need row alignment must account for that.
    """
    # Iterate the single needed column directly; iterrows() would build a
    # full Series object for every row.
    smiles_column = data['molecule_smiles']
    converted = (
        smiles_to_features(smiles)
        for smiles in tqdm(smiles_column, total=len(data))
    )
    X = [features for features in converted if features is not None]

    return np.array(X)

# Function to read data in chunks using PyArrow
def read_parquet_in_chunks(file_path, chunk_size):
    """Yield the contents of a Parquet file one row group at a time.

    Args:
        file_path: Path of the Parquet file to read.
        chunk_size: Accepted for signature compatibility but not used here;
            every yielded frame is one whole row group, whatever its size.

    Yields:
        A pandas DataFrame per row group, in file order.
    """
    source = pq.ParquetFile(file_path)
    for group_index in range(source.num_row_groups):
        yield source.read_row_group(group_index).to_pandas()

# Function to sample and preprocess data chunks
def sample_and_preprocess(file_path, sample_size, chunk_size):
    """Draw up to ``sample_size`` rows from a Parquet file, chunk by chunk.

    Chunks are consumed in file order; each contributes a random sample
    (fixed seed 42) until the requested number of rows has been collected.
    Despite the name, no feature preprocessing happens here -- the raw rows
    are returned.

    Args:
        file_path: Path of the Parquet file.
        sample_size: Total number of rows wanted.
        chunk_size: Forwarded to ``read_parquet_in_chunks``.

    Returns:
        A DataFrame with the sampled rows, or ``None`` if nothing was sampled.
    """
    sampled_chunks = []
    remaining_sample_size = sample_size

    for chunk in read_parquet_in_chunks(file_path, chunk_size):
        if remaining_sample_size <= 0:
            break
        if len(chunk) == 0:
            # An empty chunk has nothing to sample (and previously caused a
            # division by zero when computing the sampling fraction).
            continue
        # Ask for an exact row count instead of a fraction to avoid
        # floating-point rounding of the sample size.
        n_rows = min(int(remaining_sample_size), len(chunk))
        sampled_chunk = chunk.sample(n=n_rows, random_state=42)
        sampled_chunks.append(sampled_chunk)
        remaining_sample_size -= len(sampled_chunk)

    if not sampled_chunks:
        return None
    return pd.concat(sampled_chunks)

# Function to evaluate model on test data in chunks and predict binding probability
def evaluate_model_in_chunks(model, file_path, sample_size, chunk_size):
    """Score sampled test rows with ``model`` and write the predictions to CSV.

    Features are built and scored ``chunk_size`` rows at a time, so only one
    batch of fingerprints is held in memory at once instead of a feature
    matrix for the whole sample. Every sampled id appears in the output; rows
    whose SMILES cannot be converted keep a probability of 0.0, which keeps
    ids and predictions aligned.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        file_path: Path of the test Parquet file.
        sample_size: Number of rows to sample from the file.
        chunk_size: Forwarded to ``sample_and_preprocess`` and used as the
            number of rows scored per batch.

    Returns:
        None. Predictions are written to ``test_predictions.csv`` and the
        first rows are printed.

    Raises:
        ValueError: If ``model`` is ``None`` (e.g. training returned nothing).
    """
    # Fail before the expensive sampling/featurization rather than after it.
    if model is None:
        raise ValueError("model is None; train the ensemble before evaluating.")

    sampled_df = sample_and_preprocess(file_path, sample_size, chunk_size)
    if sampled_df is None:
        print("No data sampled.")
        return

    # Encode categorical variable.
    # NOTE: the encoded protein_name is not part of the features scored below
    # (features come from molecule_smiles only).
    le = LabelEncoder()
    sampled_df['protein_name'] = le.fit_transform(sampled_df['protein_name'])

    n_rows = len(sampled_df)
    batch_size = max(int(chunk_size), 1)
    smiles_values = sampled_df['molecule_smiles'].tolist()

    # Pre-filled with 0.0 so rows that cannot be featurized still get a value.
    y_pred_prob = np.zeros(n_rows, dtype=float)
    n_invalid = 0

    for start in tqdm(range(0, n_rows, batch_size)):
        batch_features = [
            smiles_to_features(smiles)
            for smiles in smiles_values[start:start + batch_size]
        ]
        valid_offsets = [
            offset for offset, features in enumerate(batch_features)
            if features is not None
        ]
        n_invalid += len(batch_features) - len(valid_offsets)
        if not valid_offsets:
            continue

        X_batch = np.array([batch_features[offset] for offset in valid_offsets])
        # Write each probability back to the position of the row it came from.
        target_rows = start + np.array(valid_offsets)
        y_pred_prob[target_rows] = model.predict_proba(X_batch)[:, 1]

    if n_invalid:
        print("{} rows could not be featurized and were assigned probability 0.0".format(n_invalid))

    # Create output DataFrame
    output_df = pd.DataFrame({'id': sampled_df['id'], 'binds': y_pred_prob})

    # Save the output to a CSV file
    output_df.to_csv('test_predictions.csv', index=False)
    print("Predictions saved to test_predictions.csv")

    # Display the first few rows of the output DataFrame
    print(output_df.head())

# Example usage for evaluating ensemble model on test data in chunks
# Evaluation configuration: source file, rows to sample, and chunk size.
test_file_path = '/kaggle/input/leash-BELKA/test.parquet'
sample_size_test = 1_674_898
chunk_size_test = 5_000

# Relies on best_ensemble_model produced by the training cell above.
evaluate_model_in_chunks(
    model=best_ensemble_model,
    file_path=test_file_path,
    sample_size=sample_size_test,
    chunk_size=chunk_size_test,
)
