# FeatureBook

## Imports

In [1]:
import os
import gc
import sys
import random
import numpy as np
import pandas as pd

## Load Original Data

In [2]:
# Competition tables used by this notebook, all read from ./input
train = pd.read_csv('./input/train.csv')
test = pd.read_csv('./input/test.csv')
df_structures = pd.read_csv('./input/structures.csv')

# Remaining competition tables are currently left unloaded:
#df_dipole_moments = pd.read_csv('./input/dipole_moments.csv')
#df_mulliken_charges = pd.read_csv('./input/mulliken_charges.csv')

In [3]:
# Print the loaded training frame as plain text to inspect its columns and contents.
print(train)

              id     molecule_name  atom_index_0  atom_index_1  type  \
0              0  dsgdb9nsd_000001             1             0  1JHC   
1              1  dsgdb9nsd_000001             1             2  2JHH   
2              2  dsgdb9nsd_000001             1             3  2JHH   
3              3  dsgdb9nsd_000001             1             4  2JHH   
4              4  dsgdb9nsd_000001             2             0  1JHC   
5              5  dsgdb9nsd_000001             2             3  2JHH   
6              6  dsgdb9nsd_000001             2             4  2JHH   
7              7  dsgdb9nsd_000001             3             0  1JHC   
8              8  dsgdb9nsd_000001             3             4  2JHH   
9              9  dsgdb9nsd_000001             4             0  1JHC   
10            10  dsgdb9nsd_000002             1             0  1JHN   
11            11  dsgdb9nsd_000002             1             2  2JHH   
12            12  dsgdb9nsd_000002             1             3  

## Extract and Store the Target (y)

In [4]:
# Directory that receives every artifact written by this notebook
storage_path = './featurebook/'
if not os.path.exists(storage_path):
    os.mkdir(storage_path)

# Pull the target out of train as a one-column frame:
# +/-inf become NaN and the index is rebuilt from 0
y = (
    train[['scalar_coupling_constant']]
    .replace([np.inf, -np.inf], np.nan)
    .reset_index(drop=True)
)

# Remove the target from the feature frame in place
del train['scalar_coupling_constant']

# Persist the target in both binary formats
y.to_feather(storage_path + 'y.feather')
y.to_pickle(storage_path + 'y.pkl')

# Release the target frame
del y
gc.collect()

14

## Save Checkpoint

In [5]:
# Running counter handed to each feature function; it prefixes the checkpoint directory name
save_count = 1
# Output formats written by save_checkpoint: feather is skipped, pickle is written
save_feather = False
save_pickle = True

def save_checkpoint(new_dir, train, test):
    """Write the current train/test feature frames to a checkpoint directory.

    Categorical columns are stored as their integer category codes. The
    conversion happens on a copy, so the frames passed in are never modified;
    a frame without categorical columns is written as-is, without copying.

    Which formats are written is controlled by the module-level
    ``save_feather`` and ``save_pickle`` flags.

    Args:
        new_dir: Checkpoint directory name, relative to the module-level
            ``storage_path``. Created (with parents) if it does not exist.
        train: Training feature frame, written as ``X.feather`` / ``X.pkl``.
        test: Test feature frame, written as ``X_test.feather`` / ``X_test.pkl``.
    """
    def encode_categories(df):
        # Return df with every category column replaced by its integer codes.
        cat_columns = df.select_dtypes(['category']).columns
        if len(cat_columns) == 0:
            # Nothing to encode: skip the copy to keep peak memory down.
            return df
        encoded = df.copy()
        encoded[cat_columns] = encoded[cat_columns].apply(lambda col: col.cat.codes)
        return encoded

    # Join paths instead of concatenating so a missing trailing slash in
    # new_dir cannot place the files next to the directory.
    out_dir = os.path.join(storage_path, new_dir)
    os.makedirs(out_dir, exist_ok=True)

    X = encode_categories(train)
    X_test = encode_categories(test)

    # Write the output as binary
    if save_feather:
        X.to_feather(os.path.join(out_dir, 'X.feather'))
        X_test.to_feather(os.path.join(out_dir, 'X_test.feather'))
    if save_pickle:
        X.to_pickle(os.path.join(out_dir, 'X.pkl'))
        X_test.to_pickle(os.path.join(out_dir, 'X_test.pkl'))

## Memory Optimization

In [6]:
def cast_float16(train, test, columns):
    """Downcast the given columns to float16 in both frames.

    Both frames are modified in place and also returned.

    Args:
        train: Training frame containing every name in ``columns``.
        test: Test frame containing every name in ``columns``.
        columns: Iterable of column names to convert.

    Returns:
        The same ``(train, test)`` objects that were passed in.
    """
    for name in columns:
        # Same order as before: train first, then test, column by column
        for frame in (train, test):
            frame[name] = frame[name].astype(np.float16)

    return train, test

## Generate Features

### Distance Matrix

In [7]:
def distance_matrix(train, test, save_count):
    """Attach distance-matrix rows for both atoms of every pair and checkpoint.

    The table in ``./input/generated/distance_matrix.csv`` is left-joined onto
    ``train`` and ``test`` twice: once keyed on ``atom_index_0`` and once on
    ``atom_index_1``. The suffixed columns produced by the second join are
    renamed to ``ai0_*`` / ``ai1_*``, the ``id`` column is dropped, and the
    numeric ``ai*_0`` .. ``ai*_28`` columns are cast to float16.

    NOTE(review): the CSV presumably provides ``molecule_name``,
    ``atom_index``, ``atom`` and columns ``'0'`` .. ``'28'`` - confirm
    against the file.

    Args:
        train: Training frame with ``id``, ``molecule_name``, ``atom_index_0``
            and ``atom_index_1`` columns.
        test: Test frame with the same columns.
        save_count: Current checkpoint counter, used as directory prefix.

    Returns:
        ``(train, test, save_count + 1)`` with the new frames.
    """
    atom_slots = range(29)

    # '<name>_x' comes from the atom_index_0 join, '<name>_y' from atom_index_1
    column_aliases = {'atom_x': 'ai0_atom', 'atom_y': 'ai1_atom'}
    for slot in atom_slots:
        column_aliases[f'{slot}_x'] = f'ai0_{slot}'
        column_aliases[f'{slot}_y'] = f'ai1_{slot}'

    # ai0_0 .. ai0_28 followed by ai1_0 .. ai1_28
    distance_columns = [f'ai{end}_{slot}' for end in (0, 1) for slot in atom_slots]

    def attach(frame, lookup, atom_idx):
        # Left-join the lookup rows of the atom referenced by atom_index_<atom_idx>
        joined = pd.merge(frame, lookup, how='left',
                          left_on=['molecule_name', f'atom_index_{atom_idx}'],
                          right_on=['molecule_name', 'atom_index'])
        return joined.drop('atom_index', axis=1)

    # Load dataframe
    df_distance_matrix = pd.read_csv('./input/generated/distance_matrix.csv')

    for atom_idx in (0, 1):
        train = attach(train, df_distance_matrix, atom_idx)
    for atom_idx in (0, 1):
        test = attach(test, df_distance_matrix, atom_idx)

    train = train.drop(['id'], axis=1)
    test = test.drop(['id'], axis=1)

    for frame in (train, test):
        frame.rename(columns=column_aliases, inplace=True)

    train, test = cast_float16(train, test, distance_columns)

    save_checkpoint(f'{save_count}_distance-matrix/', train, test)
    print('Distance matrix added to dataset.')

    # Free the lookup table before returning
    del df_distance_matrix
    gc.collect()

    return train, test, save_count + 1

### Dipole Moments

In [8]:
def dipole_moments(train, test, save_count):
    """Attach per-molecule dipole columns to both frames and checkpoint.

    The train and test dipole tables are read from ``./input/generated``,
    stripped of columns whose name starts with ``Unnamed``, and left-joined
    onto the matching frame by ``molecule_name``. Columns ``X``, ``Y`` and
    ``Z`` are renamed to ``dm_X``, ``dm_Y`` and ``dm_Z`` and cast to float16.

    Args:
        train: Training frame with a ``molecule_name`` column.
        test: Test frame with a ``molecule_name`` column.
        save_count: Current checkpoint counter, used as directory prefix.

    Returns:
        ``(train, test, save_count + 1)`` with the new frames.
    """
    def read_dipoles(csv_path):
        # Load a dipole table and discard columns named 'Unnamed...'
        table = pd.read_csv(csv_path)
        keep = ~table.columns.str.contains('^Unnamed')
        return table.loc[:, keep]

    def attach(frame, dipoles):
        # One dipole row per molecule, joined onto every pair of that molecule
        return pd.merge(frame, dipoles, how='left',
                        left_on=['molecule_name'],
                        right_on=['molecule_name'])

    # Load dataframes
    df_dipole_moments_train = read_dipoles('./input/generated/train_ob_dipoles_mmff44.csv')
    df_dipole_moments_test = read_dipoles('./input/generated/test_ob_dipoles_mmff44.csv')

    train = attach(train, df_dipole_moments_train)
    test = attach(test, df_dipole_moments_test)

    axis_aliases = {axis: f'dm_{axis}' for axis in ('X', 'Y', 'Z')}
    for frame in (train, test):
        frame.rename(columns=axis_aliases, inplace=True)

    train, test = cast_float16(train, test, list(axis_aliases.values()))

    save_checkpoint(f'{save_count}_dipole-moments/', train, test)
    print('Dipole moments added to dataset.')

    # Free both lookup tables before returning
    del df_dipole_moments_train
    del df_dipole_moments_test
    gc.collect()

    return train, test, save_count + 1

### Mulliken Charges

In [9]:
def mulliken_charges(train, test, save_count):
    """Attach per-atom charge columns for both atoms of every pair and checkpoint.

    For each frame the matching charge table is read from
    ``./input/generated``, stripped of columns whose name starts with
    ``Unnamed``, and left-joined twice: keyed on ``atom_index_0`` and on
    ``atom_index_1``. The suffixed ``eem`` / ``eem2015bn`` columns are renamed
    to ``mc0_*`` / ``mc1_*`` and cast to float16.

    Args:
        train: Training frame with ``molecule_name``, ``atom_index_0`` and
            ``atom_index_1`` columns.
        test: Test frame with the same columns.
        save_count: Current checkpoint counter, used as directory prefix.

    Returns:
        ``(train, test, save_count + 1)`` with the new frames.
    """
    def map_atom_info(df, df_mc, atom_idx):
        # Left-join the charge row of the atom referenced by atom_index_<atom_idx>
        df = pd.merge(df, df_mc, how = 'left',
                      left_on  = ['molecule_name', f'atom_index_{atom_idx}'],
                      right_on = ['molecule_name',  'atom_index'])

        df = df.drop('atom_index', axis=1)

        return df

    def attach_charges(df, csv_path):
        # Read one charge table, join it for both atoms, then free it again
        # so only one lookup table is alive at a time.
        charges = pd.read_csv(csv_path)
        charges = charges.loc[:, ~charges.columns.str.contains('^Unnamed')]
        for atom_idx in (0, 1):
            df = map_atom_info(df, charges, atom_idx)
        del charges
        gc.collect()
        return df

    def rename_cols(df):
        # '_x' comes from the atom_index_0 join, '_y' from atom_index_1
        df.rename(columns={
            'eem_x': 'mc0_eem', 'eem2015bn_x': 'mc0_eem2015bn',
            'eem_y': 'mc1_eem', 'eem2015bn_y': 'mc1_eem2015bn',
        }, inplace=True)

    train = attach_charges(train, './input/generated/best_ob_mulliken.csv')
    # The test charges must be joined onto `test` (not `train`), and the
    # second join must build on the result of the first.
    test = attach_charges(test, './input/generated/best_ob_mulliken_test.csv')

    # NOTE(review): this drops every column that contains any NaN, including
    # columns added by earlier feature steps - confirm that is intended.
    train = train.dropna(axis='columns')
    test = test.dropna(axis='columns')

    rename_cols(train)
    rename_cols(test)

    train, test = cast_float16(train, test, [
        'mc0_eem', 'mc1_eem', 'mc0_eem2015bn', 'mc1_eem2015bn'
    ])

    save_checkpoint(f'{save_count}_mulliken-charges/', train, test)
    save_count += 1
    print('Mulliken charges added to dataset.')

    return train, test, save_count

### CHAMPS
Source: https://www.kaggle.com/todnewman/keras-neural-net-for-champs

In [10]:
def champs(train, test, save_count):
    """Attach the CHAMPS per-molecule table to the training frame (work in progress).

    ``./input/generated/champs_train.csv`` is left-joined onto ``train`` by
    ``molecule_name`` and the heads of the lookup table and of the merged
    frame are printed for inspection. ``test`` is not modified and no
    checkpoint is written yet, so ``save_count`` is returned unchanged.

    Args:
        train: Training frame with a ``molecule_name`` column.
        test: Test frame; passed through untouched.
        save_count: Current checkpoint counter; passed through untouched.

    Returns:
        ``(train, test, save_count)``, matching the other feature functions.
    """
    df_champs_train = pd.read_csv('./input/generated/champs_train.csv')
    train = pd.merge(train, df_champs_train, how='left',
                     left_on=['molecule_name'],
                     right_on=['molecule_name'])

    print(df_champs_train.head())
    print(train.head())

    # Free the lookup table only after its last use above
    del df_champs_train
    gc.collect()

    # TODO: merge the test-side table and save a checkpoint once available
    #save_checkpoint(f'{save_count}_mulliken-charges/', train, test)
    # save_count += 1

    return train, test, save_count
# champs(train, test)

## Generate New Dataset

In [11]:
# Run the feature steps in order; each one saves its own checkpoint
# and hands back the enlarged frames plus the next checkpoint number
for add_features in (distance_matrix, dipole_moments, mulliken_charges):
    train, test, save_count = add_features(train, test, save_count)

# champs(train, test, save_count)

# Show the resulting column types of both frames
print(train.dtypes)
print(test.dtypes)

Distance matrix added to dataset.
Dipole moments added to dataset.
Mulliken charges added to dataset.


MemoryError: 