## Load Everything

In [1]:
%matplotlib inline
# 导入一些库
import pandas as pd
import numpy as np
import math
import gc
import copy
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
import seaborn as sns
from lightgbm import LGBMRegressor

In [2]:
# Root directory that the input files below are read from.
DATA_PATH = '../input'
# Directory the final submission file is written to.
SUBMISSIONS_PATH = './'
# Atoms are represented by their atomic numbers.
ATOMIC_NUMBERS = {
    'H': 1,
    'C': 6,
    'N': 7,
    'O': 8,
    'F': 9
}

In [3]:
# Import os and list the entries found under DATA_PATH.
import os
os.listdir(DATA_PATH)

['champs-scalar-coupling', 'quantum-machine-9-qm9']

## Load Dataset

In [5]:
# dtypes used when reading the training set
train_dtypes = {
    'molecule_name': 'category',
    'atom_index_0': 'int8',
    'atom_index_1': 'int8',
    'type': 'category',
    'scalar_coupling_constant': 'float32'
}
# Read the training file, using the `id` column as the index
train_csv = pd.read_csv(f'{DATA_PATH}/champs-scalar-coupling/train.csv', index_col='id', dtype=train_dtypes)
# Strip the 'dsgdb9nsd_' prefix from molecule_name and store the remainder as an int32 molecule_index
train_csv['molecule_index'] = train_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
# Keep only the columns used downstream (molecule_name is replaced by molecule_index)
train_csv = train_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type', 'scalar_coupling_constant']]
# Show the first 10 rows
train_csv.head(10)


  mask |= (ar1 == a)


Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,0,1JHC,84.807602
1,1,1,2,2JHH,-11.257
2,1,1,3,2JHH,-11.2548
3,1,1,4,2JHH,-11.2543
4,1,2,0,1JHC,84.807404
5,1,2,3,2JHH,-11.2541
6,1,2,4,2JHH,-11.2548
7,1,3,0,1JHC,84.809303
8,1,3,4,2JHH,-11.2543
9,1,4,0,1JHC,84.809502


In [6]:
# Read the sample submission file, using the `id` column as the index
submission_csv = pd.read_csv(f'{DATA_PATH}/champs-scalar-coupling/sample_submission.csv', index_col='id')

In [7]:
# Read the test file with the same dtype mapping as the training file
test_csv = pd.read_csv(f'{DATA_PATH}/champs-scalar-coupling/test.csv', index_col='id', dtype=train_dtypes)
# Strip the 'dsgdb9nsd_' prefix from molecule_name and store the remainder as an int32 molecule_index
test_csv['molecule_index'] = test_csv['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
# Keep only the columns used downstream (there is no target column in the test set)
test_csv = test_csv[['molecule_index', 'atom_index_0', 'atom_index_1', 'type']]
test_csv.head(10)

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4658147,4,2,0,2JHC
4658148,4,2,1,1JHC
4658149,4,2,3,3JHH
4658150,4,3,0,1JHC
4658151,4,3,1,2JHC
4658152,15,3,0,1JHC
4658153,15,3,2,3JHC
4658154,15,3,4,2JHH
4658155,15,3,5,2JHH
4658156,15,4,0,1JHC


In [8]:
# dtypes used when reading the structures file
structures_dtypes = {
    'molecule_name': 'category',
    'atom_index': 'int8',
    'atom': 'category',
    'x': 'float32',
    'y': 'float32',
    'z': 'float32'
}
structures_csv = pd.read_csv(f'{DATA_PATH}/champs-scalar-coupling/structures.csv', dtype=structures_dtypes)
# Strip the 'dsgdb9nsd_' prefix from molecule_name and store the remainder as an int32 molecule_index
structures_csv['molecule_index'] = structures_csv.molecule_name.str.replace('dsgdb9nsd_', '').astype('int32')
structures_csv = structures_csv[['molecule_index', 'atom_index', 'atom', 'x', 'y', 'z']]
# Replace each element symbol with its atomic number (see ATOMIC_NUMBERS) and store it as int8
structures_csv['atom'] = structures_csv['atom'].replace(ATOMIC_NUMBERS).astype('int8')
structures_csv.head(10)

Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.00215,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397
5,2,0,7,-0.040426,1.024108,0.062564
6,2,1,1,0.017257,0.012545,-0.027377
7,2,2,1,0.915789,1.358745,-0.028758
8,2,3,1,-0.520278,1.343532,-0.775543
9,3,0,8,-0.03436,0.97754,0.007602


In [9]:
# Reduce memory usage by downcasting each numeric column to the smallest
# dtype whose range contains the column's observed min and max.
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place and return ``df``.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are inspected; every other column is left untouched.

    * Integer columns are cast to the first of int8, int16, int32, int64
      whose inclusive range ``[iinfo.min, iinfo.max]`` contains the column's
      min and max. (Bounds are inclusive, so a column whose max is exactly
      127 is stored as int8.)
    * Float columns are cast to float16, then float32, on the same range
      test, and to float64 otherwise.

    NOTE: the float branch checks the value range only, not the precision,
    so values may be rounded when a column is narrowed to float16/float32.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # If min/max are NaN (e.g. an empty column) no candidate matches
            # and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN: keep full width.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero starting size so reporting can never raise.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [10]:
# One-hot encode categorical columns and append the indicator columns.
def dummies(df, list_cols):
    """Return ``df`` with one-hot indicator columns appended for ``list_cols``.

    For each column name in ``list_cols`` the indicators are built with
    ``pd.get_dummies(..., drop_first=True)`` (so the first level is omitted),
    prefixed with the column name, and concatenated to the right of the
    frame. The source columns themselves are kept.
    """
    result = df
    for column in list_cols:
        encoded = pd.get_dummies(result[column], prefix=str(column), drop_first=True)
        result = pd.concat([result, encoded], axis=1)
    return result

# Add the QM9 features.
def add_qm9_features(df):
    """Left-join the QM9 feature table onto ``df`` and drop the join keys.

    The QM9 pickle is loaded, a fixed list of unused/duplicated columns is
    removed, the table is shrunk with ``reduce_mem_usage`` and its
    ``molecule_name`` is converted to an int32 ``molecule_index``. The result
    is merged onto ``df`` on ``['molecule_index', 'id']`` (left join), after
    which both key columns are dropped from the returned frame.

    NOTE(review): ``pd.read_pickle`` unpickles the file, which can execute
    arbitrary code; only use it on a trusted dataset.
    """
    # Columns that are not wanted as features or duplicate columns of ``df``.
    unused_columns = [
        'type',
        'linear',
        'atom_index_0',
        'atom_index_1',
        'scalar_coupling_constant',
        'U', 'G', 'H',
        'mulliken_mean', 'r2', 'U0',
    ]

    qm9 = pd.read_pickle('../input/quantum-machine-9-qm9/data.covs.pickle')
    qm9 = reduce_mem_usage(qm9.drop(columns=unused_columns), verbose=False)

    # Replace molecule_name by the integer part that follows 'dsgdb9nsd_'.
    qm9['molecule_index'] = qm9['molecule_name'].str.replace('dsgdb9nsd_', '').astype('int32')
    qm9 = qm9.drop(columns=['molecule_name'])

    # Attach the QM9 columns, then drop the keys, which are not used as features.
    merged = pd.merge(df, qm9, how='left', on=['molecule_index', 'id'])
    merged = merged.drop(columns=['molecule_index', 'id'])

    del qm9
    gc.collect()
    return merged


## Build Distance Dataset

In [11]:
# Select the rows for one coupling type, plus the matching structures.
def build_type_dataframes(base, structures, coupling_type):
    """Return ``(base_subset, structures_subset)`` for ``coupling_type``.

    ``base_subset`` holds the rows of ``base`` whose ``type`` equals
    ``coupling_type``, without the ``type`` column, with the index moved into
    a regular column and ``id`` cast to int32. ``structures_subset`` holds the
    rows of ``structures`` whose ``molecule_index`` occurs in ``base_subset``.
    """
    is_requested_type = base['type'] == coupling_type
    selected = base[is_requested_type].drop(columns='type').copy().reset_index()
    selected['id'] = selected['id'].astype('int32')

    in_selected_molecules = structures['molecule_index'].isin(selected['molecule_index'])
    return selected, structures[in_selected_molecules]

In [12]:
# Look up an atom's data in structures by molecule_index and atom index.
def add_coordinates(base, structures, index):
    """Attach ``atom``/``x``/``y``/``z`` of atom ``index`` to every row.

    ``base`` is inner-joined with ``structures`` by matching
    ``molecule_index`` and ``atom_index_{index}`` against ``atom_index``.
    The joined ``atom``, ``x``, ``y``, ``z`` columns are suffixed with
    ``_{index}`` and the helper ``atom_index`` column is dropped.
    """
    suffixed = {name: f'{name}_{index}' for name in ('atom', 'x', 'y', 'z')}
    merged = pd.merge(
        base,
        structures,
        how='inner',
        left_on=['molecule_index', f'atom_index_{index}'],
        right_on=['molecule_index', 'atom_index'],
    )
    merged = merged.drop(columns=['atom_index'])
    return merged.rename(columns=suffixed)

In [13]:
# Attach the per-pair atom information.
def add_atoms(base, atoms):
    """Inner-join ``atoms`` onto ``base`` on the pair key.

    The key is ``(molecule_index, atom_index_0, atom_index_1)``; rows without
    a match in both frames are dropped.
    """
    pair_key = ['molecule_index', 'atom_index_0', 'atom_index_1']
    return pd.merge(base, atoms, on=pair_key, how='inner')

In [14]:
# Pair every row with all atoms of its molecule except the two coupled atoms.
def merge_all_atoms(base, structures):
    """Left-join ``structures`` on ``molecule_index`` and drop the pair's own atoms.

    After the join, rows whose ``atom_index`` equals ``atom_index_0`` or
    ``atom_index_1`` are removed, so only the other atoms of the molecule
    remain for each pair.
    """
    expanded = pd.merge(
        base,
        structures,
        how='left',
        left_on=['molecule_index'],
        right_on=['molecule_index'],
    )
    is_first_atom = expanded['atom_index_0'] == expanded['atom_index']
    is_second_atom = expanded['atom_index_1'] == expanded['atom_index']
    return expanded[~(is_first_atom | is_second_atom)]

In [15]:
# Compute the coordinates of the midpoint between atom 0 and atom 1.
def add_center(df):
    """Add ``x_c``, ``y_c``, ``z_c`` to ``df`` in place.

    Each is ``(<axis>_1 + <axis>_0) * float32(0.5)``. Returns ``None``.
    """
    half = np.float32(0.5)
    for axis in ('x', 'y', 'z'):
        df[f'{axis}_c'] = (df[f'{axis}_1'] + df[f'{axis}_0']) * half

# Compute the distance from each atom to the pair's midpoint.
def add_distance_to_center(df):
    """Add ``d_c`` to ``df`` in place.

    ``d_c`` is the Euclidean distance between ``(x, y, z)`` and
    ``(x_c, y_c, z_c)``. Returns ``None``.
    """
    two = np.float32(2)
    sq_x = (df['x_c'] - df['x']) ** two
    sq_y = (df['y_c'] - df['y']) ** two
    sq_z = (df['z_c'] - df['z']) ** two
    df['d_c'] = (sq_x + sq_y + sq_z) ** np.float32(0.5)

# Compute the distance between the atoms with suffixes suffix1 and suffix2.
def add_distance_between(df, suffix1, suffix2):
    """Add ``d_{suffix1}_{suffix2}`` to ``df`` in place.

    The new column is the Euclidean distance between the points
    ``(x, y, z)_{suffix1}`` and ``(x, y, z)_{suffix2}``. Returns ``None``.
    """
    two = np.float32(2)
    sq_x = (df[f'x_{suffix1}'] - df[f'x_{suffix2}']) ** two
    sq_y = (df[f'y_{suffix1}'] - df[f'y_{suffix2}']) ** two
    sq_z = (df[f'z_{suffix1}'] - df[f'z_{suffix2}']) ** two
    df[f'd_{suffix1}_{suffix2}'] = (sq_x + sq_y + sq_z) ** np.float32(0.5)

In [16]:
# Compute the distances between the atoms.
def add_distances(df):
    """Add ``d_i_j`` distance columns to ``df`` in place.

    The number of atoms is one more than the largest integer suffix among
    the ``x_*`` columns. For every atom ``i >= 1`` the distance to each
    earlier atom ``j < min(4, i)`` is added via ``add_distance_between``.
    """
    atom_suffixes = [int(name.split('_')[1]) for name in df.columns if name.startswith('x_')]
    n_atoms = max(atom_suffixes) + 1
    for later in range(1, n_atoms):
        for earlier in range(min(4, later)):
            add_distance_between(df, later, earlier)

In [17]:
# Add a feature: the number of atoms in the molecule given by molecule_index.
def add_n_atoms(base, structures):
    """Return ``base`` with an ``n_atoms`` column.

    ``n_atoms`` is the number of rows ``structures`` holds for the row's
    ``molecule_index``.
    """
    atom_counts = structures['molecule_index'].value_counts().rename('n_atoms').to_frame()
    return base.merge(atom_counts, left_on='molecule_index', right_index=True)

In [18]:
# Small walkthrough: extract the rows for a single coupling type
coupling_type='1JHC'
base, structures = build_type_dataframes(train_csv, structures_csv, coupling_type)

In [19]:
# First rows of base
base.head()

Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant
0,0,1,1,0,84.807602
1,4,1,2,0,84.807404
2,7,1,3,0,84.809303
3,9,1,4,0,84.809502
4,17,5,2,0,171.220001


In [20]:
# First rows of structures
structures.head()

Unnamed: 0,molecule_index,atom_index,atom,x,y,z
0,1,0,6,-0.012698,1.085804,0.008001
1,1,1,1,0.00215,-0.006031,0.001976
2,1,2,1,1.011731,1.463751,0.000277
3,1,3,1,-0.540815,1.447527,-0.876644
4,1,4,1,-0.523814,1.437933,0.906397


In [21]:
# Add the coordinates of the first atom (atom_index_0)
base = add_coordinates(base, structures, 0)
base.head()

Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,atom_0,x_0,y_0,z_0
0,0,1,1,0,84.807602,1,0.00215,-0.006031,0.001976
1,4,1,2,0,84.807404,1,1.011731,1.463751,0.000277
2,7,1,3,0,84.809303,1,-0.540815,1.447527,-0.876644
3,9,1,4,0,84.809502,1,-0.523814,1.437933,0.906397
4,17,5,2,0,171.220001,1,-0.027803,2.198949,0.014154


In [22]:
# Add the coordinates of the second atom (atom_index_1)
base = add_coordinates(base, structures, 1)
base.head()

Unnamed: 0,id,molecule_index,atom_index_0,atom_index_1,scalar_coupling_constant,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1
0,0,1,1,0,84.807602,1,0.00215,-0.006031,0.001976,6,-0.012698,1.085804,0.008001
1,4,1,2,0,84.807404,1,1.011731,1.463751,0.000277,6,-0.012698,1.085804,0.008001
2,7,1,3,0,84.809303,1,-0.540815,1.447527,-0.876644,6,-0.012698,1.085804,0.008001
3,9,1,4,0,84.809502,1,-0.523814,1.437933,0.906397,6,-0.012698,1.085804,0.008001
4,17,5,2,0,171.220001,1,-0.027803,2.198949,0.014154,6,-0.013324,1.132466,0.008276


In [23]:
# Build the dataset that the model is run on.
def build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=10):
    """Build one row per atom pair of ``coupling_type`` with neighbour features.

    Steps, as implemented below:

    1. Select the pairs of ``coupling_type`` and attach both atoms' coordinates.
    2. For every pair, join all other atoms of the molecule and rank them by
       their distance to the pair's midpoint (closest first, numbered from 2).
    3. Keep the neighbours numbered below ``n_atoms`` and pivot them into wide
       columns named ``<name>_<num>``.
    4. Join the wide neighbour columns back onto the pairs, add the pairwise
       distance columns and sort the result by ``id``.

    Parameters: ``some_csv`` is the train or test frame, ``structures_csv``
    the structures frame, ``coupling_type`` the value of ``type`` to select,
    ``n_atoms`` the exclusive upper bound on the neighbour number kept.

    Returns the resulting frame, sorted by ``id``.
    """
    # Get base and structures for this coupling type
    base, structures = build_type_dataframes(some_csv, structures_csv, coupling_type)
    # Add the coordinates of atom 0 and atom 1
    base = add_coordinates(base, structures, 0)
    base = add_coordinates(base, structures, 1)
    # Drop the atom_0 / atom_1 columns that add_coordinates attached
    base = base.drop(['atom_0', 'atom_1'], axis=1)
    # Work on a copy without the id column
    atoms = base.drop('id', axis=1).copy()
    
    # Drop scalar_coupling_constant when present; that column is the target y
    if 'scalar_coupling_constant' in some_csv:
        atoms = atoms.drop(['scalar_coupling_constant'], axis=1)
        
    # Add the midpoint of the pair (x_c, y_c, z_c)
    add_center(atoms)
    
    # Drop the coordinates of atom 0 and atom 1; the midpoint is used instead
    atoms = atoms.drop(['x_0', 'y_0', 'z_0', 'x_1', 'y_1', 'z_1'], axis=1)

    # Join every other atom of the molecule onto each pair
    atoms = merge_all_atoms(atoms, structures)
    
    # Add each joined atom's distance to the midpoint (d_c)
    add_distance_to_center(atoms)
    
    # Drop the midpoint coordinates and the joined atom's index
    atoms = atoms.drop(['x_c', 'y_c', 'z_c', 'atom_index'], axis=1)
    
    # Sort by molecule_index, atom_index_0, atom_index_1 and then d_c,
    # so that within each pair the atoms are ordered by distance to the midpoint
    atoms.sort_values(['molecule_index', 'atom_index_0', 'atom_index_1', 'd_c'], inplace=True)
    
    # Number the atoms within each pair starting at 2 (in the sorted order above)
    # and keep only those whose number is below n_atoms
    atom_groups = atoms.groupby(['molecule_index', 'atom_index_0', 'atom_index_1'])
    atoms['num'] = atom_groups.cumcount() + 2
    atoms = atoms.drop(['d_c'], axis=1)
    atoms = atoms[atoms['num'] < n_atoms]

    # Index by pair and num, then unstack num into columns and
    # flatten the two-level column labels to '<name>_<num>'
    atoms = atoms.set_index(['molecule_index', 'atom_index_0', 'atom_index_1', 'num']).unstack()
    atoms.columns = [f'{col[0]}_{col[1]}' for col in atoms.columns]
    atoms = atoms.reset_index()
    
    # Fill missing values with 0 and cast back to int8 for every column whose
    # name starts with 'atom_' (this also matches atom_index_0 / atom_index_1)
    for col in atoms.columns:
        if col.startswith('atom_'):
            atoms[col] = atoms[col].fillna(0).astype('int8')
            
    # Cast molecule_index to int32
    atoms['molecule_index'] = atoms['molecule_index'].astype('int32')
    
    # Join the wide neighbour columns back onto the pairs
    full = add_atoms(base, atoms)
    
    # Add the pairwise distance columns
    add_distances(full)
    
    # Restore the ordering by id
    full.sort_values('id', inplace=True)
    
    return full

In [24]:
# Build the list of column labels to use and select them.
def take_n_atoms(df, n_atoms, four_start=4):
    """Select the id, atom and distance columns for ``n_atoms`` atoms.

    The selected columns are, in order: ``id``, ``molecule_index``,
    ``atom_2`` .. ``atom_{n_atoms-1}``, the distance columns ``d_i_j`` and,
    when present in ``df``, ``scalar_coupling_constant``. For atom ``i``
    below ``four_start`` the distances ``j < min(i, 4)`` are taken; from
    ``four_start`` onwards ``j`` runs over ``0..3``.
    """
    atom_labels = [f'atom_{i}' for i in range(2, n_atoms)]
    distance_labels = [
        f'd_{i}_{j}'
        for i in range(n_atoms)
        for j in range(4 if i >= four_start else min(i, 4))
    ]
    target_labels = ['scalar_coupling_constant'] if 'scalar_coupling_constant' in df else []
    return df[['id', 'molecule_index'] + atom_labels + distance_labels + target_labels]

## Check XGBOOST with the smallest type

We don't compute the distances `d_0_x`, `d_1_1`, `d_2_2`, `d_2_3`, `d_3_3`, because each of them either duplicates a distance that is already stored under a later atom (`d_0_1` == `d_1_0`) or is identically zero (e.g. `d_1_1`, `d_2_2`).

## Submission Model

In [25]:
# Build the X and y arrays.
def build_x_y_data(some_csv, coupling_type, n_atoms):
    """Return ``(X_data, y_data)`` as float32 arrays for one coupling type.

    The feature frame is built with ``build_couple_dataframe``, reduced to
    the columns chosen by ``take_n_atoms``, has its missing values filled
    with 0 and is then extended with the QM9 features. The resulting column
    index is printed. ``y_data`` is the ``scalar_coupling_constant`` column
    when it is present, otherwise ``None``.
    """
    full = build_couple_dataframe(some_csv, structures_csv, coupling_type, n_atoms=n_atoms)
    features = take_n_atoms(full, n_atoms).fillna(0)
    features = add_qm9_features(features)
    print(features.columns)

    target = 'scalar_coupling_constant'
    if target not in features:
        # No target column: every column is a feature.
        return features.values.astype('float32'), None

    X_data = features.drop(columns=[target]).values.astype('float32')
    y_data = features[target].values.astype('float32')
    return X_data, y_data

In [26]:
# Quick check: build the data for the 1JHN type with n_atoms=7
tx,ty=build_x_y_data(train_csv, '1JHN', 7)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'd_1_0', 'd_2_0',
       'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0', 'd_4_1', 'd_4_2', 'd_4_3',
       'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0', 'd_6_1', 'd_6_2', 'd_6_3',
       'scalar_coupling_constant', 'rc_A', 'rc_B', 'rc_C', 'mu', 'alpha',
       'homo', 'lumo', 'gap', 'zpve', 'Cv', 'freqs_min', 'freqs_max',
       'freqs_mean', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0',
       'mulliken_atom_1'],
      dtype='object')


In [27]:
import xgboost as xgb

In [28]:
# XGBoost training parameters.
# 'reg:squarederror' is the current name of the squared-error regression
# objective; the previous spelling 'reg:linear' is a deprecated alias of it.
xgb_params = {'eta': 0.1,
              'max_depth': 11,
              'subsample': 0.9,
              'objective': 'reg:squarederror',
              'eval_metric': 'mae',
              'tree_method':'gpu_hist',
              'colsample_bytree':1,
              'alpha':0.2,
              'lambda':0.1,
             }

In [29]:
def train_and_predict_for_one_coupling_type(coupling_type, submission,oof_sub,n_atoms, n_folds=5,n_depth=9, n_splits=5, random_state=129):
    """Train K-fold XGBoost models for one coupling type and store predictions.

    Parameters
    ----------
    coupling_type : str
        Value of the ``type`` column to train on.
    submission : pandas.DataFrame
        Frame whose ``scalar_coupling_constant`` is overwritten, for the test
        rows of this type, with the fold-averaged test predictions.
    oof_sub : pandas.DataFrame
        Frame whose ``scalar_coupling_constant`` is overwritten, for the train
        rows of this type, with the out-of-fold predictions.
    n_atoms : int
        Passed through to ``build_x_y_data``.
    n_folds : int, default 5
        Number of KFold splits.
    n_depth : int, default 9
        ``max_depth`` used for this coupling type.
    n_splits : int, default 5
        Unused; kept so existing callers keep working.
    random_state : int, default 129
        Seed for the shuffled KFold.

    Returns
    -------
    float
        Mean over the folds of ``log(MAE)`` on the validation part.
    """
    print(f'*** Training Model for {coupling_type} ***')
    # Training set
    X_data, y_data = build_x_y_data(train_csv, coupling_type, n_atoms)
    # Test set
    X_test, _ = build_x_y_data(test_csv, coupling_type, n_atoms)
    # Fold-averaged predictions for the submission
    y_pred = np.zeros(X_test.shape[0], dtype='float32')
    # Out-of-fold predictions on the training rows
    oof = np.zeros(X_data.shape[0],dtype='float32')
    # Running mean of the per-fold validation scores
    cv_score = 0
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    # Use a per-call copy so the module-level xgb_params is not mutated.
    params = dict(xgb_params, max_depth=n_depth)

    # Maximum number of boosting rounds; it differs per coupling type.
    # This must be a single if/elif/else chain: with two separate `if`
    # statements the trailing `else` overwrote the 1JHN value with 4700.
    if coupling_type == '1JHN':
        iterations = 5000
    elif coupling_type == '1JHC':
        iterations = 6000
    else:
        iterations = 4700

    # The test matrix does not change between folds, so build it once.
    test_data = xgb.DMatrix(X_test)

    for fold, (train_index, val_index) in enumerate(kfold.split(X_data, y_data)):
        X_train, X_val = X_data[train_index], X_data[val_index]
        y_train, y_val = y_data[train_index], y_data[val_index]
        # Wrap the arrays in DMatrix objects for xgboost
        train_data = xgb.DMatrix(data=X_train, label=y_train)
        valid_data = xgb.DMatrix(data=X_val, label=y_val)
        # Metric is monitored (and early stopping decided) on the validation fold
        watchlist = [(valid_data, 'valid_data')]

        model = xgb.train(params=params, dtrain=train_data, num_boost_round=iterations,
                          evals=watchlist, early_stopping_rounds=250, verbose_eval=500)
        # NOTE(review): whether predict() uses only the best early-stopped
        # iteration or all trained rounds depends on the xgboost version — confirm.
        y_val_pred = model.predict(xgb.DMatrix(X_val))
        # Validation score for this fold
        val_score = np.log(mean_absolute_error(y_val, y_val_pred))
        oof[val_index] = y_val_pred
        print(f'{coupling_type} Fold {fold}, logMAE: {val_score}')
        cv_score += val_score / n_folds
        # Accumulate the fold's share of the test prediction
        y_pred += model.predict(test_data) / n_folds

    submission.loc[test_csv['type'] == coupling_type, 'scalar_coupling_constant'] = y_pred
    oof_sub.loc[train_csv['type'] == coupling_type, 'scalar_coupling_constant'] = oof
    return cv_score


In [30]:
# n_atoms setting per coupling type
model_params = {
    '1JHN': 10,
    '1JHC': 13,
    '2JHH': 10,
    '2JHN': 10,
    '2JHC': 10,
    '3JHH': 10,
    '3JHC': 10,
    '3JHN': 10
}
# Tree depth setting per coupling type
model_depth=\
{
    '1JHN': 11,
    '1JHC': 11,
    '2JHH': 11,
    '2JHN': 11,
    '2JHC': 11,
    '3JHH': 11,
    '3JHC': 11,
    '3JHN': 11
}
# 7-fold cross-validation
N_FOLDS = 7
# Output frames: test predictions, and out-of-fold predictions (initialised to 0)
submission = submission_csv.copy()
oof_submission=train_csv[['type','scalar_coupling_constant']].copy()
oof_submission['scalar_coupling_constant']=0

# Train one model set per coupling type and record its CV score
cv_scores = {}
for coupling_type in model_params.keys():
    cv_score = train_and_predict_for_one_coupling_type(
        coupling_type, submission,oof_submission,n_atoms=model_params[coupling_type], n_folds=N_FOLDS,n_depth=model_depth[coupling_type],random_state=130)
    cv_scores[coupling_type] = cv_score

*** Training Model for 1JHN ***
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
       'd_8_1', 'd_8_2', 'd_8_3', 'd_9_0', 'd_9_1', 'd_9_2', 'd_9_3',
       'scalar_coupling_constant', 'rc_A', 'rc_B', 'rc_C', 'mu', 'alpha',
       'homo', 'lumo', 'gap', 'zpve', 'Cv', 'freqs_min', 'freqs_max',
       'freqs_mean', 'mulliken_min', 'mulliken_max', 'mulliken_atom_0',
       'mulliken_atom_1'],
      dtype='object')
Index(['atom_2', 'atom_3', 'atom_4', 'atom_5', 'atom_6', 'atom_7', 'atom_8',
       'atom_9', 'd_1_0', 'd_2_0', 'd_2_1', 'd_3_0', 'd_3_1', 'd_3_2', 'd_4_0',
       'd_4_1', 'd_4_2', 'd_4_3', 'd_5_0', 'd_5_1', 'd_5_2', 'd_5_3', 'd_6_0',
       'd_6_1', 'd_6_2', 'd_6_3', 'd_7_0', 'd_7_1', 'd_7_2', 'd_7_3', 'd_8_0',
   

In [31]:
oof_submission.head()
# Save the out-of-fold predictions; index=False means the id index is not written
oof_submission.to_csv('xgb_oof.csv',index=False)

In [32]:
train_csv.head()

Unnamed: 0_level_0,molecule_index,atom_index_0,atom_index_1,type,scalar_coupling_constant
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1,1,0,1JHC,84.807602
1,1,1,2,2JHH,-11.257
2,1,1,3,2JHH,-11.2548
3,1,1,4,2JHH,-11.2543
4,1,2,0,1JHC,84.807404


In [33]:
submission_csv.head()

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,0
4658148,0
4658149,0
4658150,0
4658151,0


In [34]:
pd.DataFrame({'type': list(cv_scores.keys()), 'cv_score': list(cv_scores.values())})

Unnamed: 0,type,cv_score
0,1JHN,-1.154808
1,1JHC,-0.560578
2,2JHH,-1.989553
3,2JHN,-2.04269
4,2JHC,-1.474165
5,3JHH,-1.902312
6,3JHC,-1.416029
7,3JHN,-2.266773


In [35]:
# Mean of the per-type cross-validation scores
np.mean(list(cv_scores.values()))

-1.6008634098938535

In [37]:
submission.head(10)

Unnamed: 0_level_0,scalar_coupling_constant
id,Unnamed: 1_level_1
4658147,22.284721
4658148,122.587364
4658149,5.333972
4658150,122.587364
4658151,22.284721
4658152,91.97184
4658153,2.361273
4658154,-7.788785
4658155,-9.673582
4658156,91.971405


In [38]:
# Write the final submission file
submission.to_csv(f'{SUBMISSIONS_PATH}/submission.csv')

## Room for improvement