ある匂い物質の化学的情報を入力として、その物質の匂いのスコアを予測する。
匂いは146種類あるので、目的変数を自動で変更し、すべての匂いについて決定係数の値を算出する。説明変数は、すべての分子記述子データを結合したデータとする。

In [None]:
from google.colab import drive
drive.mount('/content/drive')
# Use the "%cd" magic to change directories.
# Move to the working folder
%cd /content/drive/'My Drive'/'情報管理'/'卒論'
# Show the contents of the current folder
%ls

Mounted at /content/drive
/content/drive/My Drive/情報管理/卒論
aaa.ipynb               my_dataframe.xlsx    Untitled5.ipynb
aaa.xlsx                random_forest.ipynb  xgboost.ipynb
lgbm_tuner_cv.ipynb     tunerCV.ipynb        全データ個別ipynb
lgbmチューニング.ipynb  Untitled0.ipynb      全データ自動.ipynb
lgmb.ipynb              Untitled1.ipynb      卒論最新.ipynb
light_alldata.ipynb     Untitled2.ipynb      各データ特徴量.ipynb
lightgbm_mo3.ipynb      Untitled3.ipynb      学習曲線ipynb.ipynb


In [None]:
import os
import sys
import argparse
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

In [None]:
# Explanatory variables: one tab-separated descriptor table per featurizer.
# Every column gets a source-specific prefix, after which the two key
# columns ('CAS', 'Molecule') are renamed back to their unprefixed names.
df_maccskeys = (
    pd.read_csv('maccskeys.csv', sep='\t')
    .add_prefix('ma_')
    .rename(columns={'ma_CAS': 'CAS', 'ma_Molecule': 'Molecule'})
)
df_mordred_desc = (
    pd.read_csv('mordred_desc.csv', sep='\t')
    .add_prefix('mord_')
    .rename(columns={'mord_CAS': 'CAS', 'mord_Molecule': 'Molecule'})
)
df_morganfp2 = (
    pd.read_csv('morganfp2.csv', sep='\t')
    .add_prefix('m2_')
    .rename(columns={'m2_CAS': 'CAS', 'm2_Molecule': 'Molecule'})
)
df_morganfp3 = (
    pd.read_csv('morganfp3.csv', sep='\t')
    .add_prefix('m3_')
    .rename(columns={'m3_CAS': 'CAS', 'm3_Molecule': 'Molecule'})
)
df_rdkit = (
    pd.read_csv('rdkit_desc.csv', sep='\t')
    .add_prefix('rd_')
    .rename(columns={'rd_CAS': 'CAS', 'rd_Molecule': 'Molecule'})
)

# Target variables: the odor character profile table (left unprefixed).
df_atlas = pd.read_csv('ATLAS_odor_character_profiles.csv', sep='\t')

In [None]:
# Combine the five descriptor tables into one wide feature table.
# Each step is a left join on ('CAS', 'Molecule'); columns containing any
# NaN are dropped from both operands before every join.

# RDKit descriptors <- MACCS keys
data1 = df_rdkit.dropna(axis=1).merge(
    df_maccskeys.dropna(axis=1), on=['CAS', 'Molecule'], how='left')

# Mordred descriptors <- (RDKit + MACCS)
data2 = df_mordred_desc.dropna(axis=1).merge(
    data1.dropna(axis=1), on=['CAS', 'Molecule'], how='left')

# Morgan fingerprint 2 table <- everything merged so far
data3 = df_morganfp2.dropna(axis=1).merge(
    data2.dropna(axis=1), on=['CAS', 'Molecule'], how='left')

# Morgan fingerprint 3 table <- everything merged so far
data4 = df_morganfp3.dropna(axis=1).merge(
    data3.dropna(axis=1), on=['CAS', 'Molecule'], how='left')



## LightGBMで回帰を行う

In [None]:
from sklearn.model_selection import KFold
import lightgbm as lgb

# 5-fold cross-validation splitter (shuffled, fixed seed).
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Containers for scores and models.
r2_list = []
models = []
r2s = {}

# Keep only the first 152 columns of the odor profile table.
df_atlas1 = df_atlas.iloc[:, :152]

# Every remaining column that is not one of these identifier/metadata
# columns is treated as a prediction target.
non_target_columns = ["ID", "Molecule", "NAME", "CAS", "CID", "Mixture"]
targets = list(df_atlas1.columns.drop(non_target_columns))

In [None]:
# LightGBM parameters passed to lgb.train for every target.
# Note that no random seed entry is set in this mapping.
lgbm_params = dict(
    task='train',            # run in training mode
    boosting_type='gbdt',    # gradient-boosted decision trees
    objective='regression',  # regression objective
    metric='rmse',           # evaluation metric reported during training
    learning_rate=0.1,       # learning rate
)

In [None]:
# Train and cross-validate one LightGBM regressor per odor descriptor and
# store the mean per-fold score of each target in `r2s`.

# The descriptor table does not depend on the target, so its NaN-containing
# columns are dropped once here rather than once per target.
descriptor_table = data4.dropna(axis=1)

# With a fixed random_state the splitter produces the same folds for every
# target, so a single instance is shared by all iterations.
cv = KFold(n_splits=5, random_state=0, shuffle=True)

for target in targets:
    data = df_atlas1.loc[:, ['Molecule', 'CAS', target]]
    # Right join: keep every molecule that has a row in the descriptor table.
    # NOTE(review): molecules without an odor profile would get a NaN target
    # here - confirm that every descriptor row has a matching profile row.
    data = pd.merge(data, descriptor_table, on=['CAS', 'Molecule'], how='right')
    X = data.drop(['Molecule', 'CAS', target], axis=1)
    feature_list = list(X.columns)
    # Target variable (a single odor descriptor)
    y = data[target]
    r2_list = []
    mae_list = []
    rmse_list = []

    for train_index, test_index in cv.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        # KFold yields positional indices, so y must be sliced by position
        # (.iloc) exactly like X; plain y[...] is a label lookup that only
        # happens to agree while the index is the default 0..n-1 range.
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_test = lgb.Dataset(X_test, y_test, reference=lgb_train)

        # Filled by lgb.train with the per-iteration evaluation history.
        lgb_results = {}

        # NOTE(review): the held-out fold is also used as the early-stopping
        # validation set, so the scores below are optimistically biased.
        # NOTE(review): early_stopping_rounds / evals_result / verbose_eval
        # were removed from lgb.train in LightGBM 4.0; move to the
        # `callbacks=` argument when upgrading.
        model = lgb.train(
            lgbm_params,                       # hyperparameters
            lgb_train,                         # training data
            valid_sets=[lgb_train, lgb_test],  # datasets evaluated each round
            valid_names=['Train', 'Test'],     # display names of those sets
            num_boost_round=100,               # maximum number of rounds
            early_stopping_rounds=50,          # early-stopping patience
            evals_result=lgb_results,
            verbose_eval=-1,                   # logging period setting
        )

        y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        # Evaluation: squared Pearson correlation between observed and
        # predicted values (this is not sklearn's r2_score).
        correlation = np.corrcoef(y_test, y_pred)
        r2 = correlation[0, 1] * correlation[0, 1]

        r2_list.append(r2)

    # Average the fold scores and record them under the target's name.
    r2_mean = np.mean(r2_list)
    r2s[target] = r2_mean
print('r2',r2s)


## 決定係数の値が大きい順に表示する

In [None]:
# Turn the {odor descriptor: mean score} mapping into a one-column table,
# ordered from the highest score to the lowest, and display it.
r2_all = r2s
df_all = (
    pd.DataFrame.from_dict(r2_all, orient="index", columns=["R2"])
    .sort_values(by="R2", ascending=False)
)
df_all

Unnamed: 0,R2
40_CEDARWOOD,0.540037
132_GARLIC_ONION,0.534023
131_SULFIDIC,0.502740
130_HOUSEHOLD_GAS,0.496690
86_PEANUT_BUTTER,0.473848
...,...
34_BLACK_PEPPER,0.038536
92_FRESH_TOBACCO_SMOKE,0.037140
88_EGGY_(FRESH_EGGS),0.034784
36_DILL,0.018929


In [None]:
# Display the first 15 rows of the score table (already sorted by R2, descending).
df_all.head(15)

Unnamed: 0,R2
40_CEDARWOOD,0.540037
132_GARLIC_ONION,0.534023
131_SULFIDIC,0.50274
130_HOUSEHOLD_GAS,0.49669
86_PEANUT_BUTTER,0.473848
140_SICKENING,0.444652
10_PEAR,0.442798
6_PINEAPPLE,0.434856
144_HEAVY,0.425145
127_RANCID,0.42204
