In [1]:
# Python : 3.6.13
# Numpy: 1.17.0
# pandas: 0.25.0
# matplotlib: 3.1.1
# scipy: 1.3.1
# scikit-learn: 0.20.0

import os, sys, glob
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib as mpl
#mpl.use('TKAgg',warn=False, force=True) #set MPL backend.
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pickle #save/load python data objects (dictionaries/arrays)
import time
import itertools
from textwrap import wrap #Long figure titles
import multiprocessing
#from memory_profiler import profile #profile memory
from astropy.coordinates import SkyCoord
import astropy.units as u
import re
from tqdm import tqdm

#ML libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import learning_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn import manifold
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## 使用する関数

In [2]:
# ファイルのLoading/saving
def save_obj(obj, name):
    """Pickle ``obj`` to the file ``<name>.pkl``.

    Parameters
    ----------
    obj : object
        Any picklable Python object.
    name : str
        Destination path without the ``.pkl`` extension (it is appended here).
    """
    target_path = name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Load and return the object pickled in ``<name>.pkl``.

    Parameters
    ----------
    name : str
        Source path without the ``.pkl`` extension (it is appended here).

    Returns
    -------
    object
        Whatever object was stored in the pickle file.
    """
    source_path = name + '.pkl'
    # NOTE: unpickling executes code embedded in the file; only load trusted files.
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded
#------------------------------------------------------------------------------------------------------------ 
    
    

    
    
#------------------------------------------------------------------------------------------------------------ 

# データの取得と整理

#------------------------------------------------------------------------------------------------------------ 



# HSC天体とSDSS天体をマッチングさせ、 HSC_df と HSC_classified_df に分類する
def separate_sources(HSC_file, SDSS_df, max_sep_arcsec=1.0, HSC_dir='HSC_sources'):
    """Cross-match one HSC catalogue file against the SDSS catalogue.

    Every SDSS source is matched to its nearest HSC source on the sky; pairs
    closer than ``max_sep_arcsec`` count as matches. Matched HSC rows receive
    the SDSS ``class`` label, and are removed from the unlabeled HSC frame.

    Parameters
    ----------
    HSC_file : str
        File name of a gzip-compressed CSV inside ``HSC_dir``. It must provide
        ``ra`` and ``dec`` columns (used here as degrees).
    SDSS_df : pandas.DataFrame
        SDSS sources with ``ra``, ``dec`` (used here as degrees) and ``class``.
    max_sep_arcsec : float, optional
        Maximum on-sky separation, in arcseconds, for a pair to be a match.
    HSC_dir : str, optional
        Directory that contains ``HSC_file``.

    Returns
    -------
    HSC_df : pandas.DataFrame
        HSC rows that were not matched to any SDSS source (original index kept).
    HSC_classified_df : pandas.DataFrame
        One row per matching SDSS source: the matched HSC row plus ``class``,
        with a fresh 0..n-1 index. If the file has no rows, the same empty
        frame is returned for both outputs.
    """
    # Read the HSC catalogue.
    HSC_path = os.path.join(HSC_dir, HSC_file)
    HSC_df = pd.read_csv(
        HSC_path, compression="gzip", header=0, sep=",", quotechar='"'
    )

    # An empty catalogue cannot be matched against; return it for both outputs.
    if len(HSC_df) == 0:
        return HSC_df, HSC_df

    # Attach units to the ra/dec columns of both catalogues.
    HSC_coords = SkyCoord(ra=HSC_df['ra'].to_numpy()*u.degree, dec=HSC_df['dec'].to_numpy()*u.degree)
    SDSS_coords = SkyCoord(ra=SDSS_df['ra'].to_numpy()*u.degree, dec=SDSS_df['dec'].to_numpy()*u.degree)

    # idx: index of the nearest HSC source for each SDSS source,
    # d2d: on-sky separation (the 3-D distance is not needed here).
    idx, d2d, _ = SDSS_coords.match_to_catalog_sky(HSC_coords)

    # Keep only pairs within the maximum separation.
    within_limit = d2d < max_sep_arcsec * u.arcsec

    # Matched rows from both catalogues, in the same (SDSS) order.
    SDSS_matches = SDSS_df[within_limit].reset_index(drop=True)
    HSC_matches = HSC_df.iloc[idx[within_limit]]

    # Unmatched HSC rows are the sources that still need to be classified.
    HSC_unmatched_df = HSC_df.drop(HSC_matches.index.unique(), axis=0)

    # Re-index the matched HSC rows positionally (without mutating a slice in
    # place) so they line up row-for-row with SDSS_matches, then add the class.
    HSC_classified_df = pd.merge(
        HSC_matches.reset_index(drop=True),
        SDSS_matches[["class"]],
        left_index=True,
        right_index=True,
    )

    return HSC_unmatched_df, HSC_classified_df
#------------------------------------------------------------------------------------------------------------


# separate関数を使って分類されたデータ(HSC_classified_df)とこれから分類するデータ(HSC_new_df)に分ける
def make_df_and_df_new(HSC_files, SDSS_df):
    """Run ``separate_sources`` on every HSC file and collect the results.

    Parameters
    ----------
    HSC_files : list of str
        HSC catalogue file names, passed one by one to ``separate_sources``.
    SDSS_df : pandas.DataFrame
        SDSS catalogue passed unchanged to ``separate_sources``.

    Returns
    -------
    df : pandas.DataFrame
        All labeled (matched) HSC sources from every file, concatenated with a
        fresh index.
    df_news : list of pandas.DataFrame
        The unlabeled HSC sources, one frame per input file, in input order.
    """
    # Column layout used to seed the combined frame.
    columns = ['# object_id', 'ra', 'dec', 'g_cmodel_mag', 'g_cmodel_magerr',
       'r_cmodel_mag', 'r_cmodel_magerr', 'i_cmodel_mag', 'i_cmodel_magerr',
       'z_cmodel_mag', 'z_cmodel_magerr', 'y_cmodel_mag', 'y_cmodel_magerr',
       'g_pixelflags_saturatedcenter', 'r_pixelflags_saturatedcenter',
       'i_pixelflags_saturatedcenter', 'z_pixelflags_saturatedcenter',
       'y_pixelflags_saturatedcenter', 'g_pixelflags_edge',
       'r_pixelflags_edge', 'i_pixelflags_edge', 'z_pixelflags_edge',
       'y_pixelflags_edge', 'g_pixelflags_bad', 'r_pixelflags_bad',
       'i_pixelflags_bad', 'z_pixelflags_bad', 'y_pixelflags_bad',
       'g_pixelflags_bright_objectcenter', 'r_pixelflags_bright_objectcenter',
       'i_pixelflags_bright_objectcenter', 'z_pixelflags_bright_objectcenter',
       'y_pixelflags_bright_objectcenter', 'g_psfflux_mag', 'g_psfflux_magerr',
       'r_psfflux_mag', 'r_psfflux_magerr', 'i_psfflux_mag',
       'i_psfflux_magerr', 'z_psfflux_mag', 'z_psfflux_magerr',
       'y_psfflux_mag', 'y_psfflux_magerr', 'prob_gal', 'prob_qso',
       'prob_star']

    # Start from an empty frame so the result has these columns even when
    # HSC_files is empty; the per-file pieces are appended to this list and
    # concatenated once at the end instead of re-copying the growing frame
    # on every iteration.
    classified_parts = [pd.DataFrame(columns=columns)]
    df_news = []

    # The context manager closes the progress bar even if a file fails.
    with tqdm(total=len(HSC_files), unit='count') as progress:
        for HSC_file in HSC_files:
            HSC_new_df, HSC_classified_df = separate_sources(HSC_file, SDSS_df)
            classified_parts.append(HSC_classified_df)
            df_news.append(HSC_new_df)
            progress.update(1)

    df = pd.concat(classified_parts, axis=0, ignore_index=True, sort=False)

    return df, df_news
#------------------------------------------------------------------------------------------------------------



# 分類したデータの保存
def save_new_sources(df_news, HSC_files=None):
    """Pickle each unlabeled-source frame under ``HSC_new_sources/``.

    The output name for frame ``k`` is the ``<digits>_<digits>_ra_<digits>``
    part of ``HSC_files[k]`` (e.g. ``1_0_ra_20`` from ``1_0_ra_20.csv.gz``).

    Parameters
    ----------
    df_news : list of pandas.DataFrame
        Frames to save, in the same order as ``HSC_files``.
    HSC_files : list of str, optional
        Source file names. When omitted, the module-level ``HSC_files``
        variable is used, which is what this function originally relied on.

    Raises
    ------
    NameError
        If ``HSC_files`` is omitted and no module-level ``HSC_files`` exists.
    ValueError
        If there are more frames than file names, or a file name does not
        contain the expected pattern.
    """
    if HSC_files is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            HSC_files = globals()['HSC_files']
        except KeyError:
            raise NameError("name 'HSC_files' is not defined; pass HSC_files explicitly") from None

    if len(df_news) > len(HSC_files):
        raise ValueError(
            'got {0} data frames but only {1} file names'.format(len(df_news), len(HSC_files))
        )

    # Compile once; the pattern does not change between files.
    pattern = re.compile(r'(\d+_\d+_ra_\d+)')

    for df_new, HSC_file in zip(df_news, HSC_files):
        found = pattern.search(HSC_file)
        if found is None:
            raise ValueError(
                'cannot derive an output name from file name {0!r}'.format(HSC_file)
            )
        save_obj(df_new, 'HSC_new_sources/' + found.group(1))
#------------------------------------------------------------------------------------------------------------         
        

    
    
        
#------------------------------------------------------------------------------------------------------------ 

# ランダムフォレストの関数

#------------------------------------------------------------------------------------------------------------ 



def prepare_data(df, feature_columns, train_percent=0.5):
    """Split ``df`` into stratified training and test sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the feature columns and a ``class`` column.
    feature_columns : iterable of str
        Names of the columns used as features.
    train_percent : float, optional
        Fraction of rows used for training; the remainder is the test set.

    Returns
    -------
    dict
        Keys ``features_train``, ``features_test``, ``classes_train``,
        ``classes_test`` (the four outputs of ``train_test_split``),
        ``class_names`` (``np.unique`` of the class column) and
        ``feature_names`` (list of feature column names).
    """
    all_features = df[list(feature_columns)]
    all_classes = df['class']

    # Fixed random_state keeps the split reproducible; stratify on the class.
    split_parts = train_test_split(
        all_features,
        all_classes,
        train_size=train_percent,
        test_size=(1-train_percent),
        random_state=0,
        stratify=all_classes,
    )
    features_train, features_test, classes_train, classes_test = split_parts

    prepared = {
        'features_train': features_train,
        'features_test': features_test,
        'classes_train': classes_train,
        'classes_test': classes_test,
        'class_names': np.unique(all_classes),  # numpy.ndarray
        'feature_names': list(all_features),    # list of column names
    }
    return prepared
#------------------------------------------------------------------------------------------------------------




def RF_fit(df, n_estimators, n_jobs=-1):
    """Fit a random-forest pipeline on the training split.

    Parameters
    ----------
    df : dict
        Must provide ``features_train`` and ``classes_train`` (as produced by
        ``prepare_data``).
    n_estimators : int
        Passed to ``RandomForestClassifier`` as ``n_estimators``.
    n_jobs : int, optional
        Passed to ``RandomForestClassifier`` as ``n_jobs``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline with a single step named ``classification``.
    """
    print('Fitting a random forest model to the data...')
    forest = RandomForestClassifier(
        n_estimators=n_estimators,
        n_jobs=n_jobs,
        random_state=0,
        class_weight='balanced',
    )
    fitted_pipeline = Pipeline(steps=[('classification', forest)])
    fitted_pipeline.fit(df['features_train'], df['classes_train'])
    return fitted_pipeline
#------------------------------------------------------------------------------------------------------------




def RF_classify(pipeline, data, n_jobs=-1, proba=False):
    """Predict classes or class probabilities for the test split.

    Parameters
    ----------
    pipeline : object
        Fitted model exposing ``predict`` and ``predict_proba``.
    data : dict
        Must provide ``features_test``.
    n_jobs : int, optional
        Accepted for call compatibility; not used by this function.
    proba : bool, optional
        ``False`` returns ``pipeline.predict(...)``, ``True`` returns
        ``pipeline.predict_proba(...)``.

    Returns
    -------
    The prediction output of the pipeline, or ``None`` when ``proba`` compares
    equal to neither ``True`` nor ``False``.
    """
    print('Classifying objects using random forest model...')
    if proba == True:
        return pipeline.predict_proba(data['features_test'])
    if proba == False:
        return pipeline.predict(data['features_test'])
    return None
#------------------------------------------------------------------------------------------------------------



# trainingの精度を確かめる
def metrics(df, classes_pred_all):
    """Summarise test-set performance as a report and a confusion matrix.

    Parameters
    ----------
    df : dict
        Must provide ``classes_test`` (true labels) and ``class_names``.
    classes_pred_all : array-like
        Predicted labels for the test set.

    Returns
    -------
    report_df : pandas.DataFrame
        Transposed ``classification_report`` output (4-digit setting).
    cm_df : pandas.DataFrame
        Confusion matrix with ``class_names`` as both row and column labels.
    """
    true_classes = df['classes_test']
    class_names = df['class_names']

    report_dict = classification_report(
        true_classes,
        classes_pred_all,
        target_names=np.unique(class_names),
        digits=4,
        output_dict=True,
    )
    report_df = pd.DataFrame(report_dict).transpose()

    # cm = confusion matrix
    confusion = confusion_matrix(true_classes, classes_pred_all, labels=class_names)
    cm_df = pd.DataFrame(confusion, index=class_names, columns=class_names)

    return report_df, cm_df
#------------------------------------------------------------------------------------------------------------



def train_vs_f1score(df, place, sampleG=False, feature_columns=None, random_state=0):
    """Measure per-class F1, precision and recall versus training-set size.

    The data are first split 50/50 with ``prepare_data``. For each fraction in
    a fixed list, that fraction of the training half is used to fit a random
    forest, which is always scored on the same test half. The three score
    lists are pickled under ``HSC_ML_save/<place>/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Labeled sources; needs the feature columns and a ``class`` column with
        the labels ``GALAXY``, ``QSO`` and ``STAR``.
    place : str
        Sub-directory of ``HSC_ML_save/`` in which the results are saved.
    sampleG : bool, optional
        When true, only every 6th galaxy of the training subset is kept (to
        reduce class imbalance) and the training rows are shuffled. Output
        file names then get the ``_sampleG`` suffix.
    feature_columns : list of str, optional
        Feature column names. When omitted, the module-level
        ``feature_columns`` variable is used, which is what this function
        originally relied on.
    random_state : int or None, optional
        Seed for the shuffle performed when ``sampleG`` is true, so repeated
        runs give the same result. ``None`` gives an unseeded shuffle.

    Returns
    -------
    None
        Results are written with ``save_obj``. The saved f1-score list holds
        the list of training fractions as its first element, followed by one
        per-class score array per fraction.
    """
    if feature_columns is None:
        # Backward compatibility with callers that rely on the notebook global.
        try:
            feature_columns = globals()['feature_columns']
        except KeyError:
            raise NameError("name 'feature_columns' is not defined; pass feature_columns explicitly") from None

    train_range = [0.001, 0.003, 0.01, 0.06, 0.12, 0.2, 0.4, 0.6, 0.8, 1.0]

    # train_range is stored as element 0 so it can serve as the x axis when plotting.
    f1scores = [train_range]
    precisions = []
    recalls = []
    print('Looping over these possible train percentages: {0}'.format(train_range))

    # Split once into a training half and a fixed test half.
    data_prep_dict_all = prepare_data(df, feature_columns, train_percent=0.5)
    # Random-forest pipeline, re-fitted for each training fraction.
    pipeline = Pipeline([ ('classification', RandomForestClassifier(n_jobs=-1, n_estimators=200, random_state=0, class_weight='balanced')) ])
    # Dedicated generator so the shuffle is reproducible and leaves the global
    # numpy random state untouched.
    shuffle_rng = np.random.RandomState(random_state)

    for i in train_range:
        print('train percent is: {0}'.format(i))
        if i != 1.0:
            # Take a stratified fraction i of the training half; the remainder is unused.
            features_train, _, classes_train, _ = train_test_split(data_prep_dict_all['features_train'], data_prep_dict_all['classes_train'], train_size=i, test_size=(1-i), random_state=0, stratify=data_prep_dict_all['classes_train'])
        else:
            # train_test_split cannot return 100%, so use the training half directly.
            features_train = data_prep_dict_all['features_train']
            classes_train = data_prep_dict_all['classes_train']

        print('number of sources available for training {0}'.format(len(features_train)))

        galaxy_mask = classes_train == 'GALAXY'
        quasar_mask = classes_train == 'QSO'
        star_mask = classes_train == 'STAR'

        if sampleG:
            print('sampling galaxies to fix class imbalance...')
            # Keep every 6th galaxy (about 16%); quasars and stars are kept in full.
            galaxy_features = features_train[galaxy_mask][0::6]
            galaxy_classes = classes_train[galaxy_mask][0::6]
            quasar_features = features_train[quasar_mask]
            quasar_classes = classes_train[quasar_mask]
            star_features = features_train[star_mask]
            star_classes = classes_train[star_mask]
            n_galaxy, n_quasar, n_star = len(galaxy_features), len(quasar_features), len(star_features)

            # Recombine the three classes into one training set.
            features_train = pd.concat([galaxy_features, quasar_features, star_features])
            classes_train = pd.concat([galaxy_classes, quasar_classes, star_classes])
            print('Training on {0}... G: {1}, Q: {2}, S: {3}'.format(len(features_train), n_galaxy, n_quasar, n_star))

            # Shuffle features and labels with the same permutation.
            p = shuffle_rng.permutation(len(features_train))
            features_train = np.array(features_train)[p]
            classes_train = np.array(classes_train)[p]
        else:
            n_galaxy = len(classes_train[galaxy_mask])
            n_quasar = len(classes_train[quasar_mask])
            n_star = len(classes_train[star_mask])
            print('Training on {0}... G: {1}, Q: {2}, S: {3}'.format(len(features_train), n_galaxy, n_quasar, n_star))

        # Fit the model on the selected training data.
        pipeline.fit(features_train, classes_train)
        # Always score on the original test half from the first split.
        classes_pred = pipeline.predict(data_prep_dict_all['features_test'])
        f1score = f1_score(data_prep_dict_all['classes_test'], classes_pred, average=None) #numpy.ndarray
        precision = precision_score(data_prep_dict_all['classes_test'], classes_pred, average=None) #numpy.ndarray
        recall = recall_score(data_prep_dict_all['classes_test'], classes_pred, average=None) #numpy.ndarray
        # With average=None, scikit-learn reports scores in sorted label order: GALAXY, QSO, STAR.
        print('  f1score  ', f1score)
        print(' precision ', precision)
        print('    recall    ',recall)
        f1scores.append(f1score)
        precisions.append(precision)
        recalls.append(recall)
        print('-'*30)

    # Scores of the last (100%) iteration.
    print(f1score)

    suffix = '_sampleG' if sampleG else ''
    save_obj(f1scores, 'HSC_ML_save/' + place +'/train_vs_f1score' + suffix)
    save_obj(precisions, 'HSC_ML_save/' + place +'/train_vs_precision' + suffix)
    save_obj(recalls, 'HSC_ML_save/' + place +'/train_vs_recall' + suffix)
#------------------------------------------------------------------------------------------------------------


## SDSSの分類されたデータとHSCデータを読み込む

In [3]:
# Load the SDSS catalogue and keep only position and class, without duplicates.
# Counts recorded by the author after de-duplication:
# total : 3099393    G: 2209270    Q :377887   S:512236
SDSS_ra_dec_class = ['ra', 'dec', 'class']
SDSS_df = load_obj('SDSS_spec_xmwise_all')
SDSS_df = SDSS_df[SDSS_ra_dec_class].drop_duplicates()


# HSC catalogue files, one per right-ascension range.
HSC_files = [
    "1_0_ra_20.csv.gz",
    "2_20_ra_30.csv.gz",
    "3_30_ra_40.csv.gz",
    "4_40_ra_60.csv.gz",
    "5_60_ra_120.csv.gz",
    "6_120_ra_140.csv.gz",
    "7_140_ra_160.csv.gz",
    "8_160_ra_180.csv.gz",
    "9_180_ra_200.csv.gz",
    "10_200_ra_210.csv.gz",
    "11_210_ra_220.csv.gz",
    "12_220_ra_240.csv.gz",
    "13_240_ra_300.csv.gz",
    "14_300_ra_330.csv.gz",
    "15_330_ra_360.csv.gz",
]
# Cross-match every file against SDSS (takes a long time).
df, df_news = make_df_and_df_new(HSC_files, SDSS_df)

# Save the labeled sources and the unlabeled sources.
save_obj(df, 'HSC_match/matched_source')
save_new_sources(df_news)

# On later runs, load the saved labeled sources instead of re-matching:
# df = load_obj('HSC_match/matched_source')

## 天体数

In [None]:
# Number of rows in each HSC file
'''
"1_0_ra_20.csv.gz"        :     7,001,783 rows
"2_20_ra_30.csv.gz"       :     4,008,557 rows 
"3_30_ra_40.csv.gz"       :     7,538,652 rows
"4_40_ra_60.csv.gz"       :        18,150 rows
"5_60_ra_120.csv.gz"      :             0 rows
"6_120_ra_140.csv.gz"     :    2,917,255 rows
"7_140_ra_160.csv.gz"     :    8,144,439 rows
"8_160_ra_180.csv.gz"     :    9,318,738 rows
"9_180_ra_200.csv.gz"     :    9,612,413 rows
"10_200_ra_210.csv.gz"    :    5,536,240 rows
"11_210_ra_220.csv.gz"    :    4,955,656 rows
"12_220_ra_240.csv.gz"    :    4,183,360 rows
"13_240_ra_300.csv.gz"    :    1,030,939 rows
"14_300_ra_330.csv.gz"    :       32,787 rows
"15_330_ra_360.csv.gz"    :    9,307,158 rows

total : 73,606,127
'''


# Number of matches per file. Original note: "47301".
# NOTE(review): the per-file counts below sum to 201,940 — confirm what 47301 refers to.
'''
"1_0_ra_20.csv.gz"        :       28,885 rows
"2_20_ra_30.csv.gz"       :       15,763 rows
"3_30_ra_40.csv.gz"       :       27,413 rows
"4_40_ra_60.csv.gz"       :           85 rows
"5_60_ra_120.csv.gz"      :            0 rows
"6_120_ra_140.csv.gz"     :        5,421 rows
"7_140_ra_160.csv.gz"     :       18,176 rows
"8_160_ra_180.csv.gz"     :       21,380 rows
"9_180_ra_200.csv.gz"     :       20,164 rows
"10_200_ra_210.csv.gz"    :       12,850 rows
"11_210_ra_220.csv.gz"    :       12,869 rows
"12_220_ra_240.csv.gz"    :        9,421 rows
"13_240_ra_300.csv.gz"    :        2,134 rows
"14_300_ra_330.csv.gz"    :          116 rows
"15_330_ra_360.csv.gz"    :       27,263 rows

total : 201,940
'''

## モデルの作成

In [4]:
# Parameter settings
train_percent = 0.5 # fraction of the data used for training
n_jobs=-1 # number of parallel jobs for scikit-learn estimators (-1 uses all processors); not a per-tree feature limit
n_estimators = 200 # number of decision trees in the forest

place = 'psf_ishape_color' # sub-directory of HSC_ML_save/ where results for this feature set are saved

## 様々な特徴のモデルを作成

In [6]:
# # 特徴量[psf , resolved_i, ishape]

# df['resolved_i'] = np.sqrt((df.i_psfflux_mag - df.i_cmodel_mag)**2)
# df = df.dropna(subset=['ishape']) # ishapeの欠損値をなくす  447天体欠損　合計201,493天体

# psf = ['g_psfflux_mag','r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag', 'y_psfflux_mag']
# feature_columns = psf + ['resolved_i'] + ['ishape']
# feature_labels = ['g','r','i','z','y', '$\mathrm{resolved}_\mathrm{i}$ ', '$\mathrm{ishape}$']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨


# save_obj(pipeline, 'HSC_ML_save/psf_resolved_i_ishape/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/psf_resolved_i_ishape/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/psf_resolved_i_ishape/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/psf_resolved_i_ishape/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る


# save_obj(df, 'HSC_ML_save/psf_resolved_i_ishape/df_spec_classprobs')
# #------------------------------------------------------------------------------------------------------------

In [7]:
# # 特徴量[psf , resolved_i]

# df['resolved_i'] = np.sqrt((df.i_psfflux_mag - df.i_cmodel_mag)**2)
# psf = ['g_psfflux_mag','r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag', 'y_psfflux_mag']
# feature_columns = psf + ['resolved_i']
# feature_labels = ['g','r','i','z','y', '$\mathrm{resolved}_\mathrm{i}$ ']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# save_obj(pipeline, 'HSC_ML_save/psf_resolved_i/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/psf_resolved_i/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/psf_resolved_i/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/psf_resolved_i/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/psf_resolved_i/df_spec_classprobs')

In [8]:
# # 特徴量[psf , ishape]

# # ishapeの欠損値をなくす  447天体欠損　合計201,493天体
# df = df.dropna(subset=['ishape'])

# psf = ['g_psfflux_mag','r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag', 'y_psfflux_mag']
# feature_columns = psf + ['ishape']
# feature_labels = ['g','r','i','z','y', '$\mathrm{ishape}$']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# save_obj(pipeline, 'HSC_ML_save/psf_ishape/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/psf_ishape/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/psf_ishape/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/psf_ishape/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/psf_ishape/df_spec_classprobs')

In [9]:
# # 特徴量[cmodel , ishape]

# # ishapeの欠損値をなくす  447天体欠損　合計201,493天体
# df = df.dropna(subset=['ishape'])

# # cmodel magnitude
# cmodel = ['g_cmodel_mag', 'r_cmodel_mag', 'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag']
# feature_columns = cmodel + ['ishape']
# feature_labels = ['g','r','i','z','y', '$\mathrm{ishape}$']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# # cmodel+ishape
# save_obj(pipeline, 'HSC_ML_save/cmodel_ishape/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/cmodel_ishape/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/cmodel_ishape/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/cmodel_ishape/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/cmodel_ishape/df_spec_classprobs')

In [10]:
# # 特徴量[psf]

# psf = ['g_psfflux_mag','r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag', 'y_psfflux_mag']
# feature_columns = psf
# feature_labels = ['g','r','i','z','y']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# save_obj(pipeline, 'HSC_ML_save/psf/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/psf/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/psf/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/psf/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/psf/df_spec_classprobs')

In [11]:
# # 特徴量[ishape]

# # ishapeの欠損値をなくす  447天体欠損　合計201,493天体
# df = df.dropna(subset=['ishape'])
# feature_columns = ['ishape']
# feature_labels = ['ishape']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# save_obj(pipeline, 'HSC_ML_save/ishape/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/ishape/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/ishape/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/ishape/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# # ishape
# save_obj(df, 'HSC_ML_save/ishape/df_spec_classprobs')

In [5]:
# Features [psf, ishape, color] — noted by the author as the most accurate set.

# Drop rows with a missing ishape (author's note: 447 sources missing, 201,493 remain).
# .copy() gives an independent frame so the column assignments below do not
# trigger SettingWithCopyWarning.
df = df.dropna(subset=['ishape']).copy()

# Colors: differences between PSF magnitudes of adjacent bands.
df['g-r'] = df.g_psfflux_mag - df.r_psfflux_mag
df['r-i'] = df.r_psfflux_mag - df.i_psfflux_mag
df['i-z'] = df.i_psfflux_mag - df.z_psfflux_mag
df['z-y'] = df.z_psfflux_mag - df.y_psfflux_mag

psf = ['g_psfflux_mag','r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag', 'y_psfflux_mag']
color = ['g-r', 'r-i', 'i-z', 'z-y']
feature_columns = psf + ['ishape'] + color
# Raw string: '\m' is not a valid escape sequence in a normal string literal.
feature_labels = ['g','r','i','z','y', r'$\mathrm{ishape}$', 'g-r', 'r-i', 'i-z', 'z-y']
place = 'psf_ishape_color'
save_dir = 'HSC_ML_save/' + place

print('features used are:')
print(df[feature_columns].columns)

data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# These two expressions display nothing here; evaluate them in their own cells to view them.
report_df
cm_df

save_obj(pipeline, save_dir + '/rf_pipeline') # fitted pipeline containing the classifier
save_obj(data_prep_dict_all, save_dir + '/data_prep_dict_all') # the training/test split
save_obj(classes_pred_all, save_dir + '/classes_pred_all') # predicted classes for the test data
save_obj(classes_pred_all_proba, save_dir + '/classes_pred_all_proba') # class probabilities for the test data

# Remove prediction columns left over from a previous run of this cell;
# otherwise the joins below fail because the columns already exist.
df = df.drop(columns=['class_pred', 'prob_g', 'prob_q', 'prob_s', 'prob_best'], errors='ignore')

# Attach predictions to the rows of the test set; training rows get NaN.
test_index = data_prep_dict_all['features_test'].index
df_predclass = pd.DataFrame(classes_pred_all, index=test_index, columns=['class_pred'])
df = df.join(df_predclass, how='left')

df_proba = pd.DataFrame(classes_pred_all_proba, index=test_index, columns=['prob_g', 'prob_q', 'prob_s'])
# Append probabilities to the original df for test data:
df = df.join(df_proba, how='left')
df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # row-wise maximum of the three probabilities

save_obj(df, save_dir + '/df_spec_classprobs')

features used are:
Index(['g_psfflux_mag', 'r_psfflux_mag', 'i_psfflux_mag', 'z_psfflux_mag',
       'y_psfflux_mag', 'ishape', 'g-r', 'r-i', 'i-z', 'z-y'],
      dtype='object')
Fitting a random forest model to the data...
Classifying objects using random forest model...
Classifying objects using random forest model...


In [25]:
# # 特徴量[cmodel , ishape, color]

# # ishapeの欠損値をなくす  447天体欠損　合計201,493天体
# df = df.dropna(subset=['ishape'])

# df['g-r'] = df.g_psfflux_mag - df.r_psfflux_mag
# df['r-i'] = df.r_psfflux_mag - df.i_psfflux_mag
# df['i-z'] = df.i_psfflux_mag - df.z_psfflux_mag
# df['z-y'] = df.z_psfflux_mag - df.y_psfflux_mag

# cmodel = ['g_cmodel_mag', 'r_cmodel_mag', 'i_cmodel_mag', 'z_cmodel_mag', 'y_cmodel_mag']
# color = ['g-r', 'r-i', 'i-z', 'z-y']
# feature_columns = cmodel + ['ishape'] + color
# feature_labels = ['g','r','i','z','y', '$\mathrm{ishape}$', 'g-r', 'r-i', 'i-z', 'z-y']
# place = 'cmodel_ishape_color'

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# report_df #違うセルでの実行推奨
# cm_df #違うセルでの実行推奨

# save_obj(pipeline, 'HSC_ML_save/'+place+'/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/'+place+'/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/'+place+'/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/'+place+'/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/'+place+'/df_spec_classprobs')

In [26]:
# # 特徴量[ishape, color]

# # ishapeの欠損値をなくす  447天体欠損　合計201,493天体
# df = df.dropna(subset=['ishape'])

# df['g-r'] = df.g_psfflux_mag - df.r_psfflux_mag
# df['r-i'] = df.r_psfflux_mag - df.i_psfflux_mag
# df['i-z'] = df.i_psfflux_mag - df.z_psfflux_mag
# df['z-y'] = df.z_psfflux_mag - df.y_psfflux_mag

# color = ['g-r', 'r-i', 'i-z', 'z-y']
# feature_columns = ['ishape'] + color
# feature_labels = ['$\mathrm{ishape}$', 'g-r', 'r-i', 'i-z', 'z-y']

# print('features used are:')
# print(df[feature_columns].columns)

# data_prep_dict_all = prepare_data(df, feature_columns, train_percent)
# pipeline = RF_fit(data_prep_dict_all, n_estimators, n_jobs=-1)
# classes_pred_all = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=False) #<class 'numpy.ndarray'>
# classes_pred_all_proba = RF_classify(pipeline, data_prep_dict_all, n_jobs=-1, proba=True) #<class 'numpy.ndarray'>
# report_df, cm_df = metrics(data_prep_dict_all, classes_pred_all)

# save_obj(pipeline, 'HSC_ML_save/ishape_color/rf_pipeline') # pipelineのclassificationが入ってる
# save_obj(data_prep_dict_all, 'HSC_ML_save/ishape_color/data_prep_dict_all') #訓練データとトレーニングデータに分けたもの
# save_obj(classes_pred_all, 'HSC_ML_save/ishape_color/classes_pred_all') #テストデータをモデルにフィットさせ、返ってきたテストデータのクラス
# save_obj(classes_pred_all_proba,'HSC_ML_save/ishape_color/classes_pred_all_proba') #テストデータのクラスの分類確率

# df_predclass = pd.DataFrame(classes_pred_all, index=data_prep_dict_all['features_test'].index, columns=['class_pred'])
# df = df.join(df_predclass, how='left')

# df_proba = pd.DataFrame(classes_pred_all_proba, index=data_prep_dict_all['features_test'].index, columns=['prob_g', 'prob_q', 'prob_s'])
# # Append probabilities to the original df for test data:
# df = df.join(df_proba, how='left')
# df['prob_best'] = df[['prob_g', 'prob_q', 'prob_s']].max(axis=1) # max(axis=1)で水平方向の最大を求めることで、prob_bestに、最大値が入る

# save_obj(df, 'HSC_ML_save/ishape_color/df_spec_classprobs')

In [13]:
# Run the training-size study twice: first with galaxy subsampling
# (sampleG=True), then on the unmodified training data (sampleG=False).
train_vs_f1score(df, place, sampleG=True)
train_vs_f1score(df, place, sampleG=False)

Looping over these possible train percentages: [0.001, 0.003, 0.01, 0.06, 0.12, 0.2, 0.4, 0.6, 0.8, 1.0]
train percent is: 0.001
number of sources available for training 100
sampling galaxies to fix class imbalance...
Training on 32... G: 14, Q: 14, S: 4
  f1score   [0.95320268 0.74790298 0.70767949]
 precision  [0.96126144 0.68812444 0.86372881]
    recall     [0.94527792 0.81905576 0.59938838]
------------------------------
train percent is: 0.003
number of sources available for training 302
sampling galaxies to fix class imbalance...
Training on 97... G: 42, Q: 42, S: 13
  f1score   [0.98396951 0.88924969 0.83882192]
 precision  [0.98641006 0.86353573 0.88909378]
    recall     [0.981541   0.91654205 0.79393084]
------------------------------
train percent is: 0.01
number of sources available for training 1007
sampling galaxies to fix class imbalance...
Training on 321... G: 138, Q: 140, S: 43
  f1score   [0.98797024 0.90944868 0.86939712]
 precision  [0.98885851 0.89768313 0.893691

## Cross Validation

In [1]:
# data_prep_dict_all = prepare_data(df, feature_columns, train_percent=0.5)


# # n_estimators
# print('cross-validating...')
# all_scores = []
# for n_estimators in [20, 50, 100, 200, 500, 1000]:
#     print(n_estimators)
#     rfc = RandomForestClassifier(n_jobs=-1, n_estimators=n_estimators, random_state=0, class_weight='balanced')
#     scores = cross_validate(rfc, data_prep_dict_all['features_train'], data_prep_dict_all['classes_train'], scoring='f1_weighted', cv=5, n_jobs=-1, return_train_score=True)
#     all_scores.append([n_estimators, scores])
#     print('-'*30)
    
# print('cv_scores_leaf')
# print(all_scores)



# # max_features
# print('cross-validating...')
# all_scores = []
# for feat in [2,3,4,5,6]:
#     print(feat)
#     rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, max_features=feat, random_state=0, class_weight='balanced')
#     scores = cross_validate(rfc, data_prep_dict_all['features_train'], data_prep_dict_all['classes_train'], scoring='f1_weighted', cv=5, n_jobs=-1, return_train_score=True)
#     all_scores.append([feat, scores])
#     print('-'*30)
    
    
    
# # min_samples_leaf
# print('cross-validating...')
# all_scores = []
# for leaf in [1,5, 10, 50, 100, 500]:
#     print(leaf)
#     rfc = RandomForestClassifier(n_jobs=-1, n_estimators=100, min_samples_leaf=leaf, random_state=0, class_weight='balanced')
#     scores = cross_validate(rfc, data_prep_dict_all['features_train'], data_prep_dict_all['classes_train'], scoring='f1_weighted', cv=5, n_jobs=-1, return_train_score=True)
#     all_scores.append([leaf, scores])
#     print('-'*30)