In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# Read the training and test metadata tables in a single pass over their paths.
meta_data, test_meta_data = map(
    pd.read_csv,
    ('../input/training_set_metadata.csv', '../input/test_set_metadata.csv'),
)

In [None]:
# Distinct class labels seen in training, plus the extra label 99 at the end.
classes = np.unique(meta_data['target'])
classes_all = np.append(classes, 99)

# {class label: column index}; the index is the position of the label in
# classes_all and is what the submission columns are ordered by (0, 1, ... 14).
target_map = dict(zip(classes_all, range(len(classes_all))))

# Translate every training label into its index and store it as 'target_id'.
target_ids = meta_data['target'].map(target_map).tolist()
meta_data['target_id'] = target_ids
meta_data.head()

In [None]:
# Split the training metadata into galactic rows (hostgal_specz exactly 0)
# and extragalactic rows (everything else).
galactic_cut = meta_data['hostgal_specz'] == 0
galactic_data = meta_data.loc[galactic_cut]
extragalactic_data = meta_data.loc[~galactic_cut]

# Class indices observed in each group, each extended with index 14 (class_99).
galactic_classes = np.append(np.unique(galactic_data['target_id']), 14)
extragalactic_classes = np.append(np.unique(extragalactic_data['target_id']), 14)

***
# EDA

In [None]:
# hostgal_specz against class label:
# panel 221 = rows outside the galactic cut, panel 222 = all rows.
plt.figure(figsize=(20,20))

for panel, frame in ((221, meta_data[~galactic_cut]), (222, meta_data)):
    plt.subplot(panel)
    plt.scatter(frame['hostgal_specz'], frame['target'])
    plt.xlabel('hostgal_specz')
    plt.ylabel('classes')
    plt.yticks(classes_all)

plt.show()

In [None]:
# hostgal_photoz against class label:
# panel 221 = rows outside the galactic cut, panel 222 = all rows.
plt.figure(figsize=(20,20))

for panel, frame in ((221, meta_data[~galactic_cut]), (222, meta_data)):
    plt.subplot(panel)
    plt.scatter(frame['hostgal_photoz'], frame['target'])
    plt.xlabel('hostgal_photoz')
    plt.ylabel('classes')
    plt.yticks(classes_all)

plt.show()

In [None]:
# distmod against class label:
# panel 221 = rows outside the galactic cut, panel 222 = all rows.
plt.figure(figsize=(20,20))

for panel, frame in ((221, meta_data[~galactic_cut]), (222, meta_data)):
    plt.subplot(panel)
    plt.scatter(frame['distmod'], frame['target'])
    plt.xlabel('distmod')
    plt.ylabel('classes')
    plt.yticks(classes_all)

plt.show()

In [None]:
# mwebv against class label:
# panel 221 = rows outside the galactic cut, panel 222 = all rows.
plt.figure(figsize=(20,20))

for panel, frame in ((221, meta_data[~galactic_cut]), (222, meta_data)):
    plt.subplot(panel)
    plt.scatter(frame['mwebv'], frame['target'])
    plt.xlabel('mwebv')
    plt.ylabel('classes')
    plt.yticks(classes_all)

plt.show()

In [None]:
# Flag column: 1 where hostgal_photoz_err exceeds 0.35, 0 otherwise
# (rows with a missing error compare False and therefore get 0).
meta_data['photoz_big_err'] = (meta_data['hostgal_photoz_err'] > 0.35).astype(int)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns
# of meta_data; being the last expression, the result is rendered as cell output.
meta_data.describe()

In [None]:
# Relations between spectroscopic redshift, photometric redshift and the
# photometric error, with points coloured by the photoz_big_err flag.
plt.figure(figsize=(20,20))

# The flag was previously assigned to `color` twice; once is enough.
color = meta_data['photoz_big_err']

# (subplot position, x column, y column) for each panel of the 2x2 grid.
panels = (
    (221, 'hostgal_specz', 'hostgal_photoz'),
    (222, 'hostgal_specz', 'hostgal_photoz_err'),
    (223, 'hostgal_photoz', 'hostgal_photoz_err'),
)
for panel, x_col, y_col in panels:
    plt.subplot(panel)
    plt.scatter(meta_data[x_col], meta_data[y_col], c = color)
    plt.xlabel(x_col)
    plt.ylabel(y_col)
    if panel == 222:
        # Only this panel uses explicit y ticks, in steps of 0.1.
        plt.yticks(np.arange(0,2,0.1))

# Finish with plt.show() like the other plotting cells, so the cell does not
# end on a label call whose Text repr would be printed as output.
plt.show()

In [None]:
#test_meta_data[test_meta_data['hostgal_specz'].isnull()]

In [None]:
# hostgal_photoz against class label:
# panel 221 = all rows, panel 222 = only rows whose photoz_big_err flag is 0.
plt.figure(figsize=(15,15))

small_err_rows = meta_data[meta_data['photoz_big_err'] == 0]
for panel, frame in ((221, meta_data), (222, small_err_rows)):
    plt.subplot(panel)
    plt.scatter(frame['hostgal_photoz'], frame['target'])
    plt.xlabel('hostgal_photoz')
    plt.ylabel('classes')
    plt.yticks(classes_all)

# Panel 222 is still the current axes here, so the x ticks apply to it only.
plt.xticks(np.arange(0,3,0.1))

plt.show()

First: `hostgal_specz >= 1.1` => classes 88, 95, 99 (indices 10, 13, 14).

Second: `hostgal_photoz >= 1.1` and `photoz_big_err == 0` => classes 88, 95, 99 (indices 10, 13, 14).

Third: every other object keeps the even probabilities computed in the previous rounds.



***

# Weights

Weights are based on this discussion: https://www.kaggle.com/c/PLAsTiCC-2018/discussion/67194. Note, however, that class_99 apparently has different weights in the Galactic and Extragalactic groups!

It is also good to check this kernel for more precise calculation of weights: https://www.kaggle.com/ganfear/calculate-exact-class-weights

In [None]:
# Weighted probabilities for Milky Way galaxy.
# Indices 14 and 5 carry their own weights; every other galactic class index
# shares one common value. Indices outside galactic_classes stay at 0.
galactic_special_weights = {14: 0.014845745, 5: 0.196867058}
galactic_probabilities = np.zeros(15)
for x in galactic_classes:
    galactic_probabilities[x] = galactic_special_weights.get(x, 0.197071799)

In [None]:
# Weighted probabilities for Extra Galaxies.
# Indices 14, 7, 1, 10 and 13 carry their own weights; every other
# extragalactic class index shares one common value. Indices outside
# extragalactic_classes stay at 0.
extragalactic_special_weights = {
    14: 0.147286644,
    7: 0.15579259,
    1: 0.155388186,
    10: 0.076512622,
    13: 0.076512622,
}
extragalactic_probabilities = np.zeros(15)
for x in extragalactic_classes:
    extragalactic_probabilities[x] = extragalactic_special_weights.get(x, 0.077701467)

In [None]:
# Weighted probabilities for Remote Classes (the row do_prediction assigns to
# objects whose redshift is at least 1.2).
# Every branch must write into bigz_probabilities. The branches for indices
# 7 and 1 used to write into extragalactic_probabilities, which overwrote the
# extragalactic weights and left these two entries of the big-z row at zero.
bigz_probabilities = np.zeros(15)
for x in extragalactic_classes:
    if x == 14:
        bigz_probabilities[x] = 0.398923589
    elif x == 10 or x == 13:
        bigz_probabilities[x] = 0.207233249
    elif x == 7:
        bigz_probabilities[x] = 0.041550573
    elif x == 1:
        bigz_probabilities[x] = 0.041442716
    else:
        # Remaining extragalactic class indices share one common value.
        bigz_probabilities[x] = 0.020723325

#p = (1 - (5*0.077340579/2 + 0.154666479/2 + 0.155069005/2 + 0.148880461/2))/2
#p = 0.28867029

***

In [None]:
#test_meta_data['object_id'].count()
#test_meta_data[test_meta_data['hostgal_specz'] >= 1.1]['object_id'].count()
#( np.isnan(row['hostgal_specz']) ) and (row['hostgal_photoz'] >= 1.2 and row['hostgal_photoz_err'] <= 0.3
#test_meta_data[(test_meta_data['hostgal_photoz'] >= 1.1) & (test_meta_data['hostgal_photoz_err'] <= 0.35)]['object_id'].count()
#x = 84239 / 3492890 * 0.7037
#x = int(x)
#y = 1 - x
#print(x, y)
#type(x)

In [None]:
# Apply this prediction to test_meta_data table
import tqdm
def do_prediction(table):
    """Assign one of the three fixed probability rows to every row of ``table``.

    Rules, in order of precedence:

    1. ``hostgal_specz >= 1.2``, or ``hostgal_specz`` is NaN while
       ``hostgal_photoz >= 1.2`` and ``hostgal_photoz_err <= 0.3``
       -> ``bigz_probabilities``
    2. ``hostgal_photoz == 0`` -> ``galactic_probabilities``
    3. anything else -> ``extragalactic_probabilities``

    The rules are evaluated with vectorised masks over whole columns instead
    of a Python-level ``iterrows`` loop; the row chosen for each object is the
    same as with the loop.

    Parameters
    ----------
    table : pandas.DataFrame
        Must provide the columns ``hostgal_specz``, ``hostgal_photoz`` and
        ``hostgal_photoz_err``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(table), n_classes)``; an empty table yields
        shape ``(0, n_classes)``.
    """
    specz = np.asarray(table['hostgal_specz'], dtype=float)
    photoz = np.asarray(table['hostgal_photoz'], dtype=float)
    photoz_err = np.asarray(table['hostgal_photoz_err'], dtype=float)

    # Comparisons against NaN evaluate to False, exactly as in a scalar check.
    is_bigz = (specz >= 1.2) | (
        np.isnan(specz) & (photoz >= 1.2) & (photoz_err <= 0.3)
    )
    is_galactic = photoz == 0

    # Row index into `lookup`: 0 = extragalactic (default), 1 = galactic, 2 = big-z.
    lookup = np.stack(
        [extragalactic_probabilities, galactic_probabilities, bigz_probabilities]
    )
    group = np.zeros(len(table), dtype=np.intp)
    group[is_galactic] = 1
    # Written last so that the big-z rule wins over the galactic rule.
    group[is_bigz] = 2

    return lookup[group]

# One probability row per object in the test metadata, in table order.
test_pred = do_prediction(test_meta_data)

In [None]:
# Submission table: one row per object_id, one 'class_<label>' column per
# entry of classes_all, filled with the predicted probabilities.
submission_columns = ['class_%d' % i for i in classes_all]
test_df = pd.DataFrame(
    test_pred,
    index=test_meta_data['object_id'],
    columns=submission_columns,
)
test_df.to_csv('./submission_eda.csv')