 # **Progetto ICON**

# **Habitable exoplanets classification**



Descrizione del progetto

In [44]:
# Import libraries
import imp
import pandas as pd
import numpy as np


from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, SMOTENC

from sklearn.pipeline import Pipeline

# Machine Learning Algorithms
#import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, IsolationForest,
                              RandomForestRegressor, AdaBoostClassifier, VotingClassifier, ExtraTreesClassifier)
from sklearn.linear_model import Perceptron
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Performance metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, make_scorer, balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score

import seaborn as sns
import matplotlib.pyplot as plt

## **Load Data**

In [45]:
planets = pd.read_csv("../PHL-EC.csv")
planets.head()

Unnamed: 0,P. Name,P. Name Kepler,P. Name KOI,P. Zone Class,P. Mass Class,P. Composition Class,P. Atmosphere Class,P. Habitable Class,P. Min Mass (EU),P. Mass (EU),...,P. Int ESI,P. Surf ESI,P. ESI,S. HabCat,P. Habitable,P. Hab Moon,P. Confirmed,P. Disc. Method,P. Disc. Year,Unnamed: 68
0,1RXS 1609 b,,,Cold,Jovian,gas,hydrogen-rich,non-habitable,,4451.16,...,0,0,0.05,0,0,0,1,Imaging,2008,
1,1SWASP J1407 b,,,Cold,Jovian,gas,hydrogen-rich,non-habitable,6358.8,6358.8,...,0,0,0.07,0,0,0,1,Primary Transit,2012,
2,2M 0103-55(AB) b,,,Cold,Jovian,gas,hydrogen-rich,non-habitable,4133.22,4133.22,...,0,0,0.06,0,0,0,1,Imaging,2013,
3,2M 0122-24 b,,,Cold,Jovian,gas,hydrogen-rich,non-habitable,,6358.8,...,0,0,0.08,0,0,0,1,Imaging,2013,
4,2M 0219-39 b,,,Cold,Jovian,gas,hydrogen-rich,non-habitable,,4419.37,...,0,0,0.06,0,0,0,1,Imaging,2015,


## **Data Preparation**

Elimino le tipologie di pianeti per le quali abbiamo pochi esempi

In [46]:
target_count = planets['P. Habitable Class'].value_counts()
target_count

non-habitable        3820
mesoplanet             31
psychroplanet          18
thermoplanet            3
hypopsychroplanet       3
Name: P. Habitable Class, dtype: int64

In [47]:
indexNames = planets[planets['P. Habitable Class'] == 'thermoplanet'].index
planets.drop(indexNames, inplace= True)


indexNames = planets[planets['P. Habitable Class'] == 'hypopsychroplanet'].index
planets.drop(indexNames, inplace= True)

In [48]:
target_count = planets['P. Habitable Class'].value_counts()
target_count

non-habitable    3820
mesoplanet         31
psychroplanet      18
Name: P. Habitable Class, dtype: int64

In [49]:
planets.reset_index(inplace = True, drop= True)

Rimuovo alcune feauture segiendo alcune tecniche di feauture selection

In [50]:
#count = valori nulli in una colonna specifica(in una feature)
#lenplanet(planet) = numero di righe totali(tot pianeti)
#count/len(planet) = ci dice in percentuale quanti valori ci saranno in quella colonna

def remove_missing(feauture):
    count = len(planets[planets[feauture].isnull()])
    if count/len(planets) > 0.2:
        return 1
    else:
        return 0
    
missing_values = [x for x in planets.columns if remove_missing(x)]
planets = planets.drop(missing_values, axis=1)

In [51]:
cols_to_drop = ['S. Constellation', 'S. Type', 'P. Int ESI', 
                'P. Surf ESI', 'P. Disc. Method', 'P. Disc. Year','P. Hab Moon', 'P. SFlux Min (EU)', 'P. SFlux Max (EU)',
                'P. Teq Min (K)','P. Teq Max (K)','P. SFlux Mean (EU)','S. Name']

planets = planets.drop(cols_to_drop, axis=1)

Voglio rimuovere anche in base alla correlazione con l'attributo habitability class ma per usare la correlazione devo: 

*    riempire il dataset con i dati mancati
*    traformare prima la feature habitable class (categorica) in feauture numerica




riempio tramite l'inputer



In [52]:
planets.isnull().sum()

P. Name                        0
P. Zone Class                 46
P. Mass Class                  6
P. Composition Class          41
P. Atmosphere Class           85
P. Habitable Class             0
P. Mass (EU)                  33
P. Radius (EU)                12
P. Density (EU)               41
P. Gravity (EU)               41
P. Esc Vel (EU)               41
P. Teq Mean (K)               46
P. Surf Press (EU)            41
P. Mag                        56
P. Appar Size (deg)           12
P. Period (days)             150
P. Sem Major Axis (AU)        35
P. Eccentricity                0
P. Mean Distance (AU)         35
P. Omega (deg)                 0
S. Mass (SU)                  47
S. Radius (SU)               112
S. Teff (K)                   99
S. Luminosity (SU)            28
S. RA (hrs)                    0
S. DEC (deg)                   0
S. Mag from Planet            46
S. Size from Planet (deg)    128
S. No. Planets                 0
S. No. Planets HZ              0
S. Hab Zon

In [53]:
numeric_values = planets[planets._get_numeric_data().columns]


imputer = SimpleImputer(missing_values = np.NaN, strategy = 'mean') #di default strategy ='mean'
numeric_values = pd.DataFrame(imputer.fit_transform(numeric_values), columns=numeric_values.columns)
numeric_values.to_csv('Imputed Data.csv', index='rowid')

In [54]:
imputed_numerics = pd.read_csv('Imputed Data.csv')

In [55]:
for i in planets.columns:
    if i not in imputed_numerics.columns:
        pass
    else:
        planets[i] = imputed_numerics[i]

In [56]:
catCols = [col for col in planets.columns if planets[col].dtype=="O"]

In [57]:
simp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
planets[catCols] = simp.fit_transform(planets[catCols])
planets.dropna(how='any', axis=0, inplace=True)

In [58]:
planets.isnull().sum()

P. Name                      0
P. Zone Class                0
P. Mass Class                0
P. Composition Class         0
P. Atmosphere Class          0
P. Habitable Class           0
P. Mass (EU)                 0
P. Radius (EU)               0
P. Density (EU)              0
P. Gravity (EU)              0
P. Esc Vel (EU)              0
P. Teq Mean (K)              0
P. Surf Press (EU)           0
P. Mag                       0
P. Appar Size (deg)          0
P. Period (days)             0
P. Sem Major Axis (AU)       0
P. Eccentricity              0
P. Mean Distance (AU)        0
P. Omega (deg)               0
S. Mass (SU)                 0
S. Radius (SU)               0
S. Teff (K)                  0
S. Luminosity (SU)           0
S. RA (hrs)                  0
S. DEC (deg)                 0
S. Mag from Planet           0
S. Size from Planet (deg)    0
S. No. Planets               0
S. No. Planets HZ            0
S. Hab Zone Min (AU)         0
S. Hab Zone Max (AU)         0
P. HZD  

trasformo la feauture categorica

#Convert string values of origin column to numerical values
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(planets['P. Habitable Class'])

# finding the unique classes
print(list(label_encoder.classes_))
print()

# values after transforming the categorical column.
print(label_encoder.transform(planets['P. Habitable Class']))

planets['P. Habitable Class'] = label_encoder.transform(planets['P. Habitable Class'])

cor = planets.corr('spearman')
cor.head()

sns.heatmap(cor, annot = False)

threshold = 0
cor_h = cor['P. Habitable Class'].sort_values(ascending=False)
result = cor_h.tail(20)
result

Possiamo notare come il dataset sia molto sblianciato

In [59]:
target_count = planets['P. Habitable Class'].value_counts()
target_count

non-habitable    3820
mesoplanet         31
psychroplanet      18
Name: P. Habitable Class, dtype: int64

In [60]:
target_count = planets['P. Habitable Class'].value_counts()
print(f'non-habitable: {target_count[0]}')
print(f'mesoplanet: {target_count[1]}')
print(f'psychroplanet : {target_count[2]}')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 4)*100}')
print(f'Percentage of Minority Class: {round(target_count[1] / sum(target_count), 4)*100}')
print(f'Percentage of Minority Class: {round(target_count[2] / sum(target_count), 4)*100}')

non-habitable: 3820
mesoplanet: 31
psychroplanet : 18
Percentage of Majority Class: 98.72999999999999
Percentage of Minority Class: 0.8
Percentage of Minority Class: 0.47000000000000003


Divido il dataset 

X = Predictor features
y = target feature

prima di poter effettuare la classificazione devo trasformare le feature categoriche in numeriche;
trasformo la feature target tramite l'encoder e quelle normali tramite altri metodi, quindi posso usare dopo SMOTE normale

In [61]:
planets_c = planets.copy()
planets_c

Unnamed: 0,P. Name,P. Zone Class,P. Mass Class,P. Composition Class,P. Atmosphere Class,P. Habitable Class,P. Mass (EU),P. Radius (EU),P. Density (EU),P. Gravity (EU),...,S. Hab Zone Min (AU),S. Hab Zone Max (AU),P. HZD,P. HZC,P. HZA,P. HZI,P. ESI,S. HabCat,P. Habitable,P. Confirmed
0,1RXS 1609 b,Cold,Jovian,gas,hydrogen-rich,non-habitable,4451.16,19.04,0.64,12.28,...,0.540,1.362,800.07,23.51,85.62,0.00,0.05,0.0,0.0,1.0
1,1SWASP J1407 b,Cold,Jovian,gas,hydrogen-rich,non-habitable,6358.80,10.94,4.86,53.12,...,0.461,1.143,9.07,15.30,45.41,0.02,0.07,0.0,0.0,1.0
2,2M 0103-55(AB) b,Cold,Jovian,gas,hydrogen-rich,non-habitable,4133.22,11.40,2.79,31.79,...,0.136,0.347,793.67,12.57,107.44,0.00,0.06,0.0,0.0,1.0
3,2M 0122-24 b,Cold,Jovian,gas,hydrogen-rich,non-habitable,6358.80,11.20,4.53,50.69,...,0.136,0.347,490.45,15.72,119.46,0.00,0.08,0.0,0.0,1.0
4,2M 0219-39 b,Cold,Jovian,gas,hydrogen-rich,non-habitable,4419.37,16.13,1.05,16.99,...,0.062,0.165,3028.82,19.46,133.25,0.00,0.06,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3864,YBP1194 b,Hot,Jovian,gas,metals-rich,non-habitable,108.10,7.97,0.21,1.70,...,0.743,1.751,-2.33,5.02,0.57,0.15,0.16,0.0,0.0,1.0
3865,YBP1514 b,Hot,Jovian,gas,metals-rich,non-habitable,127.18,8.34,0.22,1.83,...,0.658,1.552,-2.34,5.23,0.62,0.15,0.15,0.0,0.0,1.0
3866,YZ Cet b,Hot,Terran,rocky-iron,metals-rich,non-habitable,0.76,0.96,0.86,0.83,...,0.039,0.102,-1.72,-0.17,-0.91,0.34,0.43,0.0,0.0,1.0
3867,YZ Cet c,Hot,Terran,rocky-iron,metals-rich,non-habitable,0.99,1.04,0.88,0.91,...,0.039,0.102,-1.56,-0.17,-0.77,0.36,0.53,0.0,0.0,1.0


Faccio ENCODING delle colonne CATEGORICHE

In [101]:
planet_encoder = OneHotEncoder()

name_reshaped = np.array(planets['P. Name']).reshape(-1, 1)
name_values = planet_encoder.fit_transform(name_reshaped)

zone_reshaped = np.array(planets['P. Zone Class']).reshape(-1, 1)
zone_values = planet_encoder.fit_transform(zone_reshaped)

mass_reshaped = np.array(planets['P. Mass Class']).reshape(-1, 1)
mass_values = planet_encoder.fit_transform(mass_reshaped)

comp_reshaped = np.array(planets['P. Composition Class']).reshape(-1, 1)
comp_values = planet_encoder.fit_transform(comp_reshaped)

atmo_reshaped = np.array(planets['P. Atmosphere Class']).reshape(-1, 1)
atmo_values = planet_encoder.fit_transform(atmo_reshaped)

hab_reshaped = np.array(planets['P. Habitable Class']).reshape(-1, 1)
hab_values = planet_encoder.fit_transform(hab_reshaped)

#PRINT TEST
print(planets['P. Habitable Class'][:5])
print()
print(hab_values.toarray()[:5])
print()
print(planet_encoder.inverse_transform(hab_values)[:5])

#costruisco DataFrame temporaneo
p_name = pd.DataFrame(name_values, columns=['P. Name'])
p_zone = pd.DataFrame(zone_values, columns=['P. Zone Class'])
p_mass = pd.DataFrame(mass_values, columns=['P. Mass Class'])
p_comp = pd.DataFrame(comp_values, columns=['P. Composition Class'])
p_atmo = pd.DataFrame(atmo_values, columns=['P. Atmosphere Class'])
p_hab = pd.DataFrame(hab_values, columns=['P. Habitable Class'])

planets_categorical_encoded = pd.concat([p_name, p_zone, p_mass, p_comp, p_atmo, p_hab], axis=1)
print(planets_categorical_encoded.shape)
planets_categorical_encoded.head()

0    non-habitable
1    non-habitable
2    non-habitable
3    non-habitable
4    non-habitable
Name: P. Habitable Class, dtype: object

[[0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]

[['non-habitable']
 ['non-habitable']
 ['non-habitable']
 ['non-habitable']
 ['non-habitable']]
(3869, 6)


Unnamed: 0,P. Name,P. Zone Class,P. Mass Class,P. Composition Class,P. Atmosphere Class,P. Habitable Class
0,"(0, 7)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 1)\t1.0"
1,"(0, 8)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 1)\t1.0"
2,"(0, 12)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 1)\t1.0"
3,"(0, 13)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 1)\t1.0"
4,"(0, 14)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 0)\t1.0","(0, 1)\t1.0"


In [None]:

X = planets_c.loc[ :, planets.columns != 'P. Habitable Class']
y = planets_c.iloc[:, 5]

X = X.values

Ora definiamo una pipeline per settare l'oversampling usando SMOTE sui dati di training , ad ogni cross-validation evaluation process

In [None]:
steps = [('over', SMOTE()), ('model', LogisticRegression())] #mettiamo il modello di classificazione
pipeline = Pipeline(steps=steps)

Ora usiamo stratified k-fold cross-validation per dividere il nostro dat in più folds

In [None]:
skf = StratifiedKFold(n_splits=3,random_state=None,shuffle=False)

for train_index,test_index in skf.split(X,y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    
    print("Train X :", X_train, "|", "Test X :", X_test)
    print("Train y :", y_train, "|", "Test y :", y_test)

effettuo oversampling

In [None]:
catIndex=[0, 1, 2, 3, 4, 5]
catIndex

In [None]:
dict = {'psychroplanet': 2000,
    'mesoplanet': 2000}

In [None]:
#facciamo l'oversampling
smote = SMOTENC(random_state = 11, sampling_strategy = dict, categorical_features = catIndex)
X_train, y_train = smote.fit_resample(X_train, y_train)



In [None]:
Y = pd.DataFrame(y_train, columns = ['P. Habitable Class'])

Il dataset non è più sbilanciato

In [None]:
target_count = Y['P. Habitable Class'].value_counts()
print(f'non-habitable: {target_count[0]}')
print(f'mesoplanet: {target_count[1]}')
print(f'psychroplanet : {target_count[2]}')
print(f'Percentage of Majority Class: {round(target_count[0] / sum(target_count), 4)*100}')
print(f'Percentage of Minority Class: {round(target_count[1] / sum(target_count), 4)*100}')
print(f'Percentage of Minority Class: {round(target_count[2] / sum(target_count), 4)*100}')

Vedi questo https://analyticsindiamag.com/7-types-classification-algorithms/ per scegliere i modelli

In [72]:
planets.dtypes
planets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3869 entries, 0 to 3868
Data columns (total 40 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   P. Name                    3869 non-null   object 
 1   P. Zone Class              3869 non-null   object 
 2   P. Mass Class              3869 non-null   object 
 3   P. Composition Class       3869 non-null   object 
 4   P. Atmosphere Class        3869 non-null   object 
 5   P. Habitable Class         3869 non-null   object 
 6   P. Mass (EU)               3869 non-null   float64
 7   P. Radius (EU)             3869 non-null   float64
 8   P. Density (EU)            3869 non-null   float64
 9   P. Gravity (EU)            3869 non-null   float64
 10  P. Esc Vel (EU)            3869 non-null   float64
 11  P. Teq Mean (K)            3869 non-null   float64
 12  P. Surf Press (EU)         3869 non-null   float64
 13  P. Mag                     3869 non-null   float

In [None]:
model = GaussianNB()
model.fit(X_train, y_train)
y_pre =model.predic(X_test)