 # **Progetto ICON**

# **Habitable exoplanets classification**



Descrizione del progetto

In [None]:
# Import libraries
import imp
import pandas as pd
import numpy as np


from sklearn.impute import SimpleImputer

from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE, SMOTENC

from sklearn.pipeline import Pipeline

# Machine Learning Algorithms
#import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, IsolationForest,
                              RandomForestRegressor, AdaBoostClassifier, VotingClassifier, ExtraTreesClassifier)
from sklearn.linear_model import Perceptron


# Performance metrics
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import accuracy_score, make_scorer, balanced_accuracy_score
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_curve, auc, confusion_matrix, roc_auc_score
from sklearn.metrics import precision_score, recall_score, precision_recall_curve,f1_score, fbeta_score

import seaborn as sns
import matplotlib.pyplot as plt

## **Load Data**

In [None]:
# Load the PHL exoplanet catalogue from the CSV stored one directory above.
planets = pd.read_csv(filepath_or_buffer="../PHL-EC.csv")
# Show the first five rows as a quick sanity check of the load.
planets.head(n=5)

## **Data Preparation**

Elimino le tipologie di pianeti per le quali abbiamo pochi esempi

In [None]:
# Number of planets per habitability class, most frequent first.
target_count = planets.loc[:, 'P. Habitable Class'].value_counts()
target_count

In [None]:
# Drop, in place, the rows of the two classes that have too few examples.
indexNames = planets.index[planets['P. Habitable Class'] == 'thermoplanet']
planets.drop(index=indexNames, inplace=True)

indexNames = planets.index[planets['P. Habitable Class'] == 'hypopsychroplanet']
planets.drop(index=indexNames, inplace=True)

In [None]:
# Recount the classes to confirm the rare ones are gone.
target_count = planets.loc[:, 'P. Habitable Class'].value_counts()
target_count

In [None]:
# Renumber the rows 0..n-1 after the drops; the old index is discarded.
planets.reset_index(drop=True, inplace=True)

Rimuovo alcune feature seguendo alcune tecniche di feature selection

In [None]:
# count              = number of null values in the given column (feature)
# len(frame)         = total number of rows (planets)
# count / len(frame) = fraction of rows in which that column is missing

def remove_missing(feauture, threshold=0.2, frame=None):
    """Tell whether a column has too many missing values to be kept.

    Args:
        feauture: Name of the column to inspect.
        threshold: Largest tolerated fraction of null rows; the column is
            flagged only when the fraction is strictly greater than this.
        frame: DataFrame to inspect. When omitted, the module-level
            ``planets`` DataFrame is used.

    Returns:
        1 if the fraction of null values in the column exceeds ``threshold``,
        otherwise 0. An empty frame yields 0 instead of dividing by zero.
    """
    if frame is None:
        frame = planets
    total = len(frame)
    if total == 0:
        # Nothing to measure: do not flag the column (and avoid 0/0).
        return 0
    missing = frame[feauture].isnull().sum()
    return 1 if missing / total > threshold else 0
    
# Collect every column flagged by remove_missing, then drop them all at once.
missing_values = [column for column in planets.columns if remove_missing(column)]
planets = planets.drop(columns=missing_values)

In [None]:
# Columns removed by hand before modelling.
cols_to_drop = [
    'S. Constellation',
    'S. Type',
    'P. Int ESI',
    'P. Surf ESI',
    'P. Disc. Method',
    'P. Disc. Year',
    'P. Hab Moon',
    'P. SFlux Min (EU)',
    'P. SFlux Max (EU)',
    'P. Teq Min (K)',
    'P. Teq Max (K)',
    'P. SFlux Mean (EU)',
    'S. Name',
]

planets = planets.drop(columns=cols_to_drop)

Voglio rimuovere altre feature anche in base alla loro correlazione con l'attributo habitable class, ma per calcolare la correlazione devo:

*    riempire il dataset con i dati mancanti
*    trasformare prima la feature habitable class (categorica) in una feature numerica




Riempio i valori mancanti tramite l'imputer



In [None]:
# Count the missing values left in each column (isna is the same check as isnull).
planets.isna().sum()

In [None]:
# Keep only the numeric columns: the mean strategy cannot be applied to text.
numeric_values = planets[planets._get_numeric_data().columns]


# Replace every NaN with the mean of its column ('mean' is also the default).
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
# fit_transform returns a bare array, so wrap it back into a DataFrame that
# carries the original column names.
numeric_values = pd.DataFrame(imputer.fit_transform(numeric_values), columns=numeric_values.columns)
# Persist the imputed values together with the row index. `index` is a boolean
# switch, so pass True explicitly (the previous truthy string did the same).
numeric_values.to_csv('Imputed Data.csv', index=True)

In [None]:
# Read back the imputed numeric columns written by the previous cell.
imputed_numerics = pd.read_csv(filepath_or_buffer='Imputed Data.csv')

In [None]:
# Overwrite each column of planets that also exists in the imputed table;
# columns without an imputed counterpart are left untouched.
for column in planets.columns:
    if column in imputed_numerics.columns:
        planets[column] = imputed_numerics[column]

In [None]:
# Names of the columns stored with the generic object dtype ("O").
catCols = [name for name, dtype in planets.dtypes.items() if dtype == "O"]

In [None]:
# Fill the gaps of the object-typed columns with each column's most frequent value.
simp = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
planets[catCols] = simp.fit_transform(planets[catCols])
# Remove any row that still contains a missing value after both imputations.
planets.dropna(axis=0, how='any', inplace=True)

In [None]:
# Check how many missing values remain per column after imputation.
planets.isna().sum()

Trasformo la feature categorica

# Convert the string values of the habitability-class column to integer codes.
label_encoder = preprocessing.LabelEncoder().fit(planets['P. Habitable Class'])

# Show the distinct classes learned by the encoder.
print(list(label_encoder.classes_))
print()

# Show the integer codes the column is about to be replaced with.
print(label_encoder.transform(planets['P. Habitable Class']))

# Replace the categorical column with its encoded version.
planets['P. Habitable Class'] = label_encoder.transform(planets['P. Habitable Class'])

# Spearman rank correlation between every pair of numeric features.
# The frame still holds text columns, and recent pandas releases raise on
# them instead of skipping them silently, so restrict to numeric/bool
# columns explicitly before correlating.
cor = planets.select_dtypes(include=['number', 'bool']).corr(method='spearman')
cor.head()

# Draw the correlation matrix as a heatmap, without per-cell numbers.
sns.heatmap(data=cor, annot=False)

# NOTE(review): threshold is assigned but not read in this cell; confirm
# whether a later cell uses it.
threshold = 0
# Correlation of every feature with the target, strongest positive first.
cor_h = cor['P. Habitable Class'].sort_values(ascending=False)
# The last 20 entries of that ranking (the lowest correlation values).
result = cor_h.iloc[-20:]
result

Possiamo notare come il dataset sia molto sbilanciato

In [None]:
# Rows per encoded habitability class, to show the class imbalance.
target_count = planets.loc[:, 'P. Habitable Class'].value_counts()
target_count

In [None]:
# Report how many planets fall in each class and the share of each class.
target_count = planets['P. Habitable Class'].value_counts()

# The column holds the integer codes produced by label_encoder, and the
# encoder numbers its classes in the order of label_encoder.classes_. Take
# the names from the encoder rather than assuming which code means what.
class_names = {code: name for code, name in enumerate(label_encoder.classes_)}
majority_code = target_count.idxmax()
total = target_count.sum()

for code, count in target_count.sort_index().items():
    print(f'{class_names.get(code, code)}: {count}')

for code, count in target_count.sort_index().items():
    role = 'Majority' if code == majority_code else 'Minority'
    print(f'Percentage of {role} Class ({class_names.get(code, code)}): {round(count / total, 4)*100}')

Divido il dataset 

X = Predictor features
y = target feature

In [None]:
# Work on an independent copy so the prepared DataFrame stays untouched.
planets_c = planets.copy(deep=True)
planets_c

In [None]:

# Predictors: every column of the working copy except the target.
X = planets_c.loc[:, planets_c.columns != 'P. Habitable Class']
# Target: selected by name rather than by a hard-coded column position, so
# the split stays correct if columns are added, dropped or reordered.
y = planets_c['P. Habitable Class']

# Positional indexing in the cross-validation loop needs a plain array.
X = X.values

Ora definiamo una pipeline per applicare l'oversampling con SMOTE ai soli dati di training, a ogni iterazione del processo di cross-validation

In [None]:
# SMOTE is a sampler (fit_resample), not a transformer, so it cannot be an
# intermediate step of sklearn's Pipeline. imbalanced-learn provides its own
# Pipeline that accepts samplers and applies them only while fitting.
from imblearn.pipeline import Pipeline as ImbPipeline

# Oversample first, then fit the classification model.
steps = [('over', SMOTE()), ('model', LogisticRegression())]
pipeline = ImbPipeline(steps=steps)

Ora usiamo la stratified k-fold cross-validation per dividere il nostro dataset in più fold

In [None]:
# Stratified 3-fold splitter, without shuffling.
skf = StratifiedKFold(n_splits=3, random_state=None, shuffle=False)

# NOTE: the four split variables are overwritten on every iteration, so only
# the last fold's split is still bound once the loop ends.
for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    # split() yields positional indices while y is a pandas Series: use .iloc
    # so the lookup is by position and not by index label.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    print("Train X :", X_train, "|", "Test X :", X_test)
    print("Train y :", y_train, "|", "Test y :", y_test)

effettuo oversampling

In [None]:
# Positions of the columns handed to SMOTENC as categorical features.
catIndex = list(range(6))
catIndex

In [None]:
# Desired number of samples per minority class after oversampling.
# The target column was label-encoded, so the keys must be the encoded
# integer codes rather than the original class-name strings.
# NOTE: the name `dict` shadows the builtin; it is kept only because the
# oversampling cell reads this variable by that name.
dict = {
    int(code): 2000
    for code in label_encoder.transform(['psychroplanet', 'mesoplanet'])
}

In [None]:
# Oversample the training split with SMOTENC, using the requested per-class
# counts and the listed categorical column positions.
smote = SMOTENC(categorical_features=catIndex, sampling_strategy=dict, random_state=11)
X_train, y_train = smote.fit_resample(X_train, y_train)



In [None]:
# Put the resampled target into a one-column DataFrame for counting.
Y = pd.DataFrame(data=y_train, columns=['P. Habitable Class'])

Il dataset non è più sbilanciato

In [None]:
# Report the class counts and shares of the resampled training target.
target_count = Y['P. Habitable Class'].value_counts()

# The target holds the integer codes produced by label_encoder, numbered in
# the order of label_encoder.classes_. Take the names from the encoder rather
# than assuming which code means what.
class_names = {code: name for code, name in enumerate(label_encoder.classes_)}
majority_code = target_count.idxmax()
total = target_count.sum()

for code, count in target_count.sort_index().items():
    print(f'{class_names.get(code, code)}: {count}')

for code, count in target_count.sort_index().items():
    role = 'Majority' if code == majority_code else 'Minority'
    print(f'Percentage of {role} Class ({class_names.get(code, code)}): {round(count / total, 4)*100}')

Prima di poter effettuare la classificazione devo trasformare le feature categoriche in numeriche:
trasformo la feature target tramite l'encoder e le altre feature tramite altri metodi (da definire: quali?)

In [None]:
# Fit a Gaussian Naive Bayes classifier on the training split and predict
# the labels of the held-out split.
model = GaussianNB()
model.fit(X_train, y_train)
# The method is `predict`; the misspelled name raised AttributeError.
y_pre = model.predict(X_test)