In [4]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, precision_score,recall_score, f1_score, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore")


In [13]:
df = pd.read_csv('sdss_100k_galaxy_form_burst.csv', header =1)
df.head()

Unnamed: 0,objid,specobjid,ra,dec,u,g,r,i,z,modelFlux_u,...,psfMag_z,expAB_u,expAB_g,expAB_r,expAB_i,expAB_z,class,subclass,redshift,redshift_err
0,1237646587710669400,8175185722644649984,82.038679,0.847177,21.73818,20.26633,19.32409,18.64037,18.23833,2.007378,...,19.43575,0.099951,0.311864,0.28937,0.270588,0.187182,GALAXY,STARFORMING,0.067749,1.5e-05
1,1237646588247540577,8175186822156277760,82.138894,1.063072,20.66761,19.32016,18.67888,18.24693,18.04122,5.403369,...,18.85012,0.366549,0.516876,0.517447,0.552297,0.636966,GALAXY,STARFORMING,0.105118,1e-05
2,1237646588247540758,8175187097034184704,82.02851,1.104003,23.63531,21.19671,19.92297,19.31443,18.68396,0.295693,...,19.42235,0.05,0.417137,0.50695,0.549881,0.370166,GALAXY,STARFORMING,0.234089,3e-05
3,1237648702973083853,332152325571373056,198.544469,-1.097059,20.12374,18.4152,17.47202,17.05297,16.72423,8.920645,...,18.03204,0.310763,0.356827,0.389345,0.38816,0.41666,GALAXY,STARFORMING,0.110825,3e-05
4,1237648702973149350,332154249716721664,198.706864,-1.046217,-9999.0,-9999.0,18.37762,18.13383,17.78497,0.0,...,19.0288,-9999.0,-9999.0,0.05,0.05,0.149973,GALAXY,STARFORMING,0.136658,2.1e-05


In [16]:
#checking shape of dataset
df.shape

(100000, 43)

In [18]:
#info about dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 43 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   objid         100000 non-null  int64  
 1   specobjid     100000 non-null  uint64 
 2   ra            100000 non-null  float64
 3   dec           100000 non-null  float64
 4   u             100000 non-null  float64
 5   g             100000 non-null  float64
 6   r             100000 non-null  float64
 7   i             100000 non-null  float64
 8   z             100000 non-null  float64
 9   modelFlux_u   100000 non-null  float64
 10  modelFlux_g   100000 non-null  float64
 11  modelFlux_r   100000 non-null  float64
 12  modelFlux_i   100000 non-null  float64
 13  modelFlux_z   100000 non-null  float64
 14  petroRad_u    100000 non-null  float64
 15  petroRad_g    100000 non-null  float64
 16  petroRad_i    100000 non-null  float64
 17  petroRad_r    100000 non-null  float64
 18  petro

In [20]:
#checking for null values
df.isnull().sum()

objid           0
specobjid       0
ra              0
dec             0
u               0
g               0
r               0
i               0
z               0
modelFlux_u     0
modelFlux_g     0
modelFlux_r     0
modelFlux_i     0
modelFlux_z     0
petroRad_u      0
petroRad_g      0
petroRad_i      0
petroRad_r      0
petroRad_z      0
petroFlux_u     0
petroFlux_g     0
petroFlux_i     0
petroFlux_r     0
petroFlux_z     0
petroR50_u      0
petroR50_g      0
petroR50_i      0
petroR50_r      0
petroR50_z      0
psfMag_u        0
psfMag_r        0
psfMag_g        0
psfMag_i        0
psfMag_z        0
expAB_u         0
expAB_g         0
expAB_r         0
expAB_i         0
expAB_z         0
class           0
subclass        0
redshift        0
redshift_err    0
dtype: int64

In [21]:
#oridinal encoding - replace subclass with a 0/1 for classification
df['subclass'].replace(['STARFORMING','STARBURST'],[0,1],inplace=True)

In [22]:
#statistical information about dataset - descriptive statistical
df.describe()

Unnamed: 0,objid,specobjid,ra,dec,u,g,r,i,z,modelFlux_u,...,psfMag_i,psfMag_z,expAB_u,expAB_g,expAB_r,expAB_i,expAB_z,subclass,redshift,redshift_err
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,...,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,1.237659e+18,2.303595e+18,180.577802,23.472475,18.518622,17.258221,16.821739,16.362611,15.850865,30.683321,...,18.020203,17.435735,-0.603667,-0.522111,-0.309462,-0.410153,-0.740964,0.25007,0.116753,0.000179
std,6103756000000.0,2.531359e+18,75.751994,21.140744,105.082004,105.069066,95.035474,100.171155,114.206165,76.552859,...,100.181687,114.218604,104.870665,104.871474,94.860919,99.991654,114.005927,0.433055,0.100169,0.052189
min,1.237646e+18,2.994897e+17,0.008745,-11.244273,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-47.45172,...,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.0,-0.000833,2e-06
25%,1.237655e+18,8.130687e+17,138.74188,3.120118,18.762215,17.505868,16.898845,16.527097,16.281327,9.288132,...,18.295627,17.991602,0.299999,0.398705,0.418789,0.418656,0.381288,0.0,0.055836,8e-06
50%,1.237659e+18,1.457564e+18,181.492972,20.913596,19.349715,18.07264,17.45908,17.091385,16.861105,18.19569,...,18.84578,18.563315,0.508688,0.588335,0.604795,0.604254,0.575397,0.0,0.08585,1.1e-05
75%,1.237663e+18,2.367902e+18,223.851863,42.259965,20.07947,18.656182,17.926918,17.59265,17.453848,31.259628,...,19.586577,19.29943,0.699907,0.768804,0.773924,0.773119,0.752311,1.0,0.135148,1.5e-05
max,1.237681e+18,1.412691e+19,359.997922,68.695258,30.96,30.42098,31.17356,30.56236,28.55324,7915.306,...,25.96668,27.04328,1.0,1.0,0.999999,1.0,0.999998,1.0,0.572899,16.50371
