<a href="https://colab.research.google.com/github/Dansah2/Identifying_Celestial_Objects/blob/main/1_EDA_Sloan_Digital_Sky_Survey_DR18.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sloan Digital Sky Survey - DR18

This dataset consists of 100,000 observations from Data Release (DR) 18 of the Sloan Digital Sky Survey (SDSS). Each observation is described by 42 features and 1 class column that labels the observation as one of:

- a STAR
- a GALAXY
- a QSO (Quasi-Stellar Object), also known as a quasar

Kaggle Dataset Download API Command:

kaggle datasets download -d diraf0/sloan-digital-sky-survey-dr18

# Project Outline:
1) Download the dataset

2) Explore/Analyze the data

3) Preprocess and organize the data for ML training

4) Set appropriate weights

5) Create and Train model

## Download / Read the Dataset
1) Install required libraries

2) Import required libraries

3) Download / Read data from Kaggle

### Install required libraries

In [None]:
!pip install kaggle numpy plotly > /dev/null 2>&1

### Import required libraries

In [None]:
# handling data
import numpy as np
import pandas as pd

# graphing data
pd.options.plotting.backend = "plotly"
import plotly.graph_objects as go
import plotly.express as px

# downloading data
from google.colab import drive

# feature exploration
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LinearRegression

In [None]:
# Mount Google Drive at /content/drive so the Kaggle API token stored there
# (kaggle.json) can be copied into this Colab session in a later cell.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# make a directory for kaggle temporary instance location in Colab
! mkdir ~/.kaggle

In [None]:
# Copy the kaggle.json API token (uploaded beforehand to Google Drive under
# MyDrive/Kaggle_API/) into the ~/.kaggle directory created above.
!cp /content/drive/MyDrive/Kaggle_API/kaggle.json ~/.kaggle/kaggle.json

In [None]:
# Restrict the token file to owner read/write only (mode 600).
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the SDSS DR18 dataset archive from Kaggle into the working directory.
! kaggle datasets download -d diraf0/sloan-digital-sky-survey-dr18

Downloading sloan-digital-sky-survey-dr18.zip to /content
 75% 11.0M/14.7M [00:00<00:00, 19.6MB/s]
100% 14.7M/14.7M [00:00<00:00, 15.8MB/s]


In [None]:
! unzip sloan-digital-sky-survey-dr18.zip

Archive:  sloan-digital-sky-survey-dr18.zip
  inflating: SDSS_DR18.csv           


In [None]:
def read_function(csv_file):
    """Load a CSV into a pandas DataFrame.

    Args:
        csv_file: Path or file-like object, passed straight to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default settings.
    """
    loaded_frame = pd.read_csv(csv_file)
    return loaded_frame

raw_data = read_function('SDSS_DR18.csv')


## Explore/Analyze the Data
1) Obtain information about the dataset.

2) Visualize the data.

In [None]:
def show_type(data_frame):
  """Show a Plotly table listing every column of ``data_frame`` and its dtype.

  Args:
    data_frame: pandas DataFrame whose column dtypes are displayed.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  # dtypes are converted to strings before being placed in the table cells
  dtype_names = data_frame.dtypes.astype(str)

  dtype_table = go.Table(
    header={'values': ['Column Name', 'Data Type']},
    cells={'values': [dtype_names.index, dtype_names.values]},
  )

  fig = go.Figure(data=[dtype_table])
  fig.update_layout(title='Data Types of DataFrame Columns')
  fig.show()

show_type(raw_data)

In [None]:
def show_first_n_features(data_frame, n, max_rows=None):
  """Display the first ``n`` columns of ``data_frame`` as a Plotly table.

  Args:
    data_frame: pandas DataFrame to preview.
    n: Number of leading columns to include (positional slice ``[:, :n]``).
    max_rows: Optional cap on the number of rows rendered. ``None`` (the
      default) renders every row, which is the previous behaviour; pass a
      small number to keep the figure lightweight on large frames.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  # Select the first n features (columns)
  selected_columns = data_frame.iloc[:, :n]

  # Optionally keep only the leading rows so large frames are not fully rendered
  if max_rows is not None:
    selected_columns = selected_columns.head(max_rows)

  # Pull each column by position so duplicate column names still yield one
  # cell column per header entry.
  cell_values = [selected_columns.iloc[:, position]
                 for position in range(selected_columns.shape[1])]

  # Create a table using Plotly
  fig = go.Figure(data=[go.Table(
      header=dict(values=selected_columns.columns),
      cells=dict(values=cell_values))
  ])

  # Customize the table layout
  fig.update_layout(
      title=f'First {n} Features of DataFrame',
  )

  # Show the plot
  fig.show()

show_first_n_features(raw_data, 5)

In [None]:
# Function to display duplicates in a table
def show_duplicates(df):
  """Report duplicates found in ``df``.

  If no row is an exact duplicate of an earlier row, the count (0) is printed.
  Otherwise a Plotly table is shown that lists, for every column, each value
  occurring more than once in that column together with its occurrence count.
  Note that the table is per-column: it does not list the duplicated rows.

  Args:
    df: pandas DataFrame to inspect.

  Returns:
    None.
  """
  # Evaluate the full-row duplicate check once and reuse the result.
  duplicate_row_count = df.duplicated().sum()

  if duplicate_row_count == 0:
    print(f'Number of Duplicates:\n {duplicate_row_count}')
    return

  duplicates_data = []

  for column in df.columns:
    # keep=False flags every occurrence of a repeated value, so value_counts
    # reports the total number of times each repeated value appears.
    repeated_mask = df.duplicated(subset=column, keep=False)
    duplicated_counts = df.loc[repeated_mask, column].value_counts()

    duplicates_data.extend(
      [column, value, count] for value, count in duplicated_counts.items()
    )

  duplicates_df = pd.DataFrame(duplicates_data, columns=["Feature", "Duplicated Value", "Count"])

  # Create a table using Plotly
  fig = go.Figure(data=[go.Table(
    header=dict(values=duplicates_df.columns),
    cells=dict(values=[duplicates_df[col] for col in duplicates_df.columns]))
  ])

  # Customize the table layout
  fig.update_layout(
    title='Duplicate Values in DataFrame',
  )

  # Show the table
  fig.show()

show_duplicates(raw_data)

Number of Duplicates:
 0


In [None]:
def show_missing(data_frame, min=0, max=None):
  """Show a one-row heatmap of the number of missing values per column.

  Args:
    data_frame: pandas DataFrame to inspect.
    min: Lower bound of the colour scale (``zmin``). Defaults to 0.
    max: Upper bound of the colour scale (``zmax``). Defaults to the number
      of rows in ``data_frame``, the largest count a column can reach.

  Returns:
    None. The figure is rendered with ``fig.show()``.

  Note:
    The parameter names ``min`` and ``max`` shadow the builtins inside this
    function; they are kept unchanged so existing keyword callers still work.
  """
  # Count the missing values in each column
  missing_values = data_frame.isna().sum()

  colour_upper = len(data_frame) if max is None else max

  # Create a heatmap
  fig = go.Figure(data=go.Heatmap(
    z=[missing_values.values],  # One row: missing-value count per feature
    x=missing_values.index,     # Feature names as x-axis
    y=["Missing Values"],       # Label for y-axis
    colorscale='Turbo',
    zmin=min,                   # Lower bound of the colour scale
    zmax=colour_upper           # Upper bound of the colour scale
  ))

  # Add some styling
  fig.update_layout(
    title="Missing Data Heatmap",
    xaxis_title="Features",
    yaxis_title="",
    xaxis_showticklabels=True,
    yaxis_showticklabels=False,
  )

  # Show the plot
  fig.show()


show_missing(raw_data, 0, 100000)

In [None]:
def exp_graph_data(data_frame, target_col_name=None):
  """Print basic facts about ``data_frame`` and optionally plot class counts.

  Always prints the frame's shape and its column names. When
  ``target_col_name`` is given, also prints the value counts of that column
  and shows them as a Plotly bar chart.

  Args:
    data_frame: pandas DataFrame to summarise.
    target_col_name: Label of the class/target column, or ``None`` to skip
      the class-count summary.

  Returns:
    None.
  """
  print(f"Data shape: {data_frame.shape}\n")

  print(f'Column Names: {list(data_frame.columns)}\n')

  # Compare with None explicitly: a valid column label such as 0 or '' is
  # falsy and would otherwise be skipped silently.
  if target_col_name is None:
    return

  class_counts = data_frame[target_col_name].value_counts()

  print(f'Label Count:\n{class_counts}')

  fig = go.Figure(go.Bar(x=class_counts.index,
                         y=class_counts.values))

  fig.update_layout(xaxis_title_text='Classes',
                    yaxis_title_text='Count',
                    title_text='Count of Each Class')
  fig.show()

exp_graph_data(raw_data, 'class')

Data shape: (100000, 43)

Column Names: ['objid', 'specobjid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run', 'rerun', 'camcol', 'field', 'plate', 'mjd', 'fiberid', 'petroRad_u', 'petroRad_g', 'petroRad_i', 'petroRad_r', 'petroRad_z', 'petroFlux_u', 'petroFlux_g', 'petroFlux_i', 'petroFlux_r', 'petroFlux_z', 'petroR50_u', 'petroR50_g', 'petroR50_i', 'petroR50_r', 'petroR50_z', 'psfMag_u', 'psfMag_r', 'psfMag_g', 'psfMag_i', 'psfMag_z', 'expAB_u', 'expAB_g', 'expAB_r', 'expAB_i', 'expAB_z', 'redshift', 'class']

Label Count:
GALAXY    52343
STAR      37232
QSO       10425
Name: class, dtype: int64


There is a large data imbalance among classes. I will use class weighting to address this issue.

Data: [100000 rows, 43 columns]

In [None]:
# Feature Importance
def tree_classifer(data_frame, target, num_desired_features, random_state=42):
  """Fit an ExtraTreesClassifier and plot the highest feature importances.

  Every column except ``target`` is used as a feature. The raw importance
  array is printed, then the ``num_desired_features`` largest importances are
  shown as a horizontal Plotly bar chart.

  Args:
    data_frame: pandas DataFrame holding the features and the target column.
    target: Label of the target column.
    num_desired_features: Number of top-ranked features to plot.
    random_state: Seed passed to ``ExtraTreesClassifier`` so the ranking is
      reproducible across runs. Pass ``None`` for an unseeded fit.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  # create X and y variables
  y = data_frame[target]
  X = data_frame.drop(columns=target)

  # The ensemble is randomized, so an unseeded fit gives different
  # importances on every run; seed it for reproducible output.
  classifer = ExtraTreesClassifier(random_state=random_state)
  classifer.fit(X, y)
  print(classifer.feature_importances_)

  # plot graph of feature importances for better visualization
  feat_importances = pd.Series(classifer.feature_importances_, index=X.columns)
  feat_importances = feat_importances.nlargest(num_desired_features)

  fig = px.bar(feat_importances, orientation='h', labels={'index': 'Feature', 'value': 'Importance'},
                 title='Top Feature Importances')

  fig.show()

tree_classifer(raw_data, 'class', 15)

[0.         0.0716005  0.00212436 0.00211491 0.0117678  0.03401094
 0.02558056 0.02103085 0.01947165 0.00355749 0.         0.00121927
 0.00186963 0.06717241 0.08178828 0.00256891 0.01143741 0.02629387
 0.01317935 0.04767788 0.01770543 0.00592643 0.01116374 0.01305604
 0.0087476  0.00919395 0.00946205 0.02053279 0.01139758 0.01790227
 0.01053622 0.09242216 0.05269663 0.05840401 0.02231045 0.01434033
 0.02131344 0.01749043 0.01966498 0.01176021 0.01092835 0.09857885]


`redshift` appears to have the highest feature importance.