# Spectral Analysis Project

The aim of this project is to explore how information from galactic spectra can be extracted using machine learning and how it can determine important physical quantities.


In [68]:
# Importing libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

## EDA - Exploratory data analysis 

### The dataset 

Downolading the raw dataset I created and downloaded with the CasJobs interface as a csv file. 
The dataset is not filtered and cleaned, and so we proceed with the cleaning and the EDA.

In [69]:
# Converting the csv into a pandas dataframe 
raw_df = pd.read_csv('/Users/emmatosato/Documents/UNI/Erasmus/Data Mining and Machine Learning/Spectral-AnalysisProject/SDSS_dataset.csv')

### Dataset insights

In [70]:
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 704 entries, objID to img
dtypes: float64(557), int64(132), object(15)
memory usage: 53.7+ MB


In [71]:
raw_df.head(10)

Unnamed: 0,objID,skyVersion,run,rerun,camcol,field,obj,mode,nChild,type,...,elodieLogG,elodieFeH,elodieZ,elodieZErr,elodieZModelErr,elodieRChi2,elodieDOF,Column9,Column10,img
0,1237648720687595531,2,756,301,2,333,11,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472967166424,5067,System.Byte[]
1,1237648720687595593,2,756,301,2,333,73,1,0,6,...,4.0,-1.5,0.000523,1.8e-05,4e-06,0.797908,2165,10472974541586,5067,System.Byte[]
2,1237648720687595633,2,756,301,2,333,113,1,0,6,...,4.0,-1.5,-7e-05,5.9e-05,8e-06,0.97788,2152,10472972453787,5067,System.Byte[]
3,1237648720687923340,2,756,301,2,338,140,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10471425089211,5067,System.Byte[]
4,1237648720687595640,2,756,301,2,333,120,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472963931638,5067,System.Byte[]
5,1237648720687136830,2,756,301,2,326,62,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10473104907932,5067,System.Byte[]
6,1237648720687136927,2,756,301,2,326,159,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10473099457615,5067,System.Byte[]
7,1237648720687202335,2,756,301,2,327,31,1,0,6,...,4.0,-1.0,0.00015,8e-06,3e-06,0.702489,2159,10473099199694,5067,System.Byte[]
8,1237648720687661218,2,756,301,2,334,162,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472776162136,5067,System.Byte[]
9,1237648720687661278,2,756,301,2,334,222,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472787867763,5067,System.Byte[]


In [72]:
raw_df.tail(10)

Unnamed: 0,objID,skyVersion,run,rerun,camcol,field,obj,mode,nChild,type,...,elodieLogG,elodieFeH,elodieZ,elodieZErr,elodieZModelErr,elodieRChi2,elodieDOF,Column9,Column10,img
9990,1237648720687333542,2,756,301,2,329,166,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472837724084,5067,System.Byte[]
9991,1237648720687399024,2,756,301,2,330,112,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472832463013,5067,System.Byte[]
9992,1237648720687399095,2,756,301,2,330,183,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472945110334,5067,System.Byte[]
9993,1237648720687399214,2,756,301,2,330,302,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472952059910,5067,System.Byte[]
9994,1237648720687464613,2,756,301,2,331,165,1,0,6,...,4.2,-1.24,0.00025,5.3e-05,8e-06,0.86414,2153,10472952750223,5067,System.Byte[]
9995,1237648720687530082,2,756,301,2,332,98,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10473006941373,5067,System.Byte[]
9996,1237648720687530114,2,756,301,2,332,130,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472996121957,5067,System.Byte[]
9997,1237648720687530144,2,756,301,2,332,160,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10472964901368,5067,System.Byte[]
9998,1237648720687792309,2,756,301,2,336,181,1,0,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10474940996711,5067,System.Byte[]
9999,1237648720687792312,2,756,301,2,336,184,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0,10474955154908,5067,System.Byte[]


### Null values 

In [73]:
# Check the NaN values in the datset
v = raw_df.isnull().sum()

# Mask for finding the non zero values in the v vector crated (so the columns which contain NaN values)
non_zero_mask = [value != 0 for value in v]
v[non_zero_mask]

run1d              10000
subClass            5301
class_noqso        10000
subClass_noqso     10000
comments_person    10000
tFile                 11
elodieFileName      9140
elodieObject        9140
elodieSpType        9140
dtype: int64

I will drop the run1d, class_noqso, subClass_noqso and comments_person because all the rows are NaN values. I will delete also the subClass, elodieFileName, elodieObject and elodieSpType because the contain too many null values too.

The tFile attribute has only 11 NaN values, so i will directly drop the 11 rows that have this attribute null. Since i have a large dataset, this action should not influence the analysis.  

In [74]:
# Dropping the columns
df = raw_df.copy()
df.drop(columns=['run1d', 'subClass', 'class_noqso', 'subClass_noqso', 'comments_person', 'elodieFileName', 'elodieObject', 'elodieSpType'], inplace = True)

# Dropping the rows
#df = raw_df.dropna()

In [75]:
# Cheking if everthing worked well
v = df.isnull().sum()
non_zero_mask = [value != 0 for value in v]
v[non_zero_mask]

tFile    11
dtype: int64

### Duplicated and bad data 

The column "objID" is the Unique SDSS identifier composed from [skyVersion,rerun,run,camcol,field,obj]. We use this attribute in order to check if we have duplicated object in the dataset.

In [76]:
df[df.duplicated(subset='objID')]


Unnamed: 0,objID,skyVersion,run,rerun,camcol,field,obj,mode,nChild,type,...,elodieLogG,elodieFeH,elodieZ,elodieZErr,elodieZModelErr,elodieRChi2,elodieDOF,Column9,Column10,img


In [81]:
df[df['sciencePrimary'] > 0 ]

Unnamed: 0,objID,skyVersion,run,rerun,camcol,field,obj,mode,nChild,type,...,elodieLogG,elodieFeH,elodieZ,elodieZErr,elodieZModelErr,elodieRChi2,elodieDOF,Column9,Column10,img
0,1237648720687595531,2,756,301,2,333,11,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10472967166424,5067,System.Byte[]
1,1237648720687595593,2,756,301,2,333,73,1,0,6,...,4.0,-1.5,0.000523,0.000018,0.000004,0.797908,2165,10472974541586,5067,System.Byte[]
2,1237648720687595633,2,756,301,2,333,113,1,0,6,...,4.0,-1.5,-0.000070,0.000059,0.000008,0.977880,2152,10472972453787,5067,System.Byte[]
3,1237648720687923340,2,756,301,2,338,140,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10471425089211,5067,System.Byte[]
4,1237648720687595640,2,756,301,2,333,120,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10472963931638,5067,System.Byte[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1237648720687530082,2,756,301,2,332,98,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10473006941373,5067,System.Byte[]
9996,1237648720687530114,2,756,301,2,332,130,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10472996121957,5067,System.Byte[]
9997,1237648720687530144,2,756,301,2,332,160,1,0,3,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10472964901368,5067,System.Byte[]
9998,1237648720687792309,2,756,301,2,336,181,1,0,6,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0,10474940996711,5067,System.Byte[]
