In [12]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def load_and_clean_data(file_path, skiprows=30, scale=True):
    """Load a distance-catalogue CSV and return a cleaned Redshift/Distance frame.

    The file is read with ``pandas.read_csv``, its leading columns are
    renamed to a fixed list of 15 names, and only ``Redshift`` and
    ``Distance`` are kept. Both columns are coerced to float; rows that are
    missing, non-numeric, duplicated, or outside the accepted ranges are
    dropped. Optionally the two columns are standardized.

    Parameters
    ----------
    file_path : str or path-like
        Path of the CSV file to read.
    skiprows : int, default 30
        Number of leading lines to skip before the header row (forwarded to
        ``pandas.read_csv``).
    scale : bool, default True
        When True, both columns are standardized with a ``StandardScaler``
        fitted on all remaining rows. Pass False to get unscaled values, e.g.
        so a scaler can be fitted on the training split only.

    Returns
    -------
    pandas.DataFrame
        Frame with float columns ``Redshift`` and ``Distance``. The row index
        of the surviving rows is carried over from the file (not reset).

    Raises
    ------
    ValueError
        If the file has fewer columns than expected, or if ``scale`` is True
        and no rows survive cleaning.
    """
    data = pd.read_csv(file_path, skiprows=skiprows)

    expected_columns = ['Index', 'Object Name', 'Redshift', 'Method', 'Distance Modulus',
                        'Error', 'Reference Code', 'Notes', 'Luminosity', 'Measurement Type',
                        'Redshift Source', 'Distance', 'Distance Error', 'Measurement Note1', 'Measurement Note2']
    if len(data.columns) < len(expected_columns):
        raise ValueError("Unexpected number of columns in the dataset.")

    # Files may carry extra trailing columns; assigning 15 names to a wider
    # frame would fail, so keep only the leading columns that have a name.
    data = data.iloc[:, :len(expected_columns)]
    data.columns = expected_columns

    # Coerce to numeric first so that non-numeric cells become NaN and are
    # dropped together with genuinely missing values instead of crashing.
    filtered_data = (
        data[['Redshift', 'Distance']]
        .apply(pd.to_numeric, errors='coerce')
        .dropna()
        .astype(float)
        .drop_duplicates(subset=['Redshift', 'Distance'])
    )

    # Range filters: 0 < Redshift < 10 and Distance < 20000 (example threshold).
    # NOTE(review): there is no lower bound on Distance — confirm whether
    # non-positive distances can occur and should be excluded.
    in_range = (
        (filtered_data['Redshift'] > 0)
        & (filtered_data['Redshift'] < 10)
        & (filtered_data['Distance'] < 20000)
    )
    # Explicit copy so the assignment below writes to an independent frame.
    filtered_data = filtered_data.loc[in_range].copy()

    # NOTE(review): no unit conversion is applied; Distance is used as
    # provided (the downstream plot label presumes Mpc — confirm).

    if scale:
        if filtered_data.empty:
            raise ValueError("No rows remain after cleaning; nothing to scale.")
        scaler = StandardScaler()
        filtered_data[['Redshift', 'Distance']] = scaler.fit_transform(
            filtered_data[['Redshift', 'Distance']]
        )

    return filtered_data

# Location of the raw CSV. Set the NED_DATA_PATH environment variable to point
# at a local copy; the original hard-coded location is kept as the fallback so
# existing setups keep working.
file_path = os.environ.get(
    'NED_DATA_PATH',
    '/Users/shauryachaturvedi/Desktop/ML_Ass4/ML_ASTR8004/NED_data.csv',
)
cleaned_data = load_and_clean_data(file_path)

# Persist the cleaned table next to the notebook for later reuse.
cleaned_data.to_csv('cleaned_NED_data.csv', index=False)

# Preview the first few rows of the cleaned and filtered data
print(cleaned_data.head())

# Exploratory Data Analysis: scatter of the two cleaned columns on explicit axes
fig, ax = plt.subplots()
sns.scatterplot(x='Redshift', y='Distance', data=cleaned_data, ax=ax)
ax.set_title('Redshift vs. Distance')
ax.set_xlabel('Redshift (z) [Standardized]')
ax.set_ylabel('Distance (Mpc) [Standardized]')
plt.show()

# Feature matrix (2-D, one column) and target vector (1-D)
X = cleaned_data[['Redshift']].values
y = cleaned_data['Distance'].values

# NOTE(review): load_and_clean_data standardizes using every row, so the
# scaler statistics already include the rows that end up in the test split.
# Hold-out 20% of the rows; fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: the split must neither lose nor duplicate rows.
assert len(X_train) + len(X_test) == len(X)


   Redshift  Distance
0       7.0      70.0
1       7.0      70.0
2       7.0      70.0
3       7.0      70.0
4       7.0      70.0


In [14]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Seed Python, NumPy and the backend RNGs so that weight initialisation is
# reproducible across kernel restarts.
tf.keras.utils.set_random_seed(42)

# Define the model: a small fully connected regressor mapping one input
# feature (redshift) to one output value (distance).
model = Sequential([
    # Declare the input shape with an explicit Input object; passing
    # `input_shape` to the first Dense layer triggers a Keras UserWarning.
    tf.keras.Input(shape=(1,)),
    Dense(64, activation='relu'),  # First hidden layer
    Dense(64, activation='relu'),  # Second hidden layer
    Dense(1),  # Output layer: predicting 1 value (distance)
])

# Compile the model
model.compile(
    optimizer='adam',
    loss='mse',       # Mean Squared Error is commonly used for regression problems
    metrics=['mae'],  # Mean Absolute Error as an additional metric
)

# Model summary to check the structure
model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
