<a href="https://colab.research.google.com/github/ChrysKoum/NASA-Space-Apps-Challenge-2023/blob/main/Programs/Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

def clean_and_impute(input_file_path, output_file_path):
    """Replace fill values with NaN, interpolate the gaps and save the result.

    Reads a header-less, whitespace-delimited text file with ten columns,
    labels them, turns the known placeholder values into NaN, fills the
    gaps by linear interpolation and writes a tab-separated file with a
    header row and floats formatted to one decimal place.

    Parameters
    ----------
    input_file_path : str or path-like
        Whitespace-delimited input file without a header row.
    output_file_path : str or path-like
        Destination of the tab-separated, cleaned file.

    Raises
    ------
    ValueError
        If the input does not have exactly ten columns (raised by pandas
        when the column labels are assigned).
    """
    column_names = ['Year', 'Day', 'Hour', 'Minute', 'Bx', 'By', 'Bz',
                    'Flow_Speed', 'Proton_Density', 'Temperature']

    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(input_file_path, sep=r'\s+', header=None)
    df.columns = column_names

    # Values that stand for "no measurement". They are matched in every
    # column, not only the one they belong to.
    # NOTE(review): 1000.0 is also a plausible real value for some columns
    # (e.g. Flow_Speed) - confirm it is really a fill value.
    placeholders = [9999.99, 99999.9, 9999999.0, 1000.0]
    df = df.replace(placeholders, np.nan)

    # limit_direction='both' also fills gaps at the very start of the file.
    # With 'forward' those stayed NaN and were written as empty fields, which
    # shifts columns when the output is re-read with a whitespace delimiter.
    df = df.interpolate(method='linear', limit_direction='both')

    # Save the imputed data
    df.to_csv(output_file_path, sep='\t', index=False, float_format='%.1f')

    print(f"Cleaned and imputed data has been saved to: {output_file_path}")

# Provide your file paths here.
# The input is the raw whitespace-delimited file; the output written by this
# call has the same name the next cell uses as its input.
input_file_path = "wind_def_2022.txt"
output_file_path = "wind_def_2022_clean.txt"
clean_and_impute(input_file_path, output_file_path)


Cleaned and imputed data has been saved to: wind_def_2022_clean_month_plus.txt


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

def day_of_year_to_month_day(day, year):
    """Translate a 1-based day-of-year into a ``(month, day_of_month)`` pair.

    The date is obtained by stepping ``day - 1`` days forward from
    1 January of ``year``.
    """
    year_start = datetime(year, 1, 1)
    shift = pd.DateOffset(days=day - 1)
    target = year_start + shift
    return (target.month, target.day)

def clean_and_impute(input_file_path, output_file_path):
    """Replace the day-of-year column with calendar month and day columns.

    Reads a whitespace-delimited file that has a header row containing at
    least ``Year`` and ``Day`` (1-based day of year), appends a ``Month``
    and a ``day`` (day of month) column, removes ``Day`` and writes the
    result as a tab-separated file with floats formatted to one decimal.

    The calendar date is computed from each row's own ``Year``, so files
    that span several years (including leap years) are converted correctly.

    Note: no imputation happens here despite the name. This definition
    reuses the name of the cleaning function from the earlier cell and
    replaces it once this cell has run.

    Parameters
    ----------
    input_file_path : str or path-like
        Whitespace-delimited input file with a header row.
    output_file_path : str or path-like
        Destination of the tab-separated output file.

    Raises
    ------
    KeyError
        If the input has no ``Year`` or ``Day`` column.
    """
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(input_file_path, sep=r'\s+')

    # 1 January of every row's year, then step forward (Day - 1) days.
    year_start = pd.to_datetime(df['Year'].astype(int).astype(str), format='%Y')
    dates = year_start + pd.to_timedelta(df['Day'] - 1, unit='D')

    # The new columns are appended at the end, in this order.
    df['Month'] = dates.dt.month
    df['day'] = dates.dt.day

    # The day-of-year column is now redundant.
    df = df.drop(columns=['Day'])

    # Save the converted data
    df.to_csv(output_file_path, sep='\t', index=False, float_format='%.1f')

    print(f"Cleaned and imputed data has been saved to: {output_file_path}")

# Provide your file paths here.
# The input has the same name as the file written by the first cell. The call
# resolves to the clean_and_impute defined in this cell, which replaces the
# earlier definition of the same name.
input_file_path = "wind_def_2022_clean.txt"
output_file_path = "wind_def_2022_clean_month.txt"
clean_and_impute(input_file_path, output_file_path)


Cleaned and imputed data has been saved to: wind_def_2022_clean_month.txt


In [None]:
import pandas as pd
import numpy as np

def normilize(input_file_path, output_file_path):
    """Zero-pad the date/time columns and round numeric values to one decimal.

    Reads a whitespace-delimited file with a header row, formats the
    ``day``, ``Month``, ``Hour`` and ``Minute`` columns as strings padded
    with a leading zero to a width of two (``5`` becomes ``05``), rounds the
    remaining numeric columns to one decimal place and writes the result as
    a tab-separated file.

    Parameters
    ----------
    input_file_path : str or path-like
        Whitespace-delimited input file with a header row.
    output_file_path : str or path-like
        Destination of the tab-separated output file.

    Raises
    ------
    KeyError
        If any of the four date/time columns is missing from the input.
    """
    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True.
    df = pd.read_csv(input_file_path, sep=r'\s+')

    # Format the date/time columns with leading zeros. After this step the
    # columns hold strings, so the rounding below leaves them untouched.
    for column in ('day', 'Month', 'Hour', 'Minute'):
        df[column] = df[column].map('{:02}'.format)

    # Round the values to one decimal place
    df = df.round(1)

    # Save the formatted data
    df.to_csv(output_file_path, sep='\t', index=False, float_format='%.1f')

    print(f"Cleaned and imputed data has been saved to: {output_file_path}")

# Provide your file paths here.
# NOTE(review): the previous cell writes "wind_def_2022_clean_month.txt",
# while this cell reads a 2019 file - confirm which year is intended before
# running the notebook top to bottom.
input_file_path = "wind_def_2019_clean_month.txt"
output_file_path = "wind_def_2019_clean_month_sanitize.txt"
normilize(input_file_path, output_file_path)

Cleaned and imputed data has been saved to: ace_min_b2016_clean_month_sanitize.txt
