<a href="https://colab.research.google.com/github/A-Grossmann/Infant_Sleep_Analysis/blob/main/cleanning_baby_data_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import Data from the sleeping times and duration data

import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime as dt
from datetime import timedelta
from sklearn.cluster import MeanShift
import seaborn as sns
import matplotlib.cm as cm
from matplotlib.patches import FancyBboxPatch
import matplotlib as mpl
import datetime
import pytz


'''
  The first section combines, processes, and formats the data, and performs exploratory data analysis
and feature engineering: rolling feeding windows (24, 12, and 3 hr) and a discrete measurement variable that classifies each row.

  This cell can be run once to create the csv file of cleaned data used for the animation of the graph.

'''

# Name of the person whose exports are processed; it prefixes the per-person files.
name = "Margot"

# Input file names built from the person's name (the pump log is not name-prefixed).
sleep_csv = f"{name}_sleep.csv"
growth_csv = f"{name}_growth.csv"
expressed_csv = f"{name}_expressed.csv"
formula_csv = f"{name}_formula.csv"
diaper_csv = f"{name}_diaper.csv"
pump_csv = "pump.csv"

# Load every export into its own frame.
df_sleep = pd.read_csv(sleep_csv)
df_size = pd.read_csv(growth_csv)
df_expressed = pd.read_csv(expressed_csv)
df_formula = pd.read_csv(formula_csv)
df_diaper = pd.read_csv(diaper_csv)
df_pump = pd.read_csv(pump_csv)

# Parse each frame's 'Time' strings (e.g. "01/31/24, 9:05 PM") into datetimes.
all_frames = [df_sleep, df_size, df_expressed, df_formula, df_diaper, df_pump]
for frame in all_frames:
    frame['Time'] = pd.to_datetime(frame['Time'], format='%m/%d/%y, %I:%M %p')

# Stack all frames (union of their columns), then order the rows
# chronologically and give them a fresh 0..n-1 index.
csvs_combined = (
    pd.concat(all_frames, join='outer', ignore_index=True)
    .sort_values('Time')
    .reset_index(drop=True)
)

def fill_nan_with_last_value(df, col):
    """Forward-fill the NaN values of a single column.

    Each NaN in ``df[col]`` is replaced by the closest non-NaN value above it;
    NaNs that precede the first valid value are left untouched. Only ``col``
    is modified, and it is modified in place on ``df``.

    Returns the same DataFrame object that was passed in.
    """
    forward_filled = df[col].ffill()
    df[col] = forward_filled
    return df

# Carry the most recent weight forward onto every later row (modifies csvs_combined in place).
fill_nan_with_last_value(csvs_combined, 'Weight (lbs.)')


#Sum the total amount and fill it in for columns with a duration value
def sum_last_values_for_nan_fill(df, col_1, col_2, col_3):
    """Record, on each ``col_1`` event row, the ``col_2`` total since the previous event.

    A row is an "event" when its ``col_1`` value is non-zero. For every event
    row, ``col_3`` receives the sum of ``col_2`` over all rows from the
    previous event row up to (but not including) the current one; rows before
    the first event count towards the first event. Non-event rows get 0.

    Rows are processed in their current order, so the frame should already be
    sorted the way the totals are meant to accumulate.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place: NaNs in ``col_1`` and ``col_2`` are replaced by 0
        and ``col_3`` is added (or overwritten).
    col_1 : str
        Column marking events (non-zero value = event).
    col_2 : str
        Column whose values are accumulated between events.
    col_3 : str
        Name of the output column.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    df.fillna({col_2: 0}, inplace=True)
    df.fillna({col_1: 0}, inplace=True)

    is_event = df[col_1] != 0

    # Every event row opens a new accumulation window; rows before the first
    # event share window 0.
    window_id = is_event.cumsum()

    # Running total of col_2 inside each window, current row included, so the
    # last value before an event is not lost.
    running_total = df[col_2].groupby(window_id).cumsum()

    # The total "so far" as seen from the next row. The first row has no
    # predecessor and therefore starts at 0 instead of wrapping around to the
    # last row of the frame.
    carried = running_total.shift(1, fill_value=0)

    df[col_3] = carried.where(is_event, 0)
    return df

# Add 'Since_nap (ml)': on rows with a duration, the 'Amount (ml)' accumulated over the
# preceding rows without one. NaNs in both input columns become 0 as a side effect.
csvs_combined = sum_last_values_for_nan_fill(csvs_combined,'Duration(minutes)','Amount (ml)', 'Since_nap (ml)')

def datetime_index(df, time):
    """Return a copy of ``df`` indexed by its parsed ``time`` column.

    The ``time`` column of the passed-in frame is converted to datetimes in
    place; values that cannot be parsed become NaT. The returned frame is a new
    object with those NaT rows removed and ``time`` moved from the columns to
    the index.
    """
    # Unparseable values turn into NaT instead of raising.
    df[time] = pd.to_datetime(df[time], errors='coerce')

    # Discard rows without a valid timestamp, then promote the column to the index.
    return df.dropna(subset=[time]).set_index(time)


# Index the combined frame by its 'Time' column, dropping rows whose time could not be parsed.
csvs_combined = datetime_index(csvs_combined, "Time")

def rolling_sums(df, time, amount, weight, increments, rate):
    """Add rolling-sum, projected, and difference columns for each window length.

    For every value ``i`` in ``increments`` three columns are written to ``df``:

    * ``Rolling_{i}_hr_feed``       - sum of ``amount`` over a trailing ``"{i}h"`` window
    * ``{i}hr_projected_feed``      - ``weight * (i / 24) * rate``
    * ``plus_minus_{i}hr_rolling``  - rolling sum minus the projection

    The window is given as a time offset, so ``df`` must have a datetime-like
    index. ``time`` is accepted for interface compatibility but is not used.

    ``df`` is modified in place and also returned.
    """
    for hours in increments:
        rolled_col = f"Rolling_{hours}_hr_feed"
        projected_col = f"{hours}hr_projected_feed"
        difference_col = f"plus_minus_{hours}hr_rolling"

        # Trailing time-based window over the index.
        df[rolled_col] = df[amount].rolling(window=f"{hours}h").sum()

        # Scale the per-24h rate down to the window length.
        df[projected_col] = df[weight] * (hours / 24) * rate

        # Positive = above the projection, negative = below it.
        df[difference_col] = df[rolled_col] - df[projected_col]

    return df

# Rolling feed totals vs. weight-based projections for 24, 12 and 3 hour windows.
# rolling_sums expects (df, time, amount, weight, increments, rate): the feed volume
# column is the amount that gets summed, and the weight column drives the projection.
csvs_combined_new_par = rolling_sums(csvs_combined, "Time", 'Amount (ml)', 'Weight (lbs.)', [24, 12, 3], 72)

# Turn the index into a Time column
csvs_combined_new_par['Time'] = csvs_combined_new_par.index

# Replace the datetime index with a plain 0..n-1 index; reset_index returns a
# new frame, so the result has to be assigned to take effect.
csvs_combined_new_par = csvs_combined_new_par.reset_index(drop = True)


def wake_window(df, duration, time):
    """Add ``awake_window(minutes)``: minutes between consecutive duration rows.

    Rows whose ``duration`` value is non-zero receive the number of minutes
    elapsed since the ``time`` of the previous such row (NaN for the first
    one). Rows whose ``duration`` is 0 receive 0.

    NOTE: ``time`` must be a column of the DataFrame, not its index.
    NOTE(review): the value is the gap between the ``time`` stamps of two
    consecutive duration rows; the earlier row's duration is not subtracted.
    Confirm this is the intended definition of an awake window.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    duration : str
        Column holding the duration values.
    time : str
        Datetime column used for ordering and for the differences.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``time`` with the extra column.
    """
    df = df.reset_index(drop=True).sort_values(time)
    df['awake_window(minutes)'] = 0

    # Split off the zero-duration rows; they keep the default value of 0.
    zero_duration_rows = df[df[duration] == 0]

    # Explicit copy so the assignment below writes to an independent frame
    # rather than to a filtered slice of ``df`` (chained assignment).
    timed_rows = df[df[duration] != 0].copy()

    # Minutes elapsed since the previous duration row.
    timed_rows['awake_window(minutes)'] = timed_rows[time].diff().dt.total_seconds() / 60

    # Put both parts back together in chronological order.
    return pd.concat([timed_rows, zero_duration_rows]).sort_values(time)

#time of day evening afternoon morning

# Add 'awake_window(minutes)': minutes between consecutive rows that have a non-zero duration.
csvs_combined_new_par = wake_window(csvs_combined_new_par, 'Duration(minutes)', 'Time')


#make a time of day categorical variable where Morning 5 - 12, Afternoon 12 - 17, Evening 17 - 22, Night 22 - 5
def categorize_time(time):
    """Map a timestamp to a part-of-day label based on its hour.

    Hours 5-11 -> 'Morning', 12-16 -> 'Afternoon', 17-21 -> 'Evening',
    everything else -> 'Night'. ``time`` only needs an ``hour`` attribute.
    """
    hour = time.hour
    if 5 <= hour < 12:
        return 'Morning'
    if 12 <= hour < 17:
        return 'Afternoon'
    if 17 <= hour < 22:
        return 'Evening'
    return 'Night'

# Create the new categorical column: label every row's 'Time' with its part of the day
# (one categorize_time call per row).
csvs_combined_new_par['time_of_day'] = csvs_combined_new_par['Time'].apply(categorize_time)
#csvs_combined_new_par.head()

#Insert a measurement label for each row: sleep, feed, diaper, pump

def classify_measurment(df, var, col_1, col_2, col_3, col_4):
    """Label every row with the kind of measurement it holds.

    The label is written to the ``'measurment'`` column (spelling kept as-is)
    and is chosen by the first rule that matches:

    1. ``col_1`` is non-zero     -> ``var[0]``
    2. ``col_2`` is non-zero     -> ``var[1]``
    3. ``col_3`` is not missing  -> ``var[2]``
    4. ``col_4`` is not missing  -> ``var[3]``
    5. otherwise                 -> ``0``

    The column is always created (also for an empty frame) and always has
    object dtype, so the result does not depend on which label happens to be
    assigned first or on the index being unique.

    Parameters
    ----------
    df : pandas.DataFrame
        Modified in place.
    var : sequence
        The four labels, in the priority order listed above.
    col_1, col_2 : str
        Columns tested against 0.
    col_3, col_4 : str
        Columns tested for missing values.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df``.
    """
    # Start every row at the fallback value; object dtype lets the int 0 and
    # the string labels share one column.
    labels = pd.Series(0, index=df.index, dtype=object)

    # Apply the rules from lowest to highest priority so that a higher-priority
    # match overwrites a lower-priority one.
    rules = [
        (df[col_4].notna(), var[3]),
        (df[col_3].notna(), var[2]),
        (df[col_2] != 0, var[1]),
        (df[col_1] != 0, var[0]),
    ]
    for matches, label in rules:
        # Positional boolean mask: no index alignment involved.
        labels[matches.to_numpy()] = label

    df['measurment'] = labels.to_numpy()
    return df

# Measurement labels, in the priority order used by classify_measurment.
var = ['sleep', 'feed', 'diaper', 'pump']

csvs_combined_new_par = classify_measurment(
    csvs_combined_new_par,
    var,
    'Duration(minutes)',
    'Amount (ml)',
    'Status',
    'Total amount (ml)',
)

csvs_combined_new_par.head()

# Keep only the columns needed downstream, in presentation order.
ordered_columns = [
    'Time',
    'measurment',
    'Duration(minutes)',
    'awake_window(minutes)',
    'time_of_day',
    'Weight (lbs.)',
    'Since_nap (ml)',
    'Rolling_24_hr_feed',
    '24hr_projected_feed',
    'plus_minus_24hr_rolling',
    'Rolling_12_hr_feed',
    '12hr_projected_feed',
    'plus_minus_12hr_rolling',
    'Rolling_3_hr_feed',
    '3hr_projected_feed',
    'plus_minus_3hr_rolling',
    'Amount (ml)',
    'Status',
    'Total amount (ml)',
    'Total Duration (min)',
]
csvs_combined_new_par = csvs_combined_new_par[ordered_columns]

# Write the cleaned data next to the inputs, without the row index.
file_path = f"{name}_cleaned.csv"
csvs_combined_new_par.to_csv(file_path, index=False)