In [1]:
import pandas as pd
import math
import fastparquet
import sqlite3

In [2]:
# Load the merged cow/feed test dataset from its parquet file
input_data = pd.read_parquet(
    '../test_files/merged_cow_with_feed_daily_20230530_1557.parquet'
)
# Keep only the rows that carry a feed sample id (i.e. have diet information)
has_diet_info = input_data['sampleId'].notna()
input_data_dropna = input_data.loc[has_diet_info]

In [7]:
# Cleaning Data: Step 1
# Move diet information to the database and replace with unique Diet_ID

def check_diets(df):
    """Build a table with one row per unique diet, tagged with a ``Diet_ID``.

    A diet is identified by its (``sampleId``, ``reportDate``) pair. For each
    pair the first matching row of ``df`` is kept, the non-diet columns are
    dropped, and a ``Diet_ID`` of the form ``<sampleId>_<YYYY-MM-DD>`` is added.

    Parameters
    ----------
    df : pandas.DataFrame
        Records containing ``sampleId`` (string), ``reportDate`` (datetime)
        and every column named in ``columns_to_remove`` below.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` is not modified) holding the remaining columns
        plus ``Diet_ID``; the rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``sampleId``, ``reportDate`` or any column to remove is missing.
    """
    # Take the rows straight from `df`. Looking the index labels up
    # positionally in the global `input_data` only works when that frame
    # happens to have a default RangeIndex, and silently ignores `df`.
    unique_diets = df.drop_duplicates(subset=['sampleId', 'reportDate'])

    # Removing all non diet data
    columns_to_remove = [
        'lactation_number', 'days_in_milk', 'MY', 'weight', 'BW_smooth',
        'BW_gain', 'asfed_intake', 'DMI', 'bcs_value', 'Birth Date',
        'Test Day Date', 'Lact Start Date', 'Fat %', 'Protein %', 'SCC',
        'Pregnancy Indicator', 'Days to Last Breeding', 'days_preg',
        'conception_date', 'age_m', 'cow_id', 'date', 'DIM_bins_w',
    ]
    current_diets = unique_diets.drop(columns=columns_to_remove)

    # Add Diet_ID: sample id and report date joined by an underscore
    current_diets = current_diets.assign(
        Diet_ID=lambda diets: (
            diets['sampleId'] + '_' + diets['reportDate'].dt.strftime('%Y-%m-%d')
        )
    )

    return current_diets


def save_to_database(df, table_name, index, db_path='../diet_database.db'):
    """Write ``df`` to a table in a SQLite database, replacing any existing table.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to store. It is left unmodified; an indexed copy is written.
    table_name : str
        Name of the destination table. An existing table with this name is
        replaced.
    index : str or list of str
        Column(s) of ``df`` to use as the index; they are written as the
        table's index column(s).
    db_path : str, optional
        Path of the SQLite database file. Defaults to the notebook's
        ``'../diet_database.db'``.

    Raises
    ------
    KeyError
        If ``index`` names a column that is not in ``df``.
    """
    # Build the indexed copy first, so a bad `index` fails before a
    # connection is opened and the caller's frame is never mutated.
    indexed = df.set_index(index)

    conn = sqlite3.connect(db_path)
    try:
        indexed.to_sql(table_name, conn, if_exists='replace')
    finally:
        # Always release the connection, even if the write fails
        conn.close()


# Extract the unique diets and persist them, keyed by Diet_ID
current_diets = check_diets(input_data_dropna)
save_to_database(current_diets, table_name='current_diets', index='Diet_ID')

In [7]:
# Cleaning Data: Step 2 
# Rename columns, add default values, remove unnecessary columns, add Diet_ID

def clean_input_data(df, db_path='../diet_database.db'):
    """Prepare the animal-level records used for the requirement calculations.

    Steps, in order: rename the raw columns, add constant default columns,
    drop the diet composition columns, attach a ``Diet_ID``
    (``<sampleId>_<YYYY-MM-DD>``), look up each diet's
    ``Neutral Detergent Fibre (%)`` in the ``current_diets`` table of the
    SQLite database, compute ``Dt_NDFIn`` and drop the columns that are no
    longer needed.

    Parameters
    ----------
    df : pandas.DataFrame
        Merged cow/feed records. Must contain ``age_m``, ``DMI``,
        ``sampleId``, a datetime ``reportDate`` and every column listed in
        ``diet_columns`` and ``columns_to_drop`` below. It is not modified.
    db_path : str, optional
        Path of the SQLite database holding the ``current_diets`` table.
        Defaults to the notebook's ``'../diet_database.db'``.

    Returns
    -------
    pandas.DataFrame
        The cleaned records with a fresh 0..n-1 index (a consequence of the
        merge). Rows whose ``Diet_ID`` has no match in ``current_diets`` are
        dropped, because the merge is an inner join.

    Raises
    ------
    KeyError
        If a required column is missing from ``df``.
    """
    # Rename existing columns
    renamed = df.rename(columns={
        'lactation_number': 'An_Parity_rl',
        'days_in_milk': 'An_LactDay',
        'MY': 'Trg_MilkProd',
        'BW_smooth': 'An_BW',
        'DMI': 'Dt_DMIn',
        'Fat %': 'Trg_MilkFatp',
        'Protein %': 'Trg_MilkTPp',
        'days_preg': 'An_GestDay',
    })

    # Add default values (same constant for every animal).
    # NOTE(review): units are not stated in this notebook - confirm against
    # the ME/MP requirement functions that consume these columns.
    with_defaults = renamed.assign(
        An_BW_mature=700,
        Trg_FrmGain=0,
        An_GestLength=280,
        Fet_BWbrth=44.1,
        Trg_MilkLacp=4.85,
        Trg_RsrvGain=0,
        # Age in months -> days; 30.436875 equals 365.2425 / 12
        An_AgeDay=lambda animals: animals['age_m'] * 30.436875,
    )

    # Remove diet information (it lives in the database instead)
    diet_columns = [
        'location', 'Acid Detergent Fibre (%)', 'Ash (%)', 'Calcium (%)',
        'Copper (ug/g)', 'Crude Fat (%)', 'Crude Protein (%)',
        'Dry Matter (%)', 'Iron (ug/g)', 'Magnesium (%)', 'Manganese (ug/g)',
        'Moisture (%)', 'NE Gain (MCal/Kg)', 'NE Lactation (MCal/Kg)',
        'NE Maintenance (MCal/Kg)', 'NFC (%)', 'Neutral Detergent Fibre (%)',
        'Phosphorus (%)', 'Potassium (%)', 'Sodium (%)', 'Starch (%)',
        'Sulphur (%)', 'Total Digestible Nutrients (%)', 'Zinc (ug/g)',
        'DIM_bins_w',
    ]
    animals_only = with_defaults.drop(columns=diet_columns)

    # Assign Diet_ID: sample id and report date joined by an underscore
    animals_only = animals_only.assign(
        Diet_ID=lambda animals: (
            animals['sampleId'] + '_' + animals['reportDate'].dt.strftime('%Y-%m-%d')
        )
    )

    # Fetch NDF (%) per diet from the database; close the connection even
    # if the query fails.
    query = "SELECT Diet_ID, `Neutral Detergent Fibre (%)` FROM current_diets"
    conn = sqlite3.connect(db_path)
    try:
        df_NDF = pd.read_sql_query(query, conn)
    finally:
        conn.close()

    # Attach each animal's diet NDF (inner join on Diet_ID)
    clean_data = pd.merge(animals_only, df_NDF, on='Diet_ID', how='inner')

    # NDF intake = NDF fraction of the diet * dry matter intake
    clean_data = clean_data.assign(
        Dt_NDFIn=lambda merged: (
            merged['Neutral Detergent Fibre (%)'] / 100 * merged['Dt_DMIn']
        )
    )

    # Drop unneeded columns
    columns_to_drop = [
        'date', 'weight', 'asfed_intake', 'bcs_value', 'Birth Date',
        'Test Day Date', 'Lact Start Date', 'SCC', 'Pregnancy Indicator',
        'Days to Last Breeding', 'conception_date', 'age_m', 'sampleId',
        'reportDate',
    ]
    clean_data = clean_data.drop(columns=columns_to_drop)

    return clean_data


clean_data = clean_input_data(input_data_dropna)
# Preview: first two rows, first six columns
print(clean_data.iloc[:2, :6])

   cow_id  An_Parity_rl  An_LactDay  Trg_MilkProd    An_BW  BW_gain
0    4921             1       140.0         27.81  668.875    1.550
1    4823             2        72.0         31.97  729.982    0.773


In [8]:
from ME_functions import *
from MP_functions import *

In [9]:
# Calculate Requirements for ME and MP

# Row-wise: each function receives one animal's record (a Series)
clean_data['ME Requirement'] = clean_data.apply(execute_ME_requirement, axis='columns')
clean_data['MP Requirement'] = clean_data.apply(execute_MP_requirement, axis='columns')


In [10]:
# Check for cows missing data

def check_na_requirement(df):
    """Return the rows of ``df`` that lack an ME or MP requirement.

    A row is selected when its ``'ME Requirement'`` or ``'MP Requirement'``
    value (or both) is missing. The returned frame keeps all columns and the
    original index labels.
    """
    requirement_cols = ['ME Requirement', 'MP Requirement']
    # True for every row with at least one missing requirement value
    has_missing = df[requirement_cols].isnull().any(axis='columns')
    return df.loc[has_missing]

# Rows of clean_data where the ME or MP requirement is missing (NaN)
cows_missing_data = check_na_requirement(clean_data)