In [13]:
import pandas as pd
import math
import fastparquet
import sqlite3

In [14]:
# Create dataframe with test data
input_data = pd.read_parquet('../test_files/merged_cow_with_feed_daily_20230530_1557.parquet')
# Remove animals missing diet information
clean_data = input_data.dropna(subset=['sampleId'])

In [15]:
# Cleaning Data: Step 1
# Rename all columns so the same variable names are used throughout the model

def fix_column_names(df):
    df = df.rename(columns={
        'lactation_number': 'An_Parity_rl',
        'days_in_milk': 'An_LactDay',
        'MY': 'Trg_MilkProd',
        'BW_smooth': 'An_BW',
        'DMI': 'Dt_DMIn',
        'Fat %': 'Trg_MilkFatp',
        'Protein %': 'Trg_MilkTPp',
        'days_preg': 'An_GestDay',
        'Neutral Detergent Fibre (%)': 'Dt_NDFIn'
    })
    return df

clean_data = fix_column_names(clean_data)
print(clean_data.iloc[:, :6].head(2))


      cow_id  An_Parity_rl       date  An_LactDay  Trg_MilkProd  weight
1658    4921             1 2021-01-25       140.0         27.81   663.0
1659    4823             2 2021-01-25        72.0         31.97   734.0


In [24]:
# Cleaning Data: Step 2
# Move diet information to the database and replace with unique Diet_ID

def assign_diet_id(row):
    return f"{row['sampleId']}_{row['reportDate']}"

def check_diets(df):
    # Create dataframe with each unique diet
    unique_diets = df[['sampleId', 'reportDate']].drop_duplicates()
    unique_diet_list = unique_diets.index.values.tolist()

    # Index numbers in list are off because I deleted the top of the dataframe with missing feed data
    # If I had not then 'diet_data' could be used here instead of 'input_data'
    current_diets = input_data.iloc[unique_diet_list]

    # Removing all non diet data 
    columns_to_remove = ['lactation_number', 'days_in_milk', 'MY', 'weight', 'BW_smooth', 'BW_gain', 'asfed_intake', 'DMI', 'bcs_value', 'Birth Date', 'Test Day Date',
                        'Lact Start Date', 'Fat %', 'Protein %', 'SCC', 'Pregnancy Indicator', 'Days to Last Breeding', 'days_preg', 'conception_date', 'age_m', 'cow_id', 'date', 'DIM_bins_w']
    current_diets = current_diets.drop(columns=columns_to_remove)

    # Add Diet_ID to first column using assign_diet_id fucntion
    new_column = current_diets.apply(assign_diet_id, axis=1)
    current_diets.insert(0, 'Diet_ID', new_column)

    return current_diets

def save_to_database(df, index):
    # Save unique diets to database
    conn = sqlite3.connect('../diet_database.db')
    cursor = conn.cursor()

    df.set_index('index', inplace=True)
    df.to_sql('df', conn, if_exists='replace')

    conn.close()

current_diets = check_diets(clean_data)
print(current_diets.iloc[:, :6].head(4))

save_to_database(current_diets, Diet_ID)

                                                Diet_ID  \
1658          DAIRY - LACTATING TMR_2021-01-25 00:00:00   
2290            DAIRY-LACTATING TMR_2021-02-04 00:00:00   
4039           DARIRY LACTATING TMR_2021-03-05 00:00:00   
5807  DAIRY NON RUMENSIN LACTATING TMR_2021-04-08 00...   

                              sampleId reportDate location  \
1658             DAIRY - LACTATING TMR 2021-01-25  parlour   
2290               DAIRY-LACTATING TMR 2021-02-04  parlour   
4039              DARIRY LACTATING TMR 2021-03-05  parlour   
5807  DAIRY NON RUMENSIN LACTATING TMR 2021-04-08  parlour   

      Acid Detergent Fibre (%)  Ash (%)  
1658                     21.23     6.56  
2290                     21.12     6.48  
4039                     19.68     5.92  
5807                     23.00     6.43  


NameError: name 'Diet_ID' is not defined