Feature preperation for the models including train/val/test split, missing value imputation and normalization

In [26]:
import pandas as pd
import numpy as np
panda_set_compleet = pd.read_csv("Data/dataset_with_0.csv")

#drop day 0 (contains many missing values)
panda_set_compleet = panda_set_compleet[panda_set_compleet.TransitionDaysInMilk !=0]

#shuffle cows
np.random.seed(89)
grouped = panda_set_compleet.groupby(['AnimalEartag', 'PaperRecordedCalvingDate'])
a=np.arange(grouped.ngroups)
np.random.shuffle(a)
panda_set_shuffeld = panda_set_compleet[grouped.ngroup().isin(a[:])]


We cannot use random way of splitting dataset into train and test as the sequence of events is important for time series.
#So let us take first 60% values = 365 cows of 609 for train and the remaining 40% for testing and validation 122 cows each 
# split into train and test sets

In [27]:
#define set size
train_size = 365*21
test_size = 122*21+365*21
validation_size = 122*21
#split train, val and test
train_set, test_set, validation_set = panda_set_shuffeld.iloc[0:train_size,:], panda_set_shuffeld.iloc[train_size:test_size,:], panda_set_shuffeld.iloc[test_size:len(panda_set_shuffeld),:]

Normalization

In [None]:
columns_to_select = ["TransitionDaysInMilk", "WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay"]

#define normalisation function
def norm_to_zero_one(df):
     return (df - df.min()) * 1.0 / (df.max() - df.min())

  
transdays = train_set["TransitionDaysInMilk"]
train_set.loc[:, columns_to_select]  = train_set[columns_to_select].apply(norm_to_zero_one)

train_set.loc[:,"TransitionDaysInMilk"] = transdays

transdays_test = test_set["TransitionDaysInMilk"]
test_set.loc[:, columns_to_select] = test_set[columns_to_select].apply(norm_to_zero_one)
test_set.loc[:,"TransitionDaysInMilk"] = transdays_test

transdays_validatie = validation_set["TransitionDaysInMilk"]
validation_set.loc[:,columns_to_select] = validation_set[columns_to_select].apply(norm_to_zero_one)
validation_set.loc[:,"TransitionDaysInMilk"] = transdays_validatie


Missing value imputation, imputing the mean value for each behaviour for each day before calving

In [28]:
from sklearn.base import BaseEstimator, TransformerMixin
feature_names =["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay"]

class WithinGroupMeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_var):
        self.group_var = group_var
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        # the copy leaves the original dataframe intact
        X_ = X.copy()
        for col in X_.columns:
            if X_[col].dtypes == 'float64':
                X_.loc[(X[col].isna()) & X_[self.group_var].notna(), col] = X_[self.group_var].map(X_.groupby(self.group_var)[col].mean())
                X_[col] = X_[col].fillna(X_[col].mean())
        return X_
      
imp = WithinGroupMeanImputer(group_var='TransitionDaysInMilk')

imputed_train_set = imp.fit(train_set[feature_names])

imputed_train_set = imp.transform(train_set[["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay", 'TransitionDaysInMilk']])
Imputed_train_set = imputed_train_set.join(train_set[['AnimalEartag', 'PaperRecordedCalvingDate', 'Parity', 'CalciumDaysInMilk', 'Cut_Off', 'CalvingSeason', 'Calciumcluster', 'FirstLocomotionScore', 'FirstBCSScore']])


In [29]:
import sklearn
#save data frames

imputed_validation_set = imp.transform(validation_set[["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay", 'TransitionDaysInMilk']])
Imputed_validation_set = imputed_validation_set.join(validation_set[['AnimalEartag', 'PaperRecordedCalvingDate', 'Parity', 'CalciumDaysInMilk', 'Cut_Off', 'CalvingSeason', 'Calciumcluster', 'FirstLocomotionScore', 'FirstBCSScore']])
imputed_test_set = imp.transform(test_set[["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay", 'TransitionDaysInMilk']])
Imputed_test_set = imputed_test_set.join(test_set[['AnimalEartag', 'PaperRecordedCalvingDate', 'Parity', 'CalciumDaysInMilk', 'Cut_Off', 'CalvingSeason', 'Calciumcluster', 'FirstLocomotionScore', 'FirstBCSScore']])

Imputed_validation_set.to_csv("Data/Imputed_validation_set.csv")
Imputed_test_set.to_csv("Data/Imputed_test_set.csv")
Imputed_train_set.to_csv("Data/Imputed_train_set.csv")

Upsampling of the train set 

In [30]:
unique_train = train_set[['AnimalEartag', 'PaperRecordedCalvingDate', 'Calciumcluster', 'CalciumValue']].groupby(['AnimalEartag', 'PaperRecordedCalvingDate']).first()
count_class_0, count_class_1 = unique_train['Calciumcluster'].value_counts()
#devide per class
cluster_0 = unique_train[unique_train.Calciumcluster==0]
cluster_1 = unique_train[unique_train.Calciumcluster==1]
#upsampling
cluster_1_over = cluster_1.sample(count_class_0, replace=True)
upsampled_set = pd.concat([cluster_1_over, cluster_0], axis=0)
#random shuffeling to avoid bias due to chronological order
upsampled_set = upsampled_set.sample(frac=1).reset_index(drop=False)
samplenumber = [*range(0,534)]
upsampled_set['SampleNumber'] = samplenumber
#re-adding features 
Upsampled_train_set = pd.merge(upsampled_set, panda_set_compleet, on = ['AnimalEartag', 'PaperRecordedCalvingDate', 'Calciumcluster', 'CalciumValue'], how = 'left')

# Imputed_train_set.to_csv("Data/Imputed_train_set.csv")

ValueError: Length of values (532) does not match length of index (534)

In [None]:
#normalisation 
transdays = Upsampled_train_set["TransitionDaysInMilk"]
Upsampled_train_set.loc[:,columns_to_select] = Upsampled_train_set[columns_to_select].apply(norm_to_zero_one)
Upsampled_train_set["TransitionDaysInMilk"] = transdays

Upsampled_train_set.to_csv("Data/Upsampled_train_set.csv")

In [None]:
#missing value imputation
from sklearn.base import BaseEstimator, TransformerMixin
feature_names =["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay"]
class WithinGroupMeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self, group_var):
        self.group_var = group_var
    
    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        # the copy leaves the original dataframe intact
        X_ = X.copy()
        for col in X_.columns:
            if X_[col].dtypes == 'float64':
                X_.loc[(X[col].isna()) & X_[self.group_var].notna(), col] = X_[self.group_var].map(X_.groupby(self.group_var)[col].mean())
                X_[col] = X_[col].fillna(X_[col].mean())
        return X_
 

imp = WithinGroupMeanImputer(group_var='TransitionDaysInMilk')
parity =Upsampled_train_set['Parity']
imputed_train_set_up = imp.fit(Upsampled_train_set[feature_names])

imputed_train_set_up = imp.transform(Upsampled_train_set[["WalkingTimeMinutesPerDay", "EatingBoutLengthMinutesPerBout", "EatingInterBoutLengthMinutes", "EatingNumberOfBoutsPerDay", "EatingTimeMinutesPerDay", "InactiveBoutLengthMinutesPerDay", "InactiveBoutsPerDay", "InactiveInterboutLengthMinutesPerDay", "InactiveTimeMinutesPerDay", "LegActivityStepsPerDay", "LyingBoutLengthMinutesPerDay","LyingBoutsPerDay", "LyingTimeMinutesPerDay", "RuminationBoutLengthMinutesPerBout", "RuminationInterBoutLengthMinutes", "RuminationNumberOfBoutsPerDay","RuminationTimeMinutesPerDay", "StandingTimeMinutesPerDay", "StandupsPerDay", 'TransitionDaysInMilk']])
imputed_train_set_up = imputed_train_set_up.join(Upsampled_train_set[['AnimalEartag', 'PaperRecordedCalvingDate', 'SampleNumber', 'CalciumDaysInMilk', 'Cut_Off', 'CalvingSeason', 'Calciumcluster', 'FirstLocomotionScore', 'FirstBCSScore']])
imputed_train_set_up['Parity'] = parity

In [None]:
#save upsampled train set 
imputed_train_set_up.to_csv("Data/imputed_train_set_up.csv")