In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")


In [2]:
# Read the three CSV inputs in one pass; the order of the paths matches the
# order of the names on the left-hand side.
original_df, train_df, test_df = map(
    pd.read_csv,
    ("./data/horse.csv", "./data/train.csv", "./data/test.csv"),
)

In [3]:
# Report the (rows, columns) shape of each loaded frame, one line per dataset.
print(
    *(
        f"{label} data shape : {frame.shape}"
        for label, frame in (
            ("original", original_df),
            ("train", train_df),
            ("test", test_df),
        )
    ),
    sep="\n",
)

original data shape : (299, 28)
train data shape : (1235, 29)
test data shape : (824, 28)


In [4]:
def summary_df(dataset, df):
    """
        Build a per-column overview of a dataframe.

        Args:
            dataset ( str ): Name of the dataset, used as the prefix of every output column
            df ( dataframe ): dataframe to summarise

        Returns:
            summary ( dataframe ): one row per column of df, holding its dtype, No of missing values, % of missing values relative to the number of rows in df (rounded to 1 decimal), No of unique values and No of non null rows
    """
    # Count the missing entries once and derive the percentage from that count.
    missing_counts = df.isna().sum()
    missing_share = (missing_counts / len(df) * 100).round(1)

    summary = df.dtypes.to_frame(name=f'{dataset}_dtypes')
    summary[f'{dataset}_missing#'] = missing_counts
    summary[f'{dataset}_missing%'] = missing_share
    # These two are assigned as plain arrays, i.e. by position rather than by label.
    summary[f'{dataset}_uniques'] = df.nunique().to_numpy()
    summary[f'{dataset}_count'] = df.count().to_numpy()
    return summary

# Build one plain (unstyled) summary per dataset.
train_summary = summary_df("train", train_df)
test_summary = summary_df("test", test_df)
origin_summary = summary_df("origin", original_df)

# Put the three summaries side by side, keep a single dtype column (the train
# one, renamed to "dtypes") and remove the row describing the "id" column.
combined_summary = (
    pd.concat([train_summary, test_summary, origin_summary], axis=1)
    .drop(columns=["test_dtypes", "origin_dtypes"])
    .rename(columns={"train_dtypes": "dtypes"})
    .drop(index="id")
)
combined_summary

Unnamed: 0,dtypes,train_missing#,train_missing%,train_uniques,train_count,test_missing#,test_missing%,test_uniques,test_count,origin_missing#,origin_missing%,origin_uniques,origin_count
surgery,object,0,0.0,2,1235,0.0,0.0,2.0,824.0,0.0,0.0,2.0,299.0
age,object,0,0.0,2,1235,0.0,0.0,2.0,824.0,0.0,0.0,2.0,299.0
hospital_number,int64,0,0.0,255,1235,0.0,0.0,210.0,824.0,0.0,0.0,283.0,299.0
rectal_temp,float64,0,0.0,43,1235,0.0,0.0,34.0,824.0,60.0,20.1,40.0,239.0
pulse,float64,0,0.0,50,1235,0.0,0.0,49.0,824.0,24.0,8.0,52.0,275.0
respiratory_rate,float64,0,0.0,37,1235,0.0,0.0,38.0,824.0,58.0,19.4,40.0,241.0
temp_of_extremities,object,39,3.2,4,1196,35.0,4.2,4.0,789.0,56.0,18.7,4.0,243.0
peripheral_pulse,object,60,4.9,4,1175,47.0,5.7,4.0,777.0,69.0,23.1,4.0,230.0
mucous_membrane,object,21,1.7,6,1214,13.0,1.6,6.0,811.0,47.0,15.7,6.0,252.0
capillary_refill_time,object,6,0.5,3,1229,6.0,0.7,3.0,818.0,32.0,10.7,3.0,267.0


In [13]:

# One line per dataset showing the percentage of missing values for each
# row of the combined summary.
fig = px.line(
    combined_summary,
    x=combined_summary.index,
    y=['origin_missing%', 'test_missing%', 'train_missing%'],
    labels={'x': 'Index', 'value': '% Missing'},
    title='% Missing values in datasets',
)

# Cosmetics: hide both grids, rotate the x tick labels, centre the title and
# lay the legend out horizontally near the top-left of the plot.
(
    fig
    .update_xaxes(showgrid=False, tickangle=90)
    .update_yaxes(showgrid=False)
    .update_layout(
        title_x=0.5,
        legend={'x': 0.02, 'y': 0.9, 'orientation': 'h'},
    )
)

fig.show()

* The train and test datasets are similar in terms of missing data
* The original dataset seems to have 10-15% missing values