In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tabulate import tabulate

In [5]:
# Read the baby-boom data set, then replace the header names from the file
# with descriptive identifiers (one name per column, in file order).
df = pd.read_csv("Datasets/babyboom.csv")
df.columns = [
    "TimeOfBirth",
    "Sex",
    "WeightInGrams",
    "MinutesAfterMidnight",
]

In [None]:
# Build histograms for every variable.
# DataFrame.hist draws one subplot per column on a single figure.
df.hist(bins=50, figsize=(12, 8))
# Finish the cell the same way the boxplot cell does: tidy the spacing and
# render explicitly, so the cell output is the figure rather than the repr of
# the returned array of Axes.
plt.tight_layout()
plt.show()

In [None]:
# Build a horizontal boxplot for every variable, one subplot row per column.
columns = df.columns
# squeeze=False makes plt.subplots always return a 2-D array of Axes; without
# it a frame with a single column yields a bare Axes object and indexing fails.
fig, axes = plt.subplots(
    nrows=len(columns), ncols=1, figsize=(10, 8), squeeze=False
)
# Only one subplot column exists, so flatten back to a 1-D array of Axes.
axes = axes[:, 0]

for ax, column in zip(axes, columns):
    ax.boxplot(
        # boxplot does not skip missing values itself; drop them so a column
        # containing NaN still produces a meaningful box.
        df[column].dropna(), vert=False, patch_artist=True,
        boxprops=dict(facecolor="lightblue", color="blue"),
        medianprops=dict(color="red"),
        whiskerprops=dict(color="blue"),
        capprops=dict(color="blue"),
        flierprops=dict(markerfacecolor="orange", marker="o", markersize=5),
    )
    ax.set_title(column)
    ax.grid(True, axis="x", linestyle="--")

plt.tight_layout()
plt.show()

In [None]:
# Numerical characteristics of all numeric variables.
# Restrict to numeric columns, as the correlation cell does: on pandas >= 2.0
# the reductions below raise for non-numeric columns instead of skipping them.
numeric_df = df.select_dtypes(include=["number"])

# One row per variable, one column per statistic. The dict keys already become
# the column headers, so no separate renaming step is needed.
# var() and std() use the pandas default (sample statistics, ddof=1).
table = pd.DataFrame({
    "Mean": numeric_df.mean(),
    "Variance": numeric_df.var(),
    "Deviation": numeric_df.std(),
    "Median": numeric_df.median(),
    "Q1": numeric_df.quantile(0.25),
    "Q3": numeric_df.quantile(0.75),
})
print(tabulate(table, headers="keys", tablefmt="fancy_grid"))

In [None]:
# Pairwise correlation coefficients between the numeric variables.
# Step 1: keep only the numeric columns.
numeric_columns = df.select_dtypes(include=["number"])
# Step 2: correlation matrix (DataFrame.corr with its default settings).
correlations = numeric_columns.corr()
# Step 3: render the matrix as a text grid and print it.
correlation_grid = tabulate(correlations, headers="keys", tablefmt="fancy_grid")
print(correlation_grid)