In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy import stats

In [2]:
# Load titanic.csv from the toydata directory (path is relative to the notebook).
df = pd.read_csv("../toydata/titanic.csv")
# Print the (rows, columns) tuple, then preview just the first record.
print(df.shape)
df.head(1)

(891, 15)


Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False


### Summary statistics for numeric and categorical columns


In [14]:
# Describe the numeric columns only. The 1st and 99th percentiles are requested
# in addition to the quartiles so the tails of each distribution are visible.
df.select_dtypes(include="number").describe(
    percentiles=[0.01, 0.25, 0.5, 0.75, 0.99]
)

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
1%,0.0,1.0,1.0,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
99%,1.0,3.0,65.87,5.0,4.0,249.00622
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [15]:
# Describe the object-dtype (text) columns: count, unique, top value and its frequency.
df.select_dtypes(include="object").describe()

Unnamed: 0,sex,embarked,class,who,deck,embark_town,alive
count,891,889,891,891,203,889,891
unique,2,3,3,3,7,3,2
top,male,S,Third,man,C,Southampton,no
freq,577,644,491,537,59,644,549


### Missing, zero, negative, blank values

In [3]:
# Per-column data-quality counts. Each result is a Series indexed by column
# name; the dtype-restricted checks only cover the columns they apply to.
n_nulls = df.isna().sum()
# Every column is compared with 0, so False in boolean columns counts as a zero.
n_zero = df.eq(0).sum()
n_negative = df.select_dtypes(include="number").lt(0).sum()
# Whitespace-only strings are treated as blank (values are stripped first).
n_blank_str = (
    df.select_dtypes(include=object)
    .apply(lambda col: col.str.strip().eq(""))
    .sum()
)

# Building the frame aligns the Series on column name; a column that a check
# does not cover is NaN in that check's column.
df_sums = pd.DataFrame(
    dict(
        n_missing=n_nulls,
        n_zero=n_zero,
        n_negative=n_negative,
        n_blank_string=n_blank_str,
    )
)

# The same table as a percentage of the row count, rounded to two decimals.
df_sums_pct = (df_sums / len(df) * 100).round(2).add_suffix("_pct")

In [12]:
# Show the count table as text, blanking the NaN cells left by checks that do
# not apply to a column. Written as a single display expression instead of
# assigning to `_`, because IPython reserves `_` for the previous cell's output
# and rebinding it shadows that.
df_sums.astype(str).replace(["nan", "NaN"], "")

Unnamed: 0,n_missing,n_zero,n_negative,n_blank_string
adult_male,0,354,,
age,177,0,0.0,
alive,0,0,,0.0
alone,0,354,,
class,0,0,,0.0
deck,688,0,,0.0
embark_town,2,0,,0.0
embarked,2,0,,0.0
fare,0,15,0.0,
parch,0,678,0.0,


In [13]:
# Show the percentage table as text, blanking the NaN cells left by checks that
# do not apply to a column. Written as a single display expression instead of
# assigning to `_`, because IPython reserves `_` for the previous cell's output
# and rebinding it shadows that.
df_sums_pct.astype(str).replace(["nan", "NaN"], "")

Unnamed: 0,n_missing_pct,n_zero_pct,n_negative_pct,n_blank_string_pct
adult_male,0.0,39.73,,
age,19.87,0.0,0.0,
alive,0.0,0.0,,0.0
alone,0.0,39.73,,
class,0.0,0.0,,0.0
deck,77.22,0.0,,0.0
embark_town,0.22,0.0,,0.0
embarked,0.22,0.0,,0.0
fare,0.0,1.68,0.0,
parch,0.0,76.09,0.0,
