## Import datasets

In [1]:
import os
import pandas as pd

PATH = "C:/Users/Ato/Documents/Programming/Python/Titanic/data/"
titanic_dataset = "train.csv" # 2000 rows
titatinc_dataset_path = os.path.join(PATH, titanic_dataset)


df = pd.read_csv(titatinc_dataset_path)
df.head(10)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Runtimes

- Method to check runtimes

In [67]:
import time

def method_runtime(method):
  start = time.time()
  method()
  return (time.time() - start) * 1000 #in ms

- Methods to check missing values

In [79]:
def isnull_any():
  return df['Age'].isnull().any()

  
def isnull_values_sum():
  return df['Age'].isnull().values.sum() > 0


def isnull_sum():
  return df['Age'].isnull().sum() > 0


def isnull_values_any():
  return df['Age'].isnull().values.any()


def isna_any():
  return df['Age'].isna().any()

methods = [isnull_any, isnull_values_sum, isnull_sum, isnull_values_any, isna_any]

In [71]:
def runtimes(iterations):
  accumulated_runtime = [0 for method in range(len(methods))]

  for _ in range(iterations):
    for i, method in enumerate(methods):
      accumulated_runtime[i] += method_runtime(method)

  return [acc/iterations for acc in accumulated_runtime]

In [75]:
iter1000 = runtimes(1000)
iter10000 = runtimes(10000)
iter100000 = runtimes(100000)
iter1000000 = runtimes(1000000)


print(f'1000 Iterations: {iter1000}')
print(f'10000 Iterations: {iter10000}')
print(f'100000 Iterations: {iter100000}')
print(f'100000 Iterations: {iter1000000}')

1000 Iterations: [0.03929924964904785, 0.034543752670288086, 0.04612994194030762, 0.03412652015686035, 0.0354306697845459]
10000 Iterations: [0.03687276840209961, 0.035236024856567384, 0.045395946502685545, 0.03401892185211182, 0.03358380794525147]
100000 Iterations: [0.0349312424659729, 0.03636216878890991, 0.04346524238586426, 0.036452763080596924, 0.03595728635787964]
100000 Iterations: [0.03659474635124207, 0.03670069766044617, 0.046035794973373415, 0.03737280344963074, 0.03612803196907043]


- The same using "timeit"

In [87]:
import timeit

setup = """
import os
import pandas as pd

PATH = "C:/Users/Ato/Documents/Programming/Python/Titanic/data/"
titanic_dataset = "train.csv" # 2000 rows
titatinc_dataset_path = os.path.join(PATH, titanic_dataset)

df = pd.read_csv(titatinc_dataset_path)

def isnull_any():
  return df['Age'].isnull().any()

  
def isnull_values_sum():
  return df['Age'].isnull().values.sum() > 0


def isnull_sum():
  return df['Age'].isnull().sum() > 0


def isnull_values_any():
  return df['Age'].isnull().values.any()


def isna_any():
  return df['Age'].isna().any()
"""

num_iter = 100000

isnull_any_time = timeit.timeit("isnull_any()", setup=setup, number=num_iter)
isnull_values_sum_time = timeit.timeit("isnull_values_sum()",setup=setup, number=num_iter)
isnull_values_any_time = timeit.timeit("isnull_values_any()", setup=setup, number=num_iter)
isnull_sum_time = timeit.timeit("isnull_sum()", setup=setup, number=num_iter)
isna_any_time = timeit.timeit("isna_any()", setup=setup, number=num_iter)


In [88]:
print(f"isnull_any: {isnull_any_time / num_iter * 1000} ms")
print(f"isnull_values_sum: {isnull_values_sum_time / num_iter * 1000} ms")
print(f"isnull_values_any: {isnull_values_any_time / num_iter * 1000} ms")
print(f"isnull_sum: {isnull_sum_time / num_iter * 1000} ms")
print(f"isna_any_time: {isnull_sum_time / num_iter * 1000} ms")

isnull_any: 0.0351682720000008 ms
isnull_values_sum: 0.03399373799999921 ms
isnull_values_any: 0.03232469400000809 ms
isnull_sum: 0.044357607000001735 ms
isna_any_time: 0.044357607000001735 ms


### Comparing both libraries results