This notebook is meant to validate the results from

https://twitter.com/AviBittMD/status/1718725429693972802

In [1]:
import PyPDF2
import pandas as pd

def remove_pages_from_pdf(input_pdf, output_pdf, start_page, end_page):
    with open(input_pdf, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        writer = PyPDF2.PdfWriter()

        for page_num in range(len(reader.pages)):
            # Page numbers in PyPDF2 start from 0, but are usually 1-based in documents
            if page_num + 1 < start_page or page_num + 1 > end_page:
                writer.add_page(reader.pages[page_num])

        with open(output_pdf, 'wb') as out:
            writer.write(out)


# Remove pages that correspond to missing people, we only look at the confirmed deaths
remove_pages_from_pdf("gaza.pdf", "temp.pdf", 156, 212)
remove_pages_from_pdf("temp.pdf", "gaza_with_missing_removed.pdf", 1, 5)


In [2]:
import tabula.io as tabula

# Define the path to your PDF and the path where you want to save the CSV
pdf_path = "gaza_with_missing_removed.pdf"

# Use tabula to read the PDF
tables = tabula.read_pdf(pdf_path, pages='all', multiple_tables=True, lattice=True, pandas_options={'header':None})


Nov 01, 2023 12:58:05 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>
Nov 01, 2023 12:58:05 PM org.apache.pdfbox.pdmodel.font.PDTrueTypeFont <init>


In [3]:
print(tables)

[        0           1                               2      3      4
0   مسلسل  رقم الهوية                           الاسم  الجنس  العمر
1       1   945271740         عايشة خليل حمدان الاسطل   انثى     72
2       2   975591074        وهبه عبدالله حسين الاسطل   انثى     65
3       3   955256672        ابراهيم حامد حسين الاسطل    ذكر     62
4       4   933751844         محمد سليمان تركي الاسطل    ذكر     61
5       5   901959221         ناديه ياسين حسين الاسطل   انثى     60
6       6   910347657           سلوى محمد خليل الاسطل   انثى     60
7       7   926056078        هناء ابراهيم نعيم الاسطل   انثى     59
8       8   957071368       سليمان محمد سليمان الاسطل    ذكر     58
9       9   957065493           يسري أحمد يوسف الاسطل   انثى     55
10     10   976047365         شفيقه شحادة بحري الاسطل   انثى     54
11     11   947631420            انور محمد علي الاسطل    ذكر     54
12     12   952217016         وائله أحمد مصطفى الاسطل   انثى     51
13     13   900314667          وائل أحمد مصطفى 

In [4]:
tables_with_5_columns = []
tables_other = []

for table in tables:
    if len(table.columns) == 5:
        tables_with_5_columns.append(table)
    else:
        tables_other.append(table)

print(len(tables_with_5_columns))
print(len(tables_other))

147
3


In [5]:
tables_with_5_columns[0]

Unnamed: 0,0,1,2,3,4
0,مسلسل,رقم الهوية,الاسم,الجنس,العمر
1,1,945271740,عايشة خليل حمدان الاسطل,انثى,72
2,2,975591074,وهبه عبدالله حسين الاسطل,انثى,65
3,3,955256672,ابراهيم حامد حسين الاسطل,ذكر,62
4,4,933751844,محمد سليمان تركي الاسطل,ذكر,61
5,5,901959221,ناديه ياسين حسين الاسطل,انثى,60
6,6,910347657,سلوى محمد خليل الاسطل,انثى,60
7,7,926056078,هناء ابراهيم نعيم الاسطل,انثى,59
8,8,957071368,سليمان محمد سليمان الاسطل,ذكر,58
9,9,957065493,يسري أحمد يوسف الاسطل,انثى,55


In [6]:
test = pd.concat(tables_with_5_columns, axis=0, ignore_index=True, join='outer', levels=None, names=None, verify_integrity=False, sort=False, copy=True)
test

Unnamed: 0,0,1,2,3,4
0,مسلسل,رقم الهوية,الاسم,الجنس,العمر
1,1,945271740,عايشة خليل حمدان الاسطل,انثى,72
2,2,975591074,وهبه عبدالله حسين الاسطل,انثى,65
3,3,955256672,ابراهيم حامد حسين الاسطل,ذكر,62
4,4,933751844,محمد سليمان تركي الاسطل,ذكر,61
...,...,...,...,...,...
6608,6743,444166854,زينة أحمد محمد شتات,انثى,أقل من عام
6609,6744,444655245,رتيل ياسر جمعة ابو الفيته,انثى,أقل من عام
6610,6745,445921091,حلا ياسر حامد السنوار,انثى,أقل من عام
6611,6746,444133656,سوار خالد صابر ابو قشلان,انثى,أقل من عام


In [7]:
tables_other[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,,,2924,,,906335922,,,حليمه سلمان محمد بارود,,,انثى,,,81,,,
1,,,2925,,,913160941,,,منور شعبان عبد الله مصلح,,,انثى,,,65,,,
2,,,2926,,,920160819,,,محمود فرحان اسماعيل الديراوي,,,ذكر,,,63,,,
3,,,2927,,,907120166,,,ماجد محمد عامر مصلح,,,ذكر,,,63,,,
4,,,2928,,,921226007,,,ختام أحمد موسى ابو شاربين,,,انثى,,,63,,,
5,,,2929,,,968760660,,,بسام أحمد علي ابو شاربين,,,ذكر,,,62,,,
6,,,2930,,,906335864,,,نعيم سلمان محمد بارود,,,ذكر,,,61,,,
7,,,2931,,,918251141,,,ابراهيم حمدان محمد برهوم,,,ذكر,,,61,,,
8,,,2932,,,917763096,,,صباح محمد أحمد برهوم,,,انثى,,,61,,,
9,,,2933,,,947655197,,,منير سالم محمود الطلاع,,,ذكر,,,61,,,


In [8]:
# Clean up the first table
column_indices = [2, 5, 8, 11, 14]
new_table = tables_other[0].iloc[:, column_indices]
new_table.columns = range(new_table.shape[1])
new_table

Unnamed: 0,0,1,2,3,4
0,2924,906335922,حليمه سلمان محمد بارود,انثى,81
1,2925,913160941,منور شعبان عبد الله مصلح,انثى,65
2,2926,920160819,محمود فرحان اسماعيل الديراوي,ذكر,63
3,2927,907120166,ماجد محمد عامر مصلح,ذكر,63
4,2928,921226007,ختام أحمد موسى ابو شاربين,انثى,63
5,2929,968760660,بسام أحمد علي ابو شاربين,ذكر,62
6,2930,906335864,نعيم سلمان محمد بارود,ذكر,61
7,2931,918251141,ابراهيم حمدان محمد برهوم,ذكر,61
8,2932,917763096,صباح محمد أحمد برهوم,انثى,61
9,2933,947655197,منير سالم محمود الطلاع,ذكر,61


In [9]:
new_table = tables_other[1].iloc[:, column_indices]
new_table.columns = range(new_table.shape[1])
new_table

Unnamed: 0,0,1,2,3,4
0,3014,408540896,هيثم نائل عايش شعت,ذكر,21
1,3015,408766657,ملك ياسر يوسف دواس,انثى,20
2,3016,409339918,نيبال صبري عوض برهوم,انثى,20
3,3017,408769669,يوسف ماهر يوسف دواس,ذكر,20
4,3018,409050747,فارس إيهاب نبيل دبابش,ذكر,20
5,3019,409096211,ملك بسام أحمد ابو شاربين,انثى,20
6,3020,409496429,اسماء هشام مصطفى ابو ناصر,انثى,20
7,3021,412390692,حلا عمر علي الخضري,انثى,20
8,3022,409766839,لينا فضل محمد ابو ناصر,انثى,20
9,3023,408142719,محمد ناصر طلب الخضري,ذكر,20


In [10]:
new_table = tables_other[2].iloc[:, column_indices]
new_table.columns = range(new_table.shape[1])
new_table

Unnamed: 0,0,1,2,3,4
0,5849,405910498,سعيد زياد سعيد عسليه,ذكر,23
1,5850,405987272,زيد حلمي أحمد عقل,ذكر,23
2,5851,407661024,أنس مهدي محمد فياض,ذكر,22
3,5852,406127373,علا عاطف عبد الهادي عطوان,انثى,22
4,5853,407161827,احمد محمود أحمد القطشان,ذكر,22
5,5854,407125905,محمد مجدي ديب النواجحه,ذكر,22
6,5855,407830975,الاء عايش محمود ابو العوف,انثى,22
7,5856,407878388,اسامه أحمد محمد الصوفي,ذكر,22
8,5857,412341380,احمد يوسف ابراهيم ابو عيده,ذكر,22
9,5858,408040830,لينا رفيق صبحي المناصره,انثى,21


In [11]:
# Since the other set of tables all look the same, we can just clean them up in the same way
column_indices = [2, 5, 8, 11, 14]
cleaned_other_tables = []
for table in tables_other:
    new_table = table.iloc[:, column_indices]
    new_table.columns = range(new_table.shape[1])
    cleaned_other_tables.append(new_table)
cleaned_other_tables

[       0          1                             2     3   4
 0   2924  906335922        حليمه سلمان محمد بارود  انثى  81
 1   2925  913160941      منور شعبان عبد الله مصلح  انثى  65
 2   2926  920160819  محمود فرحان اسماعيل الديراوي   ذكر  63
 3   2927  907120166           ماجد محمد عامر مصلح   ذكر  63
 4   2928  921226007     ختام أحمد موسى ابو شاربين  انثى  63
 5   2929  968760660      بسام أحمد علي ابو شاربين   ذكر  62
 6   2930  906335864         نعيم سلمان محمد بارود   ذكر  61
 7   2931  918251141      ابراهيم حمدان محمد برهوم   ذكر  61
 8   2932  917763096          صباح محمد أحمد برهوم  انثى  61
 9   2933  947655197        منير سالم محمود الطلاع   ذكر  61
 10  2934  906905153      سامي يوسف ابراهيم الخضري   ذكر  60
 11  2935  927031476      نهله محمد محمود ابو ناصر  انثى  60
 12  2936  920883287           أحمد سليم عبد دبابش   ذكر  58
 13  2937  966657942      هشام مصطفى سليم ابو ناصر   ذكر  56
 14  2938  903456275           ماهر يوسف محمد دواس   ذكر  56
 15  2939  927031492    

In [12]:
for table in cleaned_other_tables:
    tables_with_5_columns.append(table)
    
final_df = pd.concat(tables_with_5_columns, axis=0, ignore_index=True, join='outer', levels=None, names=None, verify_integrity=False, sort=False, copy=True)
print(final_df.columns)
final_df

RangeIndex(start=0, stop=5, step=1)


Unnamed: 0,0,1,2,3,4
0,مسلسل,رقم الهوية,الاسم,الجنس,العمر
1,1,945271740,عايشة خليل حمدان الاسطل,انثى,72
2,2,975591074,وهبه عبدالله حسين الاسطل,انثى,65
3,3,955256672,ابراهيم حامد حسين الاسطل,ذكر,62
4,4,933751844,محمد سليمان تركي الاسطل,ذكر,61
...,...,...,...,...,...
6743,5889,424155307,شذا عمر صابر كلوب,انثى,17
6744,5890,421850793,بلال حسن صلاح السنداوي,ذكر,16
6745,5891,424564615,عبد الكريم نبيل عبد الكريم داود,ذكر,16
6746,5892,424558849,محمد علي نائل علي الخليلي,ذكر,16


In [13]:
# Sanity check

print(final_df[0].nunique() == final_df.shape[0])
print(final_df[1].nunique() == final_df.shape[0])

True
False


In [14]:
# create a working copy where we modify the columns 
df = final_df.copy()

In [15]:
# This is a duplicate in the hamas dataset on page 5
duplicated_rows = df[df.iloc[:, 1].duplicated(keep=False)]
print(duplicated_rows)

       0          1                        2    3   4
200  200  426457511  يزن سليمان سلمان النجار  ذكر  14
201  201  426457511  يزن سليمان سلمان النجار  ذكر  14


In [16]:
# Drop the first row
df = df.iloc[1:]

# Convert every entry in the first column to an integer
df[df.columns[0]] = df[df.columns[0]].astype(int)


In [17]:
df = df.sort_values(by=df.columns[0])
df = df.reset_index(drop=True)
df

Unnamed: 0,0,1,2,3,4
0,1,945271740,عايشة خليل حمدان الاسطل,انثى,72
1,2,975591074,وهبه عبدالله حسين الاسطل,انثى,65
2,3,955256672,ابراهيم حامد حسين الاسطل,ذكر,62
3,4,933751844,محمد سليمان تركي الاسطل,ذكر,61
4,5,901959221,ناديه ياسين حسين الاسطل,انثى,60
...,...,...,...,...,...
6742,6743,444166854,زينة أحمد محمد شتات,انثى,أقل من عام
6743,6744,444655245,رتيل ياسر جمعة ابو الفيته,انثى,أقل من عام
6744,6745,445921091,حلا ياسر حامد السنوار,انثى,أقل من عام
6745,6746,444133656,سوار خالد صابر ابو قشلان,انثى,أقل من عام


In [18]:
# Find the missing rows
expected_set = set(range(1, 6597)) # 1 to 6597 inclusive
actual_set = set(df.iloc[:, 0])
missing_numbers = expected_set - actual_set
print(sorted(list(missing_numbers)))
print("number of missing entries:",len(missing_numbers))

[]
number of missing entries: 0


In [19]:
unique_keys = df.iloc[:, 3].unique()
unique_keys

array(['انثى', 'ذكر'], dtype=object)

In [20]:
df.iloc[:, 3] = df.iloc[:, 3].map(lambda x: 'M' if 'ذكر' in x else 'F')
df

Unnamed: 0,0,1,2,3,4
0,1,945271740,عايشة خليل حمدان الاسطل,F,72
1,2,975591074,وهبه عبدالله حسين الاسطل,F,65
2,3,955256672,ابراهيم حامد حسين الاسطل,M,62
3,4,933751844,محمد سليمان تركي الاسطل,M,61
4,5,901959221,ناديه ياسين حسين الاسطل,F,60
...,...,...,...,...,...
6742,6743,444166854,زينة أحمد محمد شتات,F,أقل من عام
6743,6744,444655245,رتيل ياسر جمعة ابو الفيته,F,أقل من عام
6744,6745,445921091,حلا ياسر حامد السنوار,F,أقل من عام
6745,6746,444133656,سوار خالد صابر ابو قشلان,F,أقل من عام


In [21]:
count_F = df[df.iloc[:, 3] == 'F'].shape[0]
total = len(df)
print(count_F)
count_M = total-count_F
print(count_M)
print("% men:", count_M/total)
print("% women:", count_F/total)

2902
3845
% men: 0.5698829109233734
% women: 0.43011708907662666


In [22]:
# Let's check how many of these entries are not integers and what they are instead
unique_keys = df.iloc[:, 4].unique()
def is_integer_string(s):
    try:
        int(s)
        return True
    except ValueError:
        return False
        
for key in unique_keys:
    if not is_integer_string(key):
        print(key)

أقل من عام


In [23]:
# أقل من عام means less than one year old.

# For our purposes we will assume non-integer entries are 0.5 years old.
def convert_to_float_or_half(s):
    trimmed = ""
    try:
        trimmed = s.strip()
    except Exception:
        trimmed = s
    try:
        return float(trimmed)
    except ValueError:
        return 0.5

df.iloc[:, 4] = df.iloc[:, 4].apply(convert_to_float_or_half)
df

Unnamed: 0,0,1,2,3,4
0,1,945271740,عايشة خليل حمدان الاسطل,F,72.0
1,2,975591074,وهبه عبدالله حسين الاسطل,F,65.0
2,3,955256672,ابراهيم حامد حسين الاسطل,M,62.0
3,4,933751844,محمد سليمان تركي الاسطل,M,61.0
4,5,901959221,ناديه ياسين حسين الاسطل,F,60.0
...,...,...,...,...,...
6742,6743,444166854,زينة أحمد محمد شتات,F,0.5
6743,6744,444655245,رتيل ياسر جمعة ابو الفيته,F,0.5
6744,6745,445921091,حلا ياسر حامد السنوار,F,0.5
6745,6746,444133656,سوار خالد صابر ابو قشلان,F,0.5


In [24]:
female_or_minor_count = df[(df.iloc[:, 3] == 'F') | (df.iloc[:, 4] < 18)]
female_or_minor_count = len(female_or_minor_count)
print(female_or_minor_count)
print("% of people who are either female or a minor", female_or_minor_count/len(df))

4346
% of people who are either female or a minor 0.6441381354676152


https://www.indexmundi.com/gaza_strip/demographics_profile.html

0-14 years: 42.53% (male 418,751/female 397,013)

15-24 years: 21.67% (male 210,240/female 205,385)

25-54 years: 29.47% (male 275,976/female 289,277)

55-64 years: 3.66% (male 36,409/female 33,731)

65 years and over: 2.68% (male 27,248/female 24,191) (2020 est.)

In [25]:
# July 2021 Gaza Population Info (https://www.indexmundi.com/gaza_strip/demographics_profile.html)
gaza_population = 1957062
# Population by age and gender

total_deaths = len(df)

# 0-14 years
population_0_14_male = 418751
population_0_14_female = 397013
population_0_14 = population_0_14_male + population_0_14_female

# 15-24 years
population_15_24_male = 210240
population_15_24_female = 205385
population_15_24 = population_15_24_male + population_15_24_female

# 25-54 years
population_25_54_male = 275976
population_25_54_female = 289277
population_25_54 = population_25_54_male + population_25_54_female

# 55-64 years
population_55_64_male = 36409
population_55_64_female = 33731
population_55_64 = population_55_64_male + population_55_64_female

# 65 years and over
population_65_over_male = 27248
population_65_over_female = 24191
population_65_over = population_65_over_male + population_65_over_female

In [26]:
# Get the entries for males
male_df = df[df[3] == 'M']

# create bins for pd.cut
bins = [0, 14, 24, 54, 64, float('inf')]
# labels for bins
m_labels = ['Males 0-14', 'Males 15-24', 'Males 25-54', 'Males 55-64', 'Males 65+']
# Group males by age
observed_male_age_series = pd.cut(male_df[4], bins=bins, labels=m_labels, right=True)
# Sum the bins
observed_male_age_groups = observed_male_age_series.value_counts(sort=False).tolist()
observed_male_age_groups

[1212, 733, 1541, 198, 161]

In [27]:
# Get the entries for females
female_df = df[df[3] == 'F']

# create bins for pd.cut
bins = [0, 14, 24, 54, 64, float('inf')]
# labels for bins
f_labels = ['Females 0-14', 'Females 15-24', 'Females 25-54', 'Females 55-64', 'Females 65+']
# Group females by age
observed_female_age_series = pd.cut(female_df[4], bins=bins, labels=f_labels, right=True)
# Sum the bins
observed_female_age_groups = observed_female_age_series.value_counts(sort=False).tolist()
observed_female_age_groups

[1065, 494, 1045, 171, 127]

In [28]:
from scipy.stats import chi2_contingency

male_demographics_vector = [population_0_14_male, population_15_24_male, population_25_54_male, population_55_64_male, population_65_over_male]
expected_male = [(x/gaza_population)*total_deaths for x in male_demographics_vector]

female_demographics_vector = [population_0_14_female, population_15_24_female, population_25_54_female, population_55_64_female, population_65_over_female]
expected_female = [(x/gaza_population)*total_deaths for x in female_demographics_vector]

percent_female = [(x/total_deaths) for x in observed_female_age_groups]
percent_male = [(x/total_deaths) for x in observed_male_age_groups]

percent_female_pop = [(x/gaza_population) for x in female_demographics_vector]
percent_male_pop = [(x/gaza_population) for x in male_demographics_vector]

data_males = {
    'Category': m_labels, # Add all categories
    'Observed': observed_male_age_groups, # Number of deaths in each age group
    'Population': male_demographics_vector, # Population by age
    '% of Total Deaths': percent_male,
    '% of Total Population': percent_male_pop,
    'Expected': expected_male   # Expected deaths based on Gaza demographics
}
males_df = pd.DataFrame(data_males)

data_females = {
    'Category': f_labels, # Add all categories
    'Observed': observed_female_age_groups, # Number of deaths in each age group
    'Population': female_demographics_vector,
    '% of Total Deaths': percent_female,
    '% of Total Population': percent_female_pop,
    'Expected': expected_female   # Expected deaths based on Gaza demographics
}
females_df = pd.DataFrame(data_females)

combined_labels = ['0-14', '15-24', '25-54', '55-64', '65+']
combined_data = {
    'Category': combined_labels,
    'Observed': males_df['Observed'] + females_df['Observed'],
    'Population': males_df['Population'] + females_df['Population'],
    '% of Total Deaths': males_df['% of Total Deaths'] + females_df['% of Total Deaths'],
    '% of Total Population': males_df['% of Total Population'] + females_df['% of Total Population'],
    'Expected': males_df['Expected'] + females_df['Expected']
}
combined_df = pd.DataFrame(combined_data)

In [29]:
males_df.set_index('Category', inplace=True)
males_df

Unnamed: 0_level_0,Observed,Population,% of Total Deaths,% of Total Population,Expected
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Males 0-14,1212,418751,0.179635,0.213969,1443.650225
Males 15-24,733,210240,0.108641,0.107426,724.805489
Males 25-54,1541,275976,0.228398,0.141015,951.431315
Males 55-64,198,36409,0.029346,0.018604,125.520562
Males 65+,161,27248,0.023862,0.013923,93.93788


In [30]:
females_df.set_index('Category', inplace=True)
females_df

Unnamed: 0_level_0,Observed,Population,% of Total Deaths,% of Total Population,Expected
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Females 0-14,1065,397013,0.157848,0.202862,1368.708151
Females 15-24,494,205385,0.073218,0.104946,708.067805
Females 25-54,1045,289277,0.154884,0.147812,997.286708
Females 55-64,171,33731,0.025345,0.017236,116.288118
Females 65+,127,24191,0.018823,0.012361,83.398828


In [31]:
combined_df.set_index('Category', inplace=True)
combined_df

Unnamed: 0_level_0,Observed,Population,% of Total Deaths,% of Total Population,Expected
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0-14,2277,815764,0.337483,0.416831,2812.358376
15-24,1227,415625,0.181859,0.212372,1432.873294
25-54,2586,565253,0.383281,0.288827,1948.718023
55-64,369,70140,0.054691,0.035839,241.808681
65+,288,51439,0.042686,0.026284,177.336708


In [32]:
def print_chi2_from_contingency_table(table):
    chi2, p, _, _ = chi2_contingency(table, correction=False)
    print("contingency table:", table)
    print("chi2:", chi2)
    print("p value:", p)


In [33]:
# Break it down by gender

females_killed = females_df['Observed'].sum()
females_not_killed = sum(female_demographics_vector)  - females_killed
males_killed = males_df['Observed'].sum()
males_not_killed = sum(male_demographics_vector)  - males_killed

gender_contingency_table = [[males_killed, males_not_killed], [females_killed, females_not_killed]]
print_chi2_from_contingency_table(gender_contingency_table)

contingency table: [[3845, 964779], [2902, 946695]]
chi2: 114.16835873378572
p value: 1.1970133773247308e-26


There is a very statistically significant disproportionate favoring of males over females.

In [34]:
# Break it down by age
# ['0-14', '15-24', '25-54', '55-64', '65+']

killed_0_14 = combined_df.at['0-14', 'Observed']
not_killed_0_14 = combined_df.at['0-14', 'Population'] - killed_0_14

killed_15_24 = combined_df.at['15-24', 'Observed']
not_killed_15_24 = combined_df.at['15-24', 'Population'] - killed_15_24

killed_25_54 = combined_df.at['25-54', 'Observed']
not_killed_25_54 = combined_df.at['25-54', 'Population'] - killed_25_54

killed_55_64= combined_df.at['55-64', 'Observed']
not_killed_55_64 = combined_df.at['55-64', 'Population'] - killed_55_64

killed_65_plus = combined_df.at['65+', 'Observed']
not_killed_65_plus = combined_df.at['65+', 'Population'] - killed_65_plus


age_consistency_table = [[killed_0_14, not_killed_0_14], 
                         [killed_15_24, not_killed_15_24],
                         [killed_25_54, not_killed_25_54], 
                         [killed_55_64, not_killed_55_64],
                        [killed_65_plus, not_killed_65_plus]]
print_chi2_from_contingency_table(age_consistency_table)

contingency table: [[2277, 813487], [1227, 414398], [2586, 562667], [369, 69771], [288, 51151]]
chi2: 465.39291944018606
p value: 2.0410973480143927e-99


The statistical significance between those that are killed and those that are not based on their age cannot be understated...

In [35]:
# Break it down by age (children vs not children)

children_killed = combined_df.at['0-14', 'Observed']
children_not_killed = combined_df.at['0-14', 'Population'] - children_killed
adults_killed = combined_df.drop('0-14').loc[:, 'Observed'].sum()
adults_not_killed = combined_df.drop('0-14').loc[:, 'Population'].sum() - adults_killed

age_consistency_table_children = [[children_killed, children_not_killed],[adults_killed, adults_not_killed]]
print_chi2_from_contingency_table(age_consistency_table_children)

contingency table: [[2277, 813487], [4470, 1097987]]
chi2: 213.49138353099508
p value: 2.37779289536584e-48


There is a very statistically significant disproportionate favoring of children over adults

In [36]:
# Break it down by age for men (children vs adults)

male_children_killed = males_df.at['Males 0-14', 'Observed']
male_children_not_killed = males_df.at['Males 0-14', 'Population'] - male_children_killed
male_adults_killed = males_df.drop('Males 0-14').loc[:, 'Observed'].sum()
male_adults_not_killed = males_df.drop('Males 0-14').loc[:, 'Population'].sum() - male_adults_killed

male_age_consistency_table = [[male_children_killed, male_children_not_killed],[male_adults_killed, male_adults_not_killed]]
print_chi2_from_contingency_table(male_age_consistency_table)

contingency table: [[1212, 417539], [2633, 547240]]
chi2: 215.69265854967588
p value: 7.869842036893857e-49


There is a very statistically significant disproportionate favoring of children over adults among the male population

In [37]:
# Break it down by age for women (children vs adults)

female_children_killed = females_df.at['Females 0-14', 'Observed']
female_children_not_killed = females_df.at['Females 0-14', 'Population'] - female_children_killed
female_adults_killed = females_df.drop('Females 0-14').loc[:, 'Observed'].sum()
female_adults_not_killed = females_df.drop('Females 0-14').loc[:, 'Population'].sum() - female_adults_killed

female_age_consistency_table = [[female_children_killed, female_children_not_killed],[female_adults_killed, female_adults_not_killed]]
print_chi2_from_contingency_table(female_age_consistency_table)

contingency table: [[1065, 395948], [1837, 550747]]
chi2: 31.23931389880708
p value: 2.2809962038659684e-08


There is a very statistically significant disproportionate favoring of children over adults among the female population, although less significant than the identical hypothesis among men

In [38]:
# Break it down by age for men, 15-54 vs other age groups

males_15_to_24_killed = males_df.at['Males 0-14', 'Observed'] + males_df.at['Males 25-54', 'Observed']
males_15_to_24_not_killed = males_df.at['Males 0-14', 'Population'] - males_15_to_24_killed
other_males_killed = males_df.drop('Males 15-24').drop('Males 25-54').loc[:, 'Observed'].sum()
other_males_not_killed = males_df.drop('Males 0-14').drop('Males 25-54').loc[:, 'Population'].sum() - other_males_killed

male_other_age_consistency_table = [[males_15_to_24_killed, males_15_to_24_not_killed],[other_males_killed, other_males_not_killed]]
print_chi2_from_contingency_table(male_other_age_consistency_table)

contingency table: [[2753, 415998], [1571, 272326]]
chi2: 18.770074603020092
p value: 1.4746290067322166e-05


There is a very statistically significant disproportionate favoring of men aged 15-54 than at other age groups

In [39]:
# Break it down by age for women, 15-54 vs other age groups

females_15_to_24_killed = females_df.at['Females 0-14', 'Observed'] + females_df.at['Females 25-54', 'Observed']
females_15_to_24_not_killed = females_df.at['Females 0-14', 'Population'] - females_15_to_24_killed
other_females_killed = females_df.drop('Females 15-24').drop('Females 25-54').loc[:, 'Observed'].sum()
other_females_not_killed = females_df.drop('Females 0-14').drop('Females 25-54').loc[:, 'Population'].sum() - other_females_killed

female_other_age_consistency_table = [[females_15_to_24_killed, females_15_to_24_not_killed],[other_females_killed, other_females_not_killed]]
print_chi2_from_contingency_table(female_other_age_consistency_table)

contingency table: [[2110, 394903], [1363, 261944]]
chi2: 0.5780920045003466
p value: 0.44706116752276526


In this case, we fail to reject the null hypothesis that there is no discrepancy between women aged 15-54 and other age groups.