In [32]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import matplotlib.pyplot as plt
import seaborn as sns

##We need to perform chi square tests for finding dependencies between various categorical variables

In [3]:
attacks_activities_cleaned = pd.read_csv('attacks_cleaned_activity.csv')

In [4]:
attacks_activities_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3760 entries, 0 to 3759
Data columns (total 19 columns):
Unnamed: 0                3760 non-null int64
Case Number               3760 non-null object
Date                      3760 non-null object
Year                      3760 non-null int64
Type                      3760 non-null object
Country                   3760 non-null object
Area                      3760 non-null object
Location                  3760 non-null object
Activity                  3760 non-null object
Name                      3760 non-null object
Sex                       3760 non-null object
Age                       3760 non-null float64
Injury                    3760 non-null object
Is_Fatal                  3760 non-null object
Investigator or Source    3760 non-null object
col                       3760 non-null object
Month                     3760 non-null int64
Activity_new              3760 non-null object
Count                     3760 non-null int64
dty

## Here Is_Fatal can be considered the target variable so lets check out dependencies with respect to that

In [6]:
attacks_activities_cleaned = attacks_activities_cleaned[attacks_activities_cleaned['Is_Fatal']!='UNKNOWN']

In [10]:
attacks_activities_cleaned = attacks_activities_cleaned[attacks_activities_cleaned['Is_Fatal']!=' N']

In [11]:
crosstab_1 = pd.crosstab(attacks_activities_cleaned.Activity_new,attacks_activities_cleaned.Is_Fatal,margins=True)
crosstab_1

Is_Fatal,N,Y,All
Activity_new,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bathing,75,61,136
Bodyboarding,95,24,119
Boogie boarding,46,2,48
Diving,260,86,346
Fishing,309,63,372
Floating,32,6,38
Kayaking,41,3,44
Playing,20,1,21
Snorkeling,64,8,72
Spearfishing,270,44,314


# Ho:There is no dependency between Fatality and Activity
# H1:There is  dependency

In [12]:
#Chi-square between Is_Fatal and Activity
chi2_contingency(crosstab_1)

(408.22548813519523,
 4.14400667633238e-69,
 28,
 array([[ 107.7105474 ,   28.2894526 ,  136.        ],
        [  94.24672897,   24.75327103,  119.        ],
        [  38.01548732,    9.98451268,   48.        ],
        [ 274.02830441,   71.97169559,  346.        ],
        [ 294.6200267 ,   77.3799733 ,  372.        ],
        [  30.09559413,    7.90440587,   38.        ],
        [  34.84753004,    9.15246996,   44.        ],
        [  16.6317757 ,    4.3682243 ,   21.        ],
        [  57.02323097,   14.97676903,   72.        ],
        [ 248.68464619,   65.31535381,  314.        ],
        [ 135.43017356,   35.56982644,  171.        ],
        [ 750.01388518,  196.98611482,  947.        ],
        [ 702.49452603,  184.50547397,  887.        ],
        [ 182.15754339,   47.84245661,  230.        ],
        [2966.        ,  779.        , 3745.        ]]))

# Given P-value is lesser then significant value(alpha = 0.05) hence we can reject null hypothesis
# There is a dependency between Fatality and Activity

In [18]:
attacks_activities_cleaned = attacks_activities_cleaned[(attacks_activities_cleaned['Sex ']!='lli') & (attacks_activities_cleaned['Sex ']!='M ') & (attacks_activities_cleaned['Sex ']!='N')]


In [19]:
attacks_activities_cleaned = attacks_activities_cleaned.rename(columns={'Sex ':'Sex'})

In [20]:
crosstab_2 = pd.crosstab(attacks_activities_cleaned.Sex,attacks_activities_cleaned.Is_Fatal,margins=True)
crosstab_2

Is_Fatal,N,Y,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
F,362,77,439
M,2600,702,3302
All,2962,779,3741


# Ho:There is no dependency between Sex and Fatality
# H1:There is dependency 

In [22]:
#Chi-square between Is_Fatal and Sex
chi2_contingency(crosstab_2)

(3.2522790922114146,
 0.516528593695736,
 4,
 array([[ 347.58567228,   91.41432772,  439.        ],
        [2614.41432772,  687.58567228, 3302.        ],
        [2962.        ,  779.        , 3741.        ]]))

# Since p-value is greater then alpha we accept null
# Here we find that Fatality and Sex is independent

In [23]:
#Now for Month and Fatality
crosstab_3 = pd.crosstab(attacks_activities_cleaned.Month,attacks_activities_cleaned.Is_Fatal,margins=True)
crosstab_3

Is_Fatal,N,Y,All
Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,244,82,326
2,183,59,242
3,200,56,256
4,225,65,290
5,205,41,246
6,269,61,330
7,358,85,443
8,336,78,414
9,296,68,364
10,254,46,300


In [24]:
#Chi-square between Is_Fatal and Month
chi2_contingency(crosstab_3)

(45.34039320189443,
 0.005308192456852075,
 24,
 array([[ 258.11601176,   67.88398824,  326.        ],
        [ 191.60759155,   50.39240845,  242.        ],
        [ 202.69232825,   53.30767175,  256.        ],
        [ 229.6124031 ,   60.3875969 ,  290.        ],
        [ 194.77465918,   51.22534082,  246.        ],
        [ 261.28307939,   68.71692061,  330.        ],
        [ 350.75273991,   92.24726009,  443.        ],
        [ 327.7914996 ,   86.2085004 ,  414.        ],
        [ 288.20315424,   75.79684576,  364.        ],
        [ 237.53007217,   62.46992783,  300.        ],
        [ 197.94172681,   52.05827319,  250.        ],
        [ 221.69473403,   58.30526597,  280.        ],
        [2962.        ,  779.        , 3741.        ]]))

# P-value lesser then alpha we reject null so there is a dependency between month and fatality

In [25]:
#Same for Country and fatality
crosstab_4 = pd.crosstab(attacks_activities_cleaned.Country,attacks_activities_cleaned.Is_Fatal,margins=True)
crosstab_4

Is_Fatal,N,Y,All
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TONGA,1,0,1
AMERICAN SAMOA,0,3,3
ANTIGUA,1,0,1
ARGENTINA,1,0,1
AUSTRALIA,620,231,851
...,...,...,...
VIETNAM,5,0,5
WESTERN SAMOA,0,1,1
YEMEN,0,1,1
YEMEN,0,2,2


In [26]:
#Chi-square between Is_Fatal and Month
chi2_contingency(crosstab_4)

(549.8782286665478,
 4.227733344821076e-27,
 236,
 array([[7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [2.37530072e+00, 6.24699278e-01, 3.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [6.73793638e+02, 1.77206362e+02, 8.51000000e+02],
        [3.87965785e+01, 1.02034215e+01, 4.90000000e+01],
        [2.37530072e+00, 6.24699278e-01, 3.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [4.75060144e+00, 1.24939856e+00, 6.00000000e+00],
        [5.54236835e+01, 1.45763165e+01, 7.00000000e+01],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [1.58353381e+00, 4.16466186e-01, 2.00000000e+00],
        [4.75060144e+00, 1.24939856e+00, 6.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [3.16706763e+00, 8.32932371e-01, 4.00000000e+00],
        [4.75060144e+0

# There is dependency between Country and Fatality

In [27]:
#Same for Area and fatality
crosstab_5 = pd.crosstab(attacks_activities_cleaned.Area,attacks_activities_cleaned.Is_Fatal,margins=True)
crosstab_5

Is_Fatal,N,Y,All
Area,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Kikori River mouth,1,0,1
Lau Province,1,0,1
Loyalty Islands,1,0,1
Primorje-Gorski Kotar County,0,4,4
"Split-Dalmatia Count,",1,2,3
...,...,...,...
Yasawa Islands,0,1,1
Ysabel Island,0,1,1
Zadar County,1,0,1
d’Étang-Salé,1,1,2


In [28]:
#Chi-square between Is_Fatal and Area
chi2_contingency(crosstab_5)

(983.7602402838941,
 0.00239624139736027,
 862,
 array([[7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        ...,
        [7.91766907e-01, 2.08233093e-01, 1.00000000e+00],
        [1.58353381e+00, 4.16466186e-01, 2.00000000e+00],
        [2.96200000e+03, 7.79000000e+02, 3.74100000e+03]]))

# Weirdly there is no dependency between area and fatality whereas there was between Country and Fatality