In [1]:
#importing libraries, the data and shaping the latter as a dataframe
import pandas as pd
import numpy as np
import sys
import re
attacks = pd.read_csv("ignoredstuff/attacks.csv",encoding = "ISO-8859-1")

# A. Initial exploration and general basic cleaning

In [2]:
#df's shape
attacks.shape

(25723, 24)

In [3]:
#df's columns
columns_attacks = attacks.columns
columns_attacks

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [4]:
#a quick view...
attacks.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
2309,1996.04.25.a,25-Apr-1996,1996.0,Unprovoked,AUSTRALIA,New South Wales,Mona Vale,Swimming,Aya Hamaea,F,...,Wobbegong shark,"Daily Telegraph, 4/26/1996, p.9",1996.04.25.a-Hamaea.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1996.04.25.a,1996.04.25.a,3994.0,,
18828,,,,,,,,,,,...,,,,,,,,,,
19871,,,,,,,,,,,...,,,,,,,,,,
13885,,,,,,,,,,,...,,,,,,,,,,
15029,,,,,,,,,,,...,,,,,,,,,,
17818,,,,,,,,,,,...,,,,,,,,,,
6236,ND.0074,"No date, After August 1926 and before 1936",0.0,Unprovoked,AUSTRALIA,Western Australia,Cossack Creek,Pearl diving,Ted Luck,M,...,,"N. Caldwell; T. Peake, GSAF",ND-0074-Ted-Luck.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,ND.0074,ND.0074,67.0,,
22425,,,,,,,,,,,...,,,,,,,,,,
1258,2008.05.14,14-May-2008,2008.0,Unprovoked,FIJI,Yasawa Islands,Turtle Island,Night diving,Aisake Sadole,M,...,,"Fiji Times, 5/15/2008",2008.05.14-Sidole.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2008.05.14,2008.05.14,5045.0,,
10598,,,,,,,,,,,...,,,,,,,,,,


In [5]:
#elemental informatio
attacks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

In [6]:
#looking for NaNs
attacks.isna().sum()

Case Number               17021
Date                      19421
Year                      19423
Type                      19425
Country                   19471
Area                      19876
Location                  19961
Activity                  19965
Name                      19631
Sex                       19986
Age                       22252
Injury                    19449
Fatal (Y/N)               19960
Time                      22775
Species                   22259
Investigator or Source    19438
pdf                       19421
href formula              19422
href                      19421
Case Number.1             19421
Case Number.2             19421
original order            19414
Unnamed: 22               25722
Unnamed: 23               25721
dtype: int64

In [7]:
#given the high number of NaNs it will be better to work with them before worrying about descriptive statistics...

In [8]:
#deleting all the rows exclusively composed by null values
attacks_1_1 = attacks.dropna(how="all")

In [9]:
#check again the number of NaNs
attacks_1_1.isna().sum()
#17020 rows contained only null values and were thus deleted


Case Number                  1
Date                      2401
Year                      2403
Type                      2405
Country                   2451
Area                      2856
Location                  2941
Activity                  2945
Name                      2611
Sex                       2966
Age                       5232
Injury                    2429
Fatal (Y/N)               2940
Time                      5755
Species                   5239
Investigator or Source    2418
pdf                       2401
href formula              2402
href                      2401
Case Number.1             2401
Case Number.2             2401
original order            2394
Unnamed: 22               8702
Unnamed: 23               8701
dtype: int64

In [10]:
#checking the shape of our new "cleaner" df
attacks_1_1.shape

(8703, 24)

In [11]:
#comparing the info provided by .shape and the sum of null given by .isna().sum()
#the columns "Unnamed: 22" & "Unnamed: 23" will be deleted as the contain basically null values
#which are useless and create a gap of such a mgnitude that is simply unfeasible to fill 

attacks_1_2 = attacks_1_1.drop(["Unnamed: 22" , "Unnamed: 23"], axis=1)


In [12]:
#our new df's columns
columns_attacks_1_2 = attacks_1_2.columns
columns_attacks_1_2

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order'],
      dtype='object')

In [13]:
#taking a look at a sample
attacks_1_2.sample(10)

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Fatal (Y/N),Time,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order
5802,1882.01.05.R,Reprted 05-Jan-1882,1882.0,Unprovoked,ATLANTIC OCEAN,,,Fell overboard from the Selim,male,M,...,Y,,,"Marion Star, 1/5/1882",1882.01.05.R-Selim.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1882.01.05.R,1882.01.05.R,501.0
1600,2005.04.06,06-Apr-2005,2005.0,Invalid,HONDURAS,Bay Islands,Utila,SCUBA Diving,female,F,...,,,Shark involvement not confirmed,"J. Engel, SRI & S. Fox, Deep Blue",2005.04.06.b-Utila.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2005.04.06,2005.04.06,4703.0
988,2010.09.07,07-Sep-2010,2010.0,Unprovoked,USA,Florida,"St. Augustine, St. John's County",Swimming,Jason Whitworth,M,...,N,,3' shark,"St. Augustine Record, 9/9/2010",2010.09.07-Whitworth.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2010.09.07,2010.09.07,5315.0
7040,0,,,,,,,,,,...,,,,,,,,,,
1361,2007.07.17.b,17-Jul-2007,2007.0,Invalid,USA,California,"Faria Beach, Ventura County",Swimming,Susan Levy,F,...,,11h00,Shark involvement not confirmed,R. Collier,2007.07.17.b-SusanLevy.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2007.07.17.b,2007.07.17.b,4942.0
1683,2004.03.31.b,31-Mar-2004,2004.0,Unprovoked,USA,Florida,"Ocean Reef Park, Singer Island, Palm Beach County",,Todd Rapp,M,...,N,,,"D. Davies, Palm Beach Post, 6/3/2004",2004.03.31.b-Rapp.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2004.03.31.b,2004.03.31.b,4620.0
6579,0,,,,,,,,,,...,,,,,,,,,,
1096,2009.08.29,29-Aug-2009,2009.0,Unprovoked,SOUTH AFRICA,Western Cape Province,Glentana,Surfing,Gerhard van Zyl,M,...,Y,15h30,White shark,"Cape Argus, 8/30/2009, p.1",2009.08.29-VanZyl.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,2009.08.29,2009.08.29,5207.0
6043,1847.03.11,11-Mar-1847,1847.0,Sea Disaster,AUSTRALIA,Queensland,Moreton Bay,Wreck of the Sovereign,Spicer,M,...,N,,,"The Queenslander, 3/12/1921",1847.03.11-Spicer.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1847.03.11,1847.03.11,260.0
7762,0,,,,,,,,,,...,,,,,,,,,,


# B. The Hypotheses to test

It has been decided that the hypotheses to test are the following:

- Hypothesis 1: Shark attacks' fatality rate has fallen over time
- Hypothesis 2: Shark attacks' fatality rate is negatively correlated with the level of development of the place where the attack takes place

# C. Reshaping and cleaning the dataset according to hypotheses to test

The formulated hypotheses imply that fatality will be our main variable of interest. Besides that, included variable which are to be inspected are those related to time and location. These are respectively "Date", "Year" "Time" plus those related to Case Number on the one hand, and "Area" "Location" and "Country" on the other.

In [14]:
#first of all, all the variable which are not of interest will be discarded
columns_to_discard = ['Type', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Time','Species ', 'Investigator or Source', 
                      'pdf', 'href formula', 'href','original order']

attacks_1_3= attacks_1_2.drop(columns_to_discard, axis=1)


In [15]:
#taking a look at this new refined df
print(attacks_1_3.shape)
attacks_1_3.sample(10)

(8703, 9)


Unnamed: 0,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
4800,1939.02.26.a,26-Feb-1939,1939.0,NEW ZEALAND,North Island,Matakana River Mouth,N,1939.02.26.a,1939.02.26.a
2853,1985.07.00,Mid Jul-1985 or mid Jul-1986,1985.0,ITALY,Sicily,Punta Secca,N,1985.07.00,1985.07.00
3930,1961.01.06.b,06-Jan-1961,1961.0,ATLANTIC OCEAN,9.35N 79.35W,"East of La Grande Island, North of Panama Canal",Y,1961.01.06.b,1961.01.06.b
3809,1962.07.07,07-Jul-1962,1962.0,USA,Georgia,"Jekyll Island, Glynn County",N,1962.07.07,1962.07.07
5118,1928.01.00,Jan-1928,1928.0,SOUTH AFRICA,Eastern Cape Province,Kei River mouth,N,1928.01.00,1928.01.00
6765,0,,,,,,,,
6094,1829.06.10,10-Jun-1829,1829.0,USA,New Jersey,"Sandy Hook Bay, Highlands, Monmouth County",Y,1829.06.10,1829.06.10
3154,1977.03.13.a,13-Mar-1977,1977.0,AUSTRALIA,Queensland,Near Moreton Island in Moreton Bay,Y,1977.03.13.a,1977.03.13.a
7912,0,,,,,,,,
4366,1953.09.03.R,Reported 03-Sep-1953,1953.0,USA,New York,Rockaway Beach,N,1953.09.03.R,1953.09.03.R


In [16]:
#resetting the df's index
attacks_1_3_1 = attacks_1_3.reset_index()
display(attacks_1_3_1) 

Unnamed: 0,index,Case Number,Date,Year,Country,Area,Location,Fatal (Y/N),Case Number.1,Case Number.2
0,0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.06.25,2018.06.25
1,1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.06.18,2018.06.18
2,2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.06.09,2018.06.09
3,3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.06.08,2018.06.08
4,4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.06.04,2018.06.04
...,...,...,...,...,...,...,...,...,...,...
8698,8698,0,,,,,,,,
8699,8699,0,,,,,,,,
8700,8700,0,,,,,,,,
8701,8701,0,,,,,,,,


In [17]:
#now lets delete all rows where the variable "Fatal (Y/N)" takes a null value, for that a specific function has been defined
# .isna().sum() is employed to check if rows including null values for the specified column have been indeed removed.
from src import row_nan_out

attacks_1_3_2 = row_nan_out(attacks_1_3_1,"Fatal (Y/N)")
print(attacks_1_3_2.isna().sum())


Se encontaron valores nulos a eliminar en los datos.
index              0
Case Number        1
Date               0
Year               1
Country           45
Area             404
Location         483
Fatal (Y/N)        0
Case Number.1      0
Case Number.2      0
dtype: int64


In [18]:
#resetting the index and removing redundant past indices
attacks_1_3_3 = attacks_1_3_2.reset_index()
attacks_1_3_4 = attacks_1_3_3.drop(["index", "level_0"], axis=1)

### C.1 Fatality rate across time

In [19]:
#the space in the column name "Case Number" is removed here asit caused problem further down the code
attacks_1_3_4 = attacks_1_3_4.rename(columns={"Case Number" : "CaseNumber"})

In [20]:
attacks_1_3_5 = attacks_1_3_4.copy()

#Comparing Case Number.1 and Case Number.2 as they seem to be largely the same...
#The rate of disagreement between the two is 0,29%, case number.2 is dropped as a consequence
#the column created for the comparisson is also deleted

attacks_1_3_5['Cases_Check'] = np.where((attacks_1_3_5["Case Number.1"] == attacks_1_3_5["Case Number.2"]), True , False)
print(attacks_1_3_5['Cases_Check'].value_counts())
attacks_1_3_5 = attacks_1_3_5.drop(["Case Number.2", "Cases_Check"], axis=1)

#the same comparisson is performed for Case Number.1 and Case Number
#with a rate of disagreement of 0,36% Case Number.1 is dropped from the dataframe too
#the column created for the comparisson is also deleted again

attacks_1_3_5['Cases_Check'] = np.where((attacks_1_3_5["Case Number.1"] == attacks_1_3_5["CaseNumber"]), True , False)
print(attacks_1_3_5['Cases_Check'].value_counts())
attacks_1_3_6 = attacks_1_3_5.drop(["Case Number.1", "Cases_Check"], axis=1)

display(attacks_1_3_6)

True     5746
False      17
Name: Cases_Check, dtype: int64
True     5742
False      21
Name: Cases_Check, dtype: int64


Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N)
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N
...,...,...,...,...,...,...,...
5758,ND.0005,Before 1903,0.0,AUSTRALIA,Western Australia,Roebuck Bay,Y
5759,ND.0004,Before 1903,0.0,AUSTRALIA,Western Australia,,Y
5760,ND.0003,1900-1905,0.0,USA,North Carolina,Ocracoke Inlet,Y
5761,ND.0002,1883-1889,0.0,PANAMA,,"Panama Bay 8ºN, 79ºW",Y


In [21]:
#The function first_year is applied to extract the substring containing year information 
#for values in columns Date & CaseNumber

from src import first_year
attacks_1_3_6["Case_Year"] = attacks_1_3_6.CaseNumber.apply(first_year)
attacks_1_3_6["Date_Year"] = attacks_1_3_6.Date.apply(first_year)
display(attacks_1_3_6)


Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N),Case_Year,Date_Year
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018,2018
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018,2018
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018,2018
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018,2018
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018,2018
...,...,...,...,...,...,...,...,...,...
5758,ND.0005,Before 1903,0.0,AUSTRALIA,Western Australia,Roebuck Bay,Y,0005,1903
5759,ND.0004,Before 1903,0.0,AUSTRALIA,Western Australia,,Y,0004,1903
5760,ND.0003,1900-1905,0.0,USA,North Carolina,Ocracoke Inlet,Y,0003,1900
5761,ND.0002,1883-1889,0.0,PANAMA,,"Panama Bay 8ºN, 79ºW",Y,0002,1883


In [22]:
#analyzing descriptive statistics for Year Variables in order to decide which range of years to focus on
#with at least 75% of all year observations in the sample for the three variables which inform about years taking place
#after 1945, this will be selected as starting point for the sample. Besides this, 1945 is also an interesting point 
#for historical and economic reasons.
#1465 observations have been consequently discarded


attacks_1_3_6 = attacks_1_3_6.astype({"Case_Year": float})
attacks_1_3_6 = attacks_1_3_6.astype({"Date_Year": float})
print(attacks_1_3_6["Date_Year"].describe())
print(attacks_1_3_6["Case_Year"].describe())
print(attacks_1_3_6["Year"].describe())

attacks_1_3_7 = attacks_1_3_6[attacks_1_3_6.Date_Year > 1945]
attacks_1_3_7 = attacks_1_3_7[attacks_1_3_7.Case_Year > 1945]
attacks_1_3_7 = attacks_1_3_7[attacks_1_3_7.Year > 1945]
attacks_1_3_7.shape


count    5744.000000
mean     1967.455084
std        48.007509
min      1543.000000
25%      1945.000000
50%      1979.000000
75%      2005.000000
max      2018.000000
Name: Date_Year, dtype: float64
count    5762.000000
mean     1926.352135
std       281.706297
min         0.000000
25%      1942.000000
50%      1978.000000
75%      2005.000000
max      2018.000000
Name: Case_Year, dtype: float64
count    5762.000000
mean     1924.575842
std       292.236459
min         0.000000
25%      1942.000000
50%      1978.000000
75%      2005.000000
max      2018.000000
Name: Year, dtype: float64


(4240, 9)

In [23]:
#CHECKING:
#Number of nans in df: none of them is in th year related variables so the present nans are not a cause of concern for now
#Number of discrepancies between Year related columns (max discrepancy rate = 0,23%)
#Lastly the unique values of year related columns are printed for a visual inspection 

print(attacks_1_3_7.isna().sum())

attacks_1_3_7['Cases_Check_1'] = np.where((attacks_1_3_7["Case_Year"] == attacks_1_3_7["Date_Year"]), True , False)
print(attacks_1_3_7['Cases_Check_1'].value_counts())
attacks_1_3_7['Cases_Check_2'] = np.where((attacks_1_3_7["Case_Year"] == attacks_1_3_7["Year"]), True , False)
print(attacks_1_3_7['Cases_Check_2'].value_counts())
attacks_1_3_7['Cases_Check_3'] = np.where((attacks_1_3_7["Date_Year"] == attacks_1_3_7["Year"]), True , False)
print(attacks_1_3_7['Cases_Check_3'].value_counts())

print(attacks_1_3_7.Case_Year.unique())
print(attacks_1_3_7.Date_Year.unique())
print(attacks_1_3_7.Year.unique())

CaseNumber       0
Date             0
Year             0
Country         12
Area           199
Location       247
Fatal (Y/N)      0
Case_Year        0
Date_Year        0
dtype: int64
True     4231
False       9
Name: Cases_Check_1, dtype: int64
True     4239
False       1
Name: Cases_Check_2, dtype: int64
True     4230
False      10
Name: Cases_Check_3, dtype: int64
[2018. 2017. 2016. 2015. 2014. 2013. 2012. 2011. 2010. 2009. 2008. 2007.
 2006. 2005. 2004. 2003. 2002. 2001. 2000. 1999. 1998. 1997. 1996. 1995.
 1984. 1994. 1993. 1992. 1991. 1990. 1989. 1988. 1987. 1986. 1985. 1983.
 1982. 1981. 1980. 1979. 1978. 1977. 1976. 1975. 1974. 1973. 1972. 1971.
 1970. 1969. 1968. 1967. 1966. 1965. 1964. 1963. 1962. 1961. 1960. 1959.
 1958. 1957. 1956. 1955. 1954. 1953. 1952. 1951. 1950. 1949. 1948. 1947.
 1946.]
[2018. 2017. 2016. 2015. 2014. 2013. 2012. 2011. 2008. 2010. 2009. 2006.
 2007. 2005. 2004. 2003. 2002. 2001. 2000. 1999. 1998. 1997. 1996. 1995.
 1984. 1994. 1993. 1992. 1991. 1990. 1

In [24]:
#row where year related values do not coincide are dropped aiming at 0% discrpenacy ratio and thus maximum accuracy in
#year measurement

attacks_1_3_8 = attacks_1_3_7[attacks_1_3_7["Date_Year"] == attacks_1_3_7["Year"]]

attacks_1_3_8['Cases_Check_1'] = np.where((attacks_1_3_8["Case_Year"] == attacks_1_3_8["Date_Year"]), True , False)
print(attacks_1_3_8['Cases_Check_1'].value_counts())
attacks_1_3_8['Cases_Check_2'] = np.where((attacks_1_3_8["Case_Year"] == attacks_1_3_8["Year"]), True , False)
print(attacks_1_3_8['Cases_Check_2'].value_counts())
attacks_1_3_8['Cases_Check_3'] = np.where((attacks_1_3_8["Date_Year"] == attacks_1_3_8["Year"]), True , False)
print(attacks_1_3_8['Cases_Check_3'].value_counts())

#after checking that the discrepancy rate is effectively down to 0% all redundant ando/or innecesary columns are dropped

display(attacks_1_3_8)
attacks_1_3_9 = attacks_1_3_8.drop(["CaseNumber", "Date", "Case_Year", "Date_Year", 
                                    "Cases_Check_1", "Cases_Check_3", "Cases_Check_2"], axis=1)

True    4230
Name: Cases_Check_1, dtype: int64
True    4230
Name: Cases_Check_2, dtype: int64
True    4230
Name: Cases_Check_3, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks_1_3_8['Cases_Check_1'] = np.where((attacks_1_3_8["Case_Year"] == attacks_1_3_8["Date_Year"]), True , False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  attacks_1_3_8['Cases_Check_2'] = np.where((attacks_1_3_8["Case_Year"] == attacks_1_3_8["Year"]), True , False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a

Unnamed: 0,CaseNumber,Date,Year,Country,Area,Location,Fatal (Y/N),Case_Year,Date_Year,Cases_Check_1,Cases_Check_2,Cases_Check_3
0,2018.06.25,25-Jun-2018,2018.0,USA,California,"Oceanside, San Diego County",N,2018.0,2018.0,True,True,True
1,2018.06.18,18-Jun-2018,2018.0,USA,Georgia,"St. Simon Island, Glynn County",N,2018.0,2018.0,True,True,True
2,2018.06.09,09-Jun-2018,2018.0,USA,Hawaii,"Habush, Oahu",N,2018.0,2018.0,True,True,True
3,2018.06.08,08-Jun-2018,2018.0,AUSTRALIA,New South Wales,Arrawarra Headland,N,2018.0,2018.0,True,True,True
4,2018.06.04,04-Jun-2018,2018.0,MEXICO,Colima,La Ticla,N,2018.0,2018.0,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
4236,1946.01.05,05-Jan-1946,1946.0,AUSTRALIA,New South Wales,"Oatley Bay near Como, Georges River",Y,1946.0,1946.0,True,True,True
4237,1946.01.01,01-Jan-1946,1946.0,SOUTH AFRICA,Eastern Cape Province,"Pollock Beach, Port Elizabeth",N,1946.0,1946.0,True,True,True
4238,1946.00.00.c,1946,1946.0,SOUTH AFRICA,Western Cape Province,Plettenberg Bay,N,1946.0,1946.0,True,True,True
4239,1946.00.00.b,1946,1946.0,SOUTH AFRICA,Western Cape Province,Table Bay,N,1946.0,1946.0,True,True,True


In [29]:
#changing "Year" (Now our main variable for measuring time) to integer to better suit the nature of the variable
#and then displaying verison 1_4 of the data frame for visual inspection.
attacks_1_4 = attacks_1_3_9.astype({"Year": int})
attacks_1_4 = attacks_1_4.reset_index()
attacks_1_4 = attacks_1_4.drop(["index"], axis=1)
display(attacks_1_4)
attacks_1_4.shape

Unnamed: 0,Year,Country,Area,Location,Fatal (Y/N)
0,2018,USA,California,"Oceanside, San Diego County",N
1,2018,USA,Georgia,"St. Simon Island, Glynn County",N
2,2018,USA,Hawaii,"Habush, Oahu",N
3,2018,AUSTRALIA,New South Wales,Arrawarra Headland,N
4,2018,MEXICO,Colima,La Ticla,N
...,...,...,...,...,...
4225,1946,AUSTRALIA,New South Wales,"Oatley Bay near Como, Georges River",Y
4226,1946,SOUTH AFRICA,Eastern Cape Province,"Pollock Beach, Port Elizabeth",N
4227,1946,SOUTH AFRICA,Western Cape Province,Plettenberg Bay,N
4228,1946,SOUTH AFRICA,Western Cape Province,Table Bay,N


(4230, 5)

In [33]:
#preparing to clean "Fatal (Y/N)" column
print(attacks_1_4["Fatal (Y/N)"].unique())
print(attacks_1_4["Fatal (Y/N)"].value_counts())

attacks_1_4 = attacks_1_4.drop(["index"], axis=1)


['N' 'Y' 'M' 'UNKNOWN' '2017' ' N']
N          3515
Y           662
UNKNOWN      44
 N            7
M             1
2017          1
Name: Fatal (Y/N), dtype: int64


In [35]:
#Values like for "Fatal (Y/N)" which offer no clue of what could be their true value between Y/N are discarded 
#(i.e. "Unknown" & "2017"). Then Values whose input is not "N" or "Y" but who arguably could be related to one of those
#are substituted (i.e. ' N' & 'M'). Note: 'M' is located right next to 'N' in the qwerty keyboard which makes pretty
#likely that its true intended value is 'N' but this eas not the case due to a typo at the time of inputting the data.

attacks_1_4_1 = attacks_1_4[attacks_1_4["Fatal (Y/N)"] != "UNKNOWN"]
attacks_1_4_1 = attacks_1_4_1[attacks_1_4_1["Fatal (Y/N)"] != "2017"]
attacks_1_4_1['Fatal (Y/N)'] = attacks_1_4_1['Fatal (Y/N)'].replace([' N', 'M'],'N')
print(attacks_1_4_1["Fatal (Y/N)"].unique())
print(attacks_1_4_1["Fatal (Y/N)"].value_counts())

['N' 'Y']
N    3523
Y     662
Name: Fatal (Y/N), dtype: int64


Int64Index([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,
            ...
            4220, 4221, 4222, 4223, 4224, 4225, 4226, 4227, 4228, 4229],
           dtype='int64', length=4185)

### Z. Ideas, experiemntos e información variada (En Español)

In [None]:
#df version index and description:

#attacks -> Raw data
#attacks_1_1 -> Rows where all values are Nulls are deletes
#attacks_1_2 -> Columns "Unnamed: 22" & "Unnamed: 23" deleted
#attacks_1_3 -> Columns which are not of interest to the hypotheses test are discarded
    #attacks_1_3_1 -> equal to attacks_1_3 but with resetted index column
    #attacks_1_3_2 -> rows for which the values of column "Fatal (Y/N)" is null are removed
    #attacks_1_3_3 -> bridge version between 1_3_2 & 1_3_4
    #attacks_1_3_4 -> version 1_3_2 with resetted index
    #attacks_1_3_5 -> version 1_3_4 but with modified a column name
    #attacks_1_3_6 -> version 1_3_5 without columns Case Number.1 & Case Number.2 as they have been identified as redundant
    #attacks_1_3_7 -> All observations for which the column Date_Year value is lower than 1945 have been discarded.
    #attacks_1_3_8 -> Discrepancies between Year - related columns eliminated
    #attacks_1_3_9 -> Given the lack of discrepancies on year related columns, all redundant ones and check 
                    #associated values are eliminated
#attacks_1_4 -> Time related values are clean and triple-checked for maximun precision within the given posibilities.
    #attacks_1_4_1 -> Column "Fatal (Y/N)" has been cleaned
                    
 

In [None]:
#nombres "ocupados"

#columns_to_discard

In [None]:
#sites of interest

#https://www.sharkattackfile.net/species.htm
#https://es.wikipedia.org/wiki/Selachimorpha
#https://es.wikipedia.org/wiki/Ataque_de_tibur%C3%B3n