In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the raw attacks table; the file is cp1252-encoded.
df = pd.read_csv("data/attacks.csv", encoding="cp1252")

In [3]:
df.shape  # (rows, columns) of the table exactly as loaded

(25723, 24)

In [4]:
df.info()  # per-column non-null counts and dtypes; shows how sparse the raw table is

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

### Dropping all duplicated and empty rows

In [5]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [6]:
df.shape  # (rows, columns) after removing duplicated rows

(6312, 24)

In [7]:
# Discard rows in which every column is NaN.
df = df.dropna(how="all")

In [8]:
df.shape  # (rows, columns) after removing the all-NaN rows

(6311, 24)

In [9]:
df.sample()  # peek at one random row; no random_state is set, so the row differs on every run

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
3582,1966.04.08,08-Apr-1966,1966.0,Unprovoked,USA,Puerto Rico,,Swimming,John Seaver,M,...,,H.D. Baldridge (1994) SAF Case #1483,1966.04.08-NV-Seaver.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1966.04.08,1966.04.08,2721.0,,


### Eliminating all the columns that are not useful for the hypothesis, and the rows that lack the necessary data

In [10]:
list(df.columns)  # exact column labels; note that 'Sex ' and 'Species ' carry a trailing space

['Case Number',
 'Date',
 'Year',
 'Type',
 'Country',
 'Area',
 'Location',
 'Activity',
 'Name',
 'Sex ',
 'Age',
 'Injury',
 'Fatal (Y/N)',
 'Time',
 'Species ',
 'Investigator or Source',
 'pdf',
 'href formula',
 'href',
 'Case Number.1',
 'Case Number.2',
 'original order',
 'Unnamed: 22',
 'Unnamed: 23']

In [11]:
# Columns that are not needed for the analysis. The labels are copied verbatim
# from df.columns, including the trailing space in 'Sex ' and 'Species '.
unused_columns = [
    'Case Number',
    'Type',
    'Activity',
    'Name',
    'Sex ',
    'Age',
    'Injury',
    'Fatal (Y/N)',
    'Time',
    'Species ',
    'Investigator or Source',
    'pdf',
    'href formula',
    'href',
    'Case Number.1',
    'Case Number.2',
    'Unnamed: 22',
    'Unnamed: 23',
    'original order',
]
df = df.drop(columns=unused_columns)

### Deleting all the incidents that happened before 1946, when the World Tourism Organization was created

In [12]:
# Keep only incidents from 1946 onwards. Rows whose Year is NaN fail the
# comparison and are therefore removed as well.
# .copy() turns the filtered selection into an independent frame, so the
# column assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df["Year"] >= 1946].copy()

In [13]:
# Year is stored as float64 (e.g. 2010.0); cast it straight to int to get a
# proper year. Going through str and rstrip('.0') is wrong: rstrip removes any
# trailing '.' and '0' characters, not the literal suffix ".0", so
# "2010.0" became 201 and "2000.0" became 2.
# Requires Year to contain no NaN; the "Year >= 1946" filter already drops them.
df['Year'] = df['Year'].astype('int')

In [14]:
# Discard the rows whose Country is missing.
df = df.dropna(subset=['Country'])

In [15]:
# Lower-case the country names so they can be compared with the reference
# list of countries later. The vectorised .str accessor replaces the
# per-element lambda.
df['Country'] = df['Country'].str.lower()

In [16]:
# Trim leading/trailing whitespace, store the result, then list the distinct
# country names to spot spellings that still need normalising.
stripped_country = df['Country'].str.strip()
df['Country'] = stripped_country
stripped_country.unique()

array(['usa', 'australia', 'mexico', 'brazil', 'england', 'south africa',
       'thailand', 'costa rica', 'maldives', 'bahamas', 'new caledonia',
       'ecuador', 'malaysia', 'libya', 'cuba', 'mauritius', 'new zealand',
       'spain', 'samoa', 'solomon islands', 'japan', 'egypt',
       'st helena, british overseas territory', 'comoros', 'reunion',
       'french polynesia', 'united kingdom', 'united arab emirates',
       'philippines', 'indonesia', 'china', 'columbia', 'cape verde',
       'fiji', 'dominican republic', 'cayman islands', 'aruba',
       'mozambique', 'puerto rico', 'italy', 'atlantic ocean', 'greece',
       'st. martin', 'france', 'papua new guinea', 'trinidad & tobago',
       'kiribati', 'israel', 'diego garcia', 'taiwan', 'jamaica',
       'palestinian territories', 'guam', 'seychelles', 'belize',
       'nigeria', 'tonga', 'scotland', 'canada', 'croatia',
       'saudi arabia', 'chile', 'antigua', 'kenya', 'russia',
       'turks & caicos', 'united arab emirat

In [17]:
# Normalise a few country spellings in one pass (exact-value matches only).
country_aliases = {
    "usa": 'united states',
    "sudan?": 'sudan',
    "united arab emirates (uae)": 'united arab emirates',
}
df['Country'] = df['Country'].replace(country_aliases)

In [31]:
# Use the country name as the row index.
df = df.set_index('Country')

In [32]:
df.shape  # (rows, columns) of the cleaned table, now indexed by Country

(4598, 4)

### Creating a list of the countries of the world, separating them between the northern and southern hemispheres

- can be created as a function

In [19]:
# Load the reference list of countries and discard every row that has a
# missing value in any column (one entry has an empty latitude).
l_country = pd.read_csv('data/list_country.csv').dropna(how='any')

In [20]:
# Label each country by hemisphere: a strictly positive latitude is "north",
# everything else (zero or negative) is "south". np.where evaluates the whole
# column at once instead of calling a lambda per row.
l_country["hemisphere"] = np.where(l_country["latitude"] > 0, "north", "south")
# Lower-case the names so they match the lower-cased countries in the attacks table.
l_country['name'] = l_country['name'].str.lower()
l_country

Unnamed: 0,latitude,longitude,name,hemisphere
0,42.546245,1.601554,andorra,north
1,23.424076,53.847818,united arab emirates,north
2,33.939110,67.709953,afghanistan,north
3,17.060816,-61.796428,antigua and barbuda,north
4,18.220554,-63.068615,anguilla,north
...,...,...,...,...
240,15.552727,48.516388,yemen,north
241,-12.827500,45.166244,mayotte,south
242,-30.559482,22.937506,south africa,south
243,-13.133897,27.849332,zambia,south


In [21]:
# The coordinates are no longer needed once the hemisphere has been derived.
l_country = l_country.drop(columns=['latitude', 'longitude'])


In [29]:
# Use the country name as the row index.
l_country = l_country.set_index('name')

In [30]:
l_country  # display the lookup table: country name (index) -> hemisphere

Unnamed: 0_level_0,hemisphere
name,Unnamed: 1_level_1
andorra,north
united arab emirates,north
afghanistan,north
antigua and barbuda,north
anguilla,north
...,...
yemen,north
mayotte,south
south africa,south
zambia,south


### Comparing the df's

- Comparing both lists reveals some discrepancies in how the countries are named, so we are going to adjust as many country names as possible so that both lists match.

In [34]:
# Print every country from the attacks table that also exists in the reference
# list. Both frames are indexed by lower-cased country name at this point.
# Iterating a DataFrame directly (or using `in` on it) walks the *column*
# labels, so the row indexes have to be compared explicitly. The original loop
# also printed an undefined name `i`.
for country in df.index.unique():
    if country in l_country.index:
        print(country)