## Checking if attacks happen more in summer than other seasons

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the raw attack records; the file is decoded as cp1252.
attacks_csv = "data/attacks.csv"
df = pd.read_csv(attacks_csv, encoding="cp1252")

In [3]:
# (rows, columns) of the raw file as loaded
df.shape

(25723, 24)

In [4]:
df.info() # per-column non-null counts and dtypes of the raw table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

### Dropping all duplicated and empty rows

In [5]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [6]:
# (rows, columns) after removing duplicated rows
df.shape

(6312, 24)

In [7]:
# Discard rows in which every single column is NaN.
df = df.dropna(how="all")

In [8]:
# (rows, columns) after removing the rows that were entirely NaN
df.shape

(6311, 24)

In [9]:
# Show one randomly picked row; no random_state is passed, so the row differs between runs
df.sample()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
5863,1877.00.00,Before 1878,1877.0,Unprovoked,INDIA,Hoogly River,Near Calcutta,,Indian,M,...,,"J. Fayrer, M.D. cited in F. Day, The Fishes of...",1877.00.00-Calcutta.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1877.00.00,1877.00.00,440.0,,


### Eliminating all the columns that are not useful for the hypothesis and the rows that lack the necessary data

In [10]:
# List the column labels; note that 'Sex ' and 'Species ' carry a trailing space
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [11]:
# Columns that play no part in the season hypothesis; everything not listed
# here is kept.
unused_columns = [
    'Case Number', 'Type', 'Activity', 'Name', 'Sex ', 'Age', 'Injury',
    'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf',
    'href formula', 'href', 'Case Number.1', 'Case Number.2',
    'Unnamed: 22', 'Unnamed: 23', 'original order', 'Area', 'Location',
]
df = df.drop(columns=unused_columns)

In [12]:
# Preview the first rows of the columns that remain (Date, Year, Country)
df.head()

Unnamed: 0,Date,Year,Country
0,25-Jun-2018,2018.0,USA
1,18-Jun-2018,2018.0,USA
2,09-Jun-2018,2018.0,USA
3,08-Jun-2018,2018.0,AUSTRALIA
4,04-Jun-2018,2018.0,MEXICO


### Deleting all the incidents that happened before 1946, when the World Tourism Organization was created

In [13]:
# Keep only incidents from 1946 onwards. Rows whose Year is NaN fail the
# comparison and are dropped as well.
# The explicit .copy() makes the result an independent frame, so the column
# assignments and in-place operations in the following cells do not act on a
# slice of the original (which raises SettingWithCopyWarning).
df = df[df["Year"] >= 1946].copy()

In [14]:
# Convert Year from float (e.g. 2018.0) to a plain integer year.
# The previous string round-trip used rstrip('.0'), which strips every trailing
# '.' and '0' character rather than the literal suffix ".0": "2010.0" became
# "201" and "2000.0" became "2", corrupting every year that ends in 0.
# A direct cast is safe here because rows with a missing Year were already
# removed by the >= 1946 filter.
df = df.astype({'Year': 'int'})

In [15]:
# Rows without a country cannot be assigned to a hemisphere, so discard them.
df = df.dropna(subset=["Country"])

In [16]:
# Lowercase the country names so they can be compared with the country list later.
df["Country"] = df["Country"].apply(str.lower)

In [17]:
# Trim leading/trailing whitespace from the country names.
country_trimmed = df["Country"].str.strip()
df["Country"] = country_trimmed

In [18]:
# Map the spellings used in the attack data onto the names used by the
# country list (exact, whole-value matches only).
country_aliases = {
    "usa": 'united states',
    "sudan?": 'sudan',
    "united arab emirates (uae)": 'united arab emirates',
}
df["Country"] = df["Country"].replace(country_aliases)

In [19]:
# (rows, columns) after the year filter and the country clean-up
df.shape

(4598, 3)

### Creating a list of the countries of the world, separating them into the northern and southern hemispheres

- This step could be extracted into a function

In [20]:
# Load the reference list of countries and discard every row that has a
# missing value in any column (one entry has no latitude).
l_country = pd.read_csv('data/list_country.csv').dropna(how="any")

In [21]:
# Label each country by hemisphere: strictly positive latitude is "north",
# everything else (including exactly 0) is "south".
l_country["hemisphere"] = [
    "north" if lat > 0 else "south" for lat in l_country["latitude"]
]
# Lowercase the names so they line up with the lowercased attack data.
l_country["name"] = l_country["name"].apply(str.lower)

In [22]:
# The coordinates are no longer needed once the hemisphere label exists.
l_country = l_country.drop(columns=["latitude", "longitude"])


In [23]:
# Rename "name" to "Country" so the column matches the attack data.
# DataFrame.rename returns a new frame; without assigning it back the rename
# was only displayed and l_country kept its "name" column.
l_country = l_country.rename(columns={"name": "Country"})
l_country

Unnamed: 0,Country,hemisphere
0,andorra,north
1,united arab emirates,north
2,afghanistan,north
3,antigua and barbuda,north
4,anguilla,north
...,...,...
240,yemen,north
241,mayotte,south
242,south africa,south
243,zambia,south


### Comparing the df's

- Comparing both lists shows some discrepancies in how the countries are named, so we are going to apply some changes so that as many countries as possible match between the two lists.

### Creating the seasons according to the hemisphere

In [24]:
# Display the cleaned attack data (Date, Year, Country); the rendered output is truncated to the first and last rows
df

Unnamed: 0,Date,Year,Country
0,25-Jun-2018,2018,united states
1,18-Jun-2018,2018,united states
2,09-Jun-2018,2018,united states
3,08-Jun-2018,2018,australia
4,04-Jun-2018,2018,mexico
...,...,...,...
4610,05-Jan-1946,1946,australia
4611,01-Jan-1946,1946,south africa
4612,1946,1946,south africa
4613,1946,1946,south africa
