## Checking if attacks happen more in summer than other seasons

In [1]:
import numpy as np
import pandas as pd

In [2]:
df= pd.read_csv("data/attacks.csv", encoding='cp1252')

In [3]:
df.shape

(25723, 24)

In [4]:
df.info() #check out the info of the table

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25723 entries, 0 to 25722
Data columns (total 24 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Case Number             8702 non-null   object 
 1   Date                    6302 non-null   object 
 2   Year                    6300 non-null   float64
 3   Type                    6298 non-null   object 
 4   Country                 6252 non-null   object 
 5   Area                    5847 non-null   object 
 6   Location                5762 non-null   object 
 7   Activity                5758 non-null   object 
 8   Name                    6092 non-null   object 
 9   Sex                     5737 non-null   object 
 10  Age                     3471 non-null   object 
 11  Injury                  6274 non-null   object 
 12  Fatal (Y/N)             5763 non-null   object 
 13  Time                    2948 non-null   object 
 14  Species                 3464 non-null 

### dropping all duplicated and empty rows
- create a fucntion called general cleaning

In [5]:
df.drop_duplicates(inplace= True) #remove all the values duplicated

In [6]:
df.shape

(6312, 24)

In [7]:
df.dropna(how= 'all',inplace= True) #removed all the rows which all the values were NaN.

In [8]:
df.shape

(6311, 24)

In [9]:
df.sample()

Unnamed: 0,Case Number,Date,Year,Type,Country,Area,Location,Activity,Name,Sex,...,Species,Investigator or Source,pdf,href formula,href,Case Number.1,Case Number.2,original order,Unnamed: 22,Unnamed: 23
3851,1962.01.11.a,11-Jan-1962,1962.0,Unprovoked,NEW ZEALAND,South Island,"Fairdown Beach, 5 miles north of Westport",Surf fishing,Mrs. Beryl Grant,F,...,"36"" shark","R. D. Weeks, GSAF; Dr. C. Foote; The Evening P...",1962.01.11.a-BerylGrant.pdf,http://sharkattackfile.net/spreadsheets/pdf_di...,http://sharkattackfile.net/spreadsheets/pdf_di...,1962.01.11.a,1962.01.11.a,2452.0,,


### Eliminating all the columns are not useful for the hypothesis and rows without the all data it's necesary 
- cleaning data not useful

In [10]:
df.columns

Index(['Case Number', 'Date', 'Year', 'Type', 'Country', 'Area', 'Location',
       'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time',
       'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href',
       'Case Number.1', 'Case Number.2', 'original order', 'Unnamed: 22',
       'Unnamed: 23'],
      dtype='object')

In [11]:
df.drop (labels= ['Case Number', 'Type', 'Activity', 'Name', 'Sex ', 'Age', 'Injury', 'Fatal (Y/N)', 'Time', 'Species ', 'Investigator or Source', 'pdf', 'href formula', 'href', 'Case Number.1', 'Case Number.2', 'Unnamed: 22', 'Unnamed: 23', 'original order', 'Area', 'Location'], axis= 1, inplace= True)

In [12]:
df.head()

Unnamed: 0,Date,Year,Country
0,25-Jun-2018,2018.0,USA
1,18-Jun-2018,2018.0,USA
2,09-Jun-2018,2018.0,USA
3,08-Jun-2018,2018.0,AUSTRALIA
4,04-Jun-2018,2018.0,MEXICO


### Deleting all the incidents happened before 1946 when the World Tourism Organization was created

In [13]:
df=df[df["Year"]>=1946] #remove the row before the year 

In [14]:
df['Year'] = df['Year'].astype('str') # remove the .0 format to have a proper year
df['Year'] = df['Year'].map(lambda x: x.rstrip('.0'))
df['Year'] = df['Year'].astype('int')

In [15]:
df.dropna(subset = ['Country'], inplace=True) # remove all the empty values for country

In [16]:
df['Country'] = df['Country'].apply(lambda x: x.lower())#lowercase to compare later with a list of countries

In [17]:
df['Country'] =df['Country'].str.strip()

In [18]:
df['Country'] = df['Country'].replace(["usa"], 'united states')
df['Country'] = df['Country'].replace(["sudan?"], 'sudan')
df['Country'] = df['Country'].replace(["united arab emirates (uae)"], 'united arab emirates')


### Creating a list of countries of the world separing them in between the north and south

- can be created as a function

In [19]:
l_country= pd.read_csv('data/list_country.csv') #we can see there's a latitude empty, so we remove it
l_country.dropna(how= 'any',inplace= True)

In [20]:
l_country["hemisphere"]=l_country["latitude"].apply(lambda x : "north" if x>0 else "south")
l_country['name'] = l_country['name'].apply(lambda x: x.lower())

In [21]:
l_country.drop (labels= ['latitude','longitude'], axis= 1, inplace= True)

In [22]:
l_country= l_country[l_country.hemisphere == "south"]
south_list= l_country['name'].values.tolist()

In [23]:
l_country.head()

Unnamed: 0,name,hemisphere
8,angola,south
9,antarctica,south
10,argentina,south
11,american samoa,south
13,australia,south


In [24]:
l_country.reset_index(drop=True)

Unnamed: 0,name,hemisphere
0,angola,south
1,antarctica,south
2,argentina,south
3,american samoa,south
4,australia,south
...,...,...
58,samoa,south
59,mayotte,south
60,south africa,south
61,zambia,south


### Comparing the df's

- Checking both lists can find there are some discrepancies how the countries are described. so, we gonna implement some changes to adjust as much as many countries in order to match both lists

In [25]:
ls_country_df= df['Country'].values.tolist()

In [26]:
df.set_index(['Country'], inplace=True) 

In [27]:
lst_n_s=[]
for i in ls_country_df:
    if i in south_list:
        lst_n_s.append("south")
    else:
        lst_n_s.append("north")

In [28]:
dict_={"hemisphere":lst_n_s, "Country": ls_country_df}

In [29]:
df_hemisphere=pd.DataFrame(dict_)

In [30]:
df_hemisphere.set_index(['Country'], inplace=True) 

In [31]:
df_concat = pd.concat([df, df_hemisphere], axis=1)

In [32]:
df_concat

Unnamed: 0_level_0,Date,Year,hemisphere
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
united states,25-Jun-2018,2018,north
united states,18-Jun-2018,2018,north
united states,09-Jun-2018,2018,north
australia,08-Jun-2018,2018,south
mexico,04-Jun-2018,2018,north
...,...,...,...
australia,05-Jan-1946,1946,south
south africa,01-Jan-1946,1946,south
south africa,1946,1946,south
south africa,1946,1946,south


### Creating the seasons according to the hemisphere

In [33]:
# regex to check
year_dict={
    1 : "jan",
    2 : "feb",
    3 : "mar",
    4 : "apr",
    5 : "may",
    6 : "jun",
    7 : "jul",
    8 : "aug",
    9 : "sept",
    10: "oct",
    11: "nov",
    12: "dec"
}

In [34]:
import re
df_concat.Date.replace('(\d)', (''), regex= True, inplace= True)

In [35]:
df_concat.Date.str.strip()

Country
united states    -Jun-
united states    -Jun-
united states    -Jun-
australia        -Jun-
mexico           -Jun-
                 ...  
australia        -Jan-
south africa     -Jan-
south africa          
south africa          
persian gulf          
Name: Date, Length: 4598, dtype: object

In [41]:
#df_concat.Date.replace('[^a-z]', (''), regex= True, inplace= False)

In [42]:
df_concat['Date'].str.lower()

Country
united states    -jun-
united states    -jun-
united states    -jun-
australia        -jun-
mexico           -jun-
                 ...  
australia        -jan-
south africa     -jan-
south africa          
south africa          
persian gulf          
Name: Date, Length: 4598, dtype: object

In [46]:
df_concat.Date=df_concat.Date.re.findall('[a-z]{3}')

AttributeError: 'Series' object has no attribute 're'

In [39]:
def find_season(month, hemisphere):
    if hemisphere == 'south':
        season_month_south = {
            12:'Summer', 1:'Summer', 2:'Summer',
            3:'Autumn', 4:'Autumn', 5:'Autumn',
            6:'Winter', 7:'Winter', 8:'Winter',
            9:'Spring', 10:'Spring', 11:'Spring'}
        return season_month_south.get(month)
        
    elif hemisphere == 'north':
        season_month_north = {
            12:'Winter', 1:'Winter', 2:'Winter',
            3:'Spring', 4:'Spring', 5:'Spring',
            6:'Summer', 7:'Summer', 8:'Summer',
            9:'Autumn', 10:'Autumn', 11:'Autumn'}
        return season_month_north.get(month)