In [34]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from sklearn.preprocessing import LabelEncoder  
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

pd.options.display.max_columns = None

In [29]:
df = pd.read_csv("attacks_limpiando_con_coordenadas.csv", index_col = 0)
df.head()

Unnamed: 0,year,type,country,age,species_,fecha_limpia,fatal,sex,latitud,longitud
0,2018,Boating,usa,57.0,White shark,Jun,N,F,39.7837304,-100.445882
1,2018,Unprovoked,usa,11.0,Unespecific,Jun,N,F,39.7837304,-100.445882
2,2018,Invalid,usa,48.0,Unespecific,Jun,N,M,39.7837304,-100.445882
3,2018,Unprovoked,australia,27.0,Unespecific,Jun,N,M,-24.7761086,134.755
4,2018,Provoked,mexico,27.0,Tiger shark,Jun,N,M,23.6585116,-102.0077097


EJERCICIO 1: Lo primero que debemos hacer es identificar cuáles de nuestras columnas es categóricas. Extraed el nombre de las columnas que contienen variables categóricas.

In [30]:
df_categoricas = df.select_dtypes(include=[object])
df_categoricas.tail(2)

Unnamed: 0,type,country,species_,fecha_limpia,fatal,sex,latitud,longitud
6216,Unprovoked,panama,Unespecific,Unknown,Y,M,8.559559,-81.1308434
6217,Unprovoked,ceylon (sri lanka),Unespecific,Unknown,Y,M,7.5554942,80.7137847


In [47]:
df['latitud'].unique()

array(['39.7837304', '-24.7761086', '23.6585116', '-10.3333333',
       '52.5310214', '-28.8166236', '14.8971921', '10.2735633',
       '4.7064352', '24.7736546', '-21.3019905', '-1.3397668',
       '4.5693754', '26.8234472', '25.029422', '23.0131338',
       '-20.2759451', '-41.5000831', '39.3260685', '-13.7693895',
       '-8.7053941', '36.5748441', '26.2540493', 'n', '-12.2045176',
       '-21.130737949999997', '-16.03442485', '54.7023545', '24.0002488',
       '12.7503486', '-2.4833826', '35.000074', '4.099917', '16.0000552',
       '-18.1239696', '19.0974031', '19.703182249999998', '12.51756625',
       '-19.302233', '18.2214149', '42.6384261', '13.581921',
       '38.9953683', '48.3500473', '46.603354', '-5.6816069',
       '10.8677845', '0.3448612', '31.5313113', '-7.338358449999999',
       '23.9739374', '18.1850507', '31.462420950000002',
       '13.450125700000001', '-4.6574977', '16.8259793', '9.6000359',
       '-19.9160819', '56.7861112', '61.0666922', '45.5643442',
      

EJERCICIO 2: Si nos fijamos en la columna de country es my complicado codificar esa columna ya que hay muchísimas categorías. Para ello, vamos a usar una librería que llamada geopy que nos devuelve las coordenadas de un pais, ciudad, pueblo, lo que imaginemos! El objetivo de esto es poder clasificar nuestros paises en Hemisferio Norte y Sur para poder hacer una codificación más eficaz.
Como ya tenemos las coordenadas, deberéis codificar la columna en función de si pertenecen al Hemisferio Norte (HN) o Sur (HS). Aquellas que pertencezcan al HN le pondremos un 1 y a las del HS le pondremos un 0.

In [31]:
def extraer_coordenadas(x):
    try:
        geolocator = Nominatim(user_agent = "ana")
        location = geolocator.geocode(x)
        return location.lat, location.long
    except:
        return "no hay datos"

Creamos una nueva columna para saber en qué hemisferio está el país.

In [61]:
lista_hemisferios = []

In [62]:
def hemisferio(x):
    latitud = df[x]
    for row in latitud:
        try:
            if float(row) > 0:
                lista_hemisferios.append('HN')
            elif float(row) < 0:
                lista_hemisferios.append('HS')
            else:
                pass
        except ValueError:
            lista_hemisferios.append(row)

In [63]:
hemisferio('latitud')

In [None]:
print(lista_hemisferios)

In [64]:
df['hemisferio'] = lista_hemisferios

In [65]:
df.head()

Unnamed: 0,year,type,country,age,species_,fecha_limpia,fatal,sex,latitud,longitud,hemisferio
0,2018,Boating,usa,57.0,White shark,Jun,N,F,39.7837304,-100.445882,HN
1,2018,Unprovoked,usa,11.0,Unespecific,Jun,N,F,39.7837304,-100.445882,HN
2,2018,Invalid,usa,48.0,Unespecific,Jun,N,M,39.7837304,-100.445882,HN
3,2018,Unprovoked,australia,27.0,Unespecific,Jun,N,M,-24.7761086,134.755,HS
4,2018,Provoked,mexico,27.0,Tiger shark,Jun,N,M,23.6585116,-102.0077097,HN


Sustituimos en la columna 'hemisferio' los valores por 0 ó 1 en función de si es HN o HS.

In [66]:
label_encoded_df = df.copy()

le = LabelEncoder()

In [None]:
for row in label_encoded_df['hemisferio']:
    label_encoded_df['hemisferio']=le.fit_transform(label_encoded_df['hemisferio'])

In [67]:
label_encoded_df.head()

Unnamed: 0,year,type,country,age,species_,fecha_limpia,fatal,sex,latitud,longitud,hemisferio
0,2018,Boating,usa,57.0,White shark,Jun,N,F,39.7837304,-100.445882,0
1,2018,Unprovoked,usa,11.0,Unespecific,Jun,N,F,39.7837304,-100.445882,0
2,2018,Invalid,usa,48.0,Unespecific,Jun,N,M,39.7837304,-100.445882,0
3,2018,Unprovoked,australia,27.0,Unespecific,Jun,N,M,-24.7761086,134.755,1
4,2018,Provoked,mexico,27.0,Tiger shark,Jun,N,M,23.6585116,-102.0077097,0


EJERCICIO 3: Aplicad un get_dummies a las columnas de fatal y age.

In [68]:
df_dummies = label_encoded_df.copy()

In [71]:
dummies_fatal = pd.get_dummies(df_dummies["fatal"], prefix_sep = "_", prefix = "fatal", dtype = int)
dummies_fatal.head(20)

Unnamed: 0,fatal_N,fatal_Unknown,fatal_Y
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,1,0,0
6,0,0,1
7,1,0,0
8,1,0,0
9,1,0,0


In [72]:
dummies_age = pd.get_dummies(df_dummies["age"], prefix_sep = "_", prefix = "age", dtype = int)
dummies_age.head(20)

Unnamed: 0,age_1.0,age_2.0,age_3.0,age_5.0,age_6.0,age_7.0,age_8.0,age_9.0,age_10.0,age_11.0,age_12.0,age_13.0,age_14.0,age_15.0,age_16.0,age_17.0,age_18.0,age_19.0,age_20.0,age_21.0,age_22.0,age_23.0,age_24.0,age_25.0,age_26.0,age_27.0,age_28.0,age_29.0,age_30.0,age_31.0,age_32.0,age_33.0,age_34.0,age_35.0,age_36.0,age_37.0,age_38.0,age_39.0,age_40.0,age_41.0,age_42.0,age_43.0,age_44.0,age_45.0,age_46.0,age_47.0,age_48.0,age_49.0,age_50.0,age_51.0,age_52.0,age_53.0,age_54.0,age_55.0,age_56.0,age_57.0,age_58.0,age_59.0,age_60.0,age_61.0,age_62.0,age_63.0,age_64.0,age_65.0,age_66.0,age_67.0,age_68.0,age_69.0,age_70.0,age_71.0,age_72.0,age_73.0,age_74.0,age_75.0,age_77.0,age_78.0,age_81.0,age_82.0,age_84.0,age_86.0,age_87.0
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


EJERCICIO 4: Aplicad un Label Encoding a la columna species, fecha y type.

In [74]:
for col in label_encoded_df['species_']:
    label_encoded_df['species_']=le.fit_transform(label_encoded_df['species_'])