# Entradas y salidas

In [1]:
import numpy as np
import pandas as pd

# Build an example DataFrame: 10 rows x 5 columns of random draws from
# numpy's standard-normal generator, with the columns labelled A through E.
df = pd.DataFrame(data=np.random.randn(10, 5), columns=list("ABCDE"))

# Bare last expression so the notebook displays the frame.
df

Unnamed: 0,A,B,C,D,E
0,0.423316,1.128516,0.143195,-1.412365,0.948779
1,0.988903,-1.82229,0.889723,-0.996754,-0.951893
2,-0.062858,-0.587966,-0.543627,-0.384588,1.262243
3,2.08913,-0.500271,-1.09068,-0.335303,-1.062144
4,-1.081333,-1.59027,1.35575,-0.348654,-0.46529
5,0.013306,0.176789,-1.237786,-0.41191,2.284914
6,-0.306311,2.02369,0.912722,0.474339,0.656709
7,0.562535,0.028841,-1.126961,0.246404,1.175491
8,-0.652287,0.982229,1.177281,-0.735292,2.597402
9,0.694994,-0.541511,-0.066688,0.335089,0.34504


## CSV

### Guardar a CSV

In [6]:
# Write the DataFrame to datos.csv, leaving the row index out of the file.
df.to_csv(path_or_buf='datos.csv', index=False)

NameError: name 'df' is not defined

In [7]:
# Remove the df name from memory (del is a statement, so no parentheses are needed)
del df

NameError: name 'df' is not defined

### Cargar desde CSV

In [9]:
# Load datos.csv back into a DataFrame.
df = pd.read_csv(filepath_or_buffer='datos.csv')

# Bare last expression so the notebook displays the frame.
df

Unnamed: 0,A,B,C,D,E
0,0.423316,1.128516,0.143195,-1.412365,0.948779
1,0.988903,-1.82229,0.889723,-0.996754,-0.951893
2,-0.062858,-0.587966,-0.543627,-0.384588,1.262243
3,2.08913,-0.500271,-1.09068,-0.335303,-1.062144
4,-1.081333,-1.59027,1.35575,-0.348654,-0.46529
5,0.013306,0.176789,-1.237786,-0.41191,2.284914
6,-0.306311,2.02369,0.912722,0.474339,0.656709
7,0.562535,0.028841,-1.126961,0.246404,1.175491
8,-0.652287,0.982229,1.177281,-0.735292,2.597402
9,0.694994,-0.541511,-0.066688,0.335089,0.34504


## JSON

### Guardar a JSON

In [10]:
# Serialize the DataFrame to datos.json using pandas' default JSON layout.
df.to_json(path_or_buf='datos.json')

In [11]:
# Remove the df name from memory (del is a statement, so no parentheses are needed)
del df

### Cargar desde JSON

In [12]:
# Load datos.json back into a DataFrame.
df = pd.read_json(path_or_buf='datos.json')

# Bare last expression so the notebook displays the frame.
df

Unnamed: 0,A,B,C,D,E
0,0.423316,1.128516,0.143195,-1.412365,0.948779
1,0.988903,-1.82229,0.889723,-0.996754,-0.951893
2,-0.062858,-0.587966,-0.543627,-0.384588,1.262243
3,2.08913,-0.500271,-1.09068,-0.335303,-1.062144
4,-1.081333,-1.59027,1.35575,-0.348654,-0.46529
5,0.013306,0.176789,-1.237786,-0.41191,2.284914
6,-0.306311,2.02369,0.912722,0.474339,0.656709
7,0.562535,0.028841,-1.126961,0.246404,1.175491
8,-0.652287,0.982229,1.177281,-0.735292,2.597402
9,0.694994,-0.541511,-0.066688,0.335089,0.34504


## Excel

Necesitamos instalar el módulo `openpyxl` para generar y leer este formato:

    pip install openpyxl

In [None]:
!pip install openpyxl

### Guardar a Excel

In [14]:
# Write the DataFrame to the sheet "Sheet1" of datos.xlsx, without the row index.
df.to_excel(excel_writer='datos.xlsx', sheet_name='Sheet1', index=False)

In [15]:
# Remove the df name from memory (del is a statement, so no parentheses are needed)
del df

### Cargar desde Excel

In [16]:
# Load the sheet "Sheet1" of datos.xlsx back into a DataFrame.
df = pd.read_excel(io='datos.xlsx', sheet_name='Sheet1')

# Bare last expression so the notebook displays the frame.
df

Unnamed: 0,A,B,C,D,E
0,0.423316,1.128516,0.143195,-1.412365,0.948779
1,0.988903,-1.82229,0.889723,-0.996754,-0.951893
2,-0.062858,-0.587966,-0.543627,-0.384588,1.262243
3,2.08913,-0.500271,-1.09068,-0.335303,-1.062144
4,-1.081333,-1.59027,1.35575,-0.348654,-0.46529
5,0.013306,0.176789,-1.237786,-0.41191,2.284914
6,-0.306311,2.02369,0.912722,0.474339,0.656709
7,0.562535,0.028841,-1.126961,0.246404,1.175491
8,-0.652287,0.982229,1.177281,-0.735292,2.597402
9,0.694994,-0.541511,-0.066688,0.335089,0.34504


## HTML

Podemos extraer información directamente desde tablas de páginas web a partir de la URL. 

Esto se consigue haciendo web scraping con los módulos `lxml` y `BeautifulSoup4`, por lo que necesitamos instalarlos:

    pip install lxml BeautifulSoup4

In [None]:
!pip install lxml BeautifulSoup4

Una vez instalados `lxml` y `BeautifulSoup4`, reiniciamos el kernel (botón con la flecha circular, al lado de stop) y ya estaremos listos:

In [17]:
# Scrape the tables from an archived Wikipedia page.
# read_html returns a list with one DataFrame per table it finds, so df is a list here.
df = pd.read_html(io='https://web.archive.org/web/20220717170349/https://en.wikipedia.org/wiki/List_of_countries_by_past_fertility_rate')

In [25]:
# Display everything read_html returned
df

[      0     1     2
 0   Jun   JUL   Dec
 1   NaN    17   NaN
 2  2021  2022  2023,
                                                     0          1          2   \
 0    Fertility rate per woman Country/dependent ter...        NaN        NaN   
 1                          Country/dependent territory  1950–1955  1955–1960   
 2                                          Afghanistan       7.45       7.45   
 3                                              Albania       6.23       6.55   
 4                                              Algeria       7.28       7.38   
 ..                                                 ...        ...        ...   
 199                                  Western Sahara[7]       6.34       6.42   
 200                                              Yemen       7.35       7.40   
 201                                             Zambia       6.70       6.95   
 202                                           Zimbabwe       6.80       7.00   
 203                    

In [22]:
# Inspect the type of the element at index 1 of the result
type(df[1])

pandas.core.frame.DataFrame

Si se encuentra más de una tabla (como en el caso del ejemplo), podemos hacer referencia a cada una a través de su índice en la lista:

In [None]:
# Select the element at index 2 of the list of tables
df[2]

Podemos hacer un poco de limpieza y dejar los datos presentables:

In [None]:
# Take an independent copy of the table at index 2, so that later changes to
# fertility_rate do not also modify the entry stored in the df list.
fertility_rate = df[2].copy()

# Preview the first rows.
fertility_rate.head()

In [None]:
# Rename the 'Country/dependent territory' column to 'Country' so it is easier to query.
# The result is reassigned instead of using inplace=True, so the cell never
# mutates a frame that another name may still reference.
fertility_rate = fertility_rate.rename(columns={'Country/dependent territory': 'Country'})

fertility_rate

Ahora podemos realizar consultas cómodamente:

In [None]:
# Fertility rate per country for the 2010–2015 period
fertility_rate.loc[:, ["Country", "2010–2015"]]

In [None]:
# Same query, limited to the first rows and passed through the Styler,
# which hides the row index (axis 0) in the rendered table.
(
    fertility_rate.loc[:, ["Country", "2010–2015"]]
    .head()
    .style.hide(axis="index")
)

In [None]:
# Fertility rate per country for 1985–1990, sorted in descending order (first rows only).
# NOTE(review): if this column still has object dtype at this point, the sort
# is lexicographic rather than numeric — confirm the dtype.
(
    fertility_rate.loc[:, ["Country", "1985–1990"]]
    .sort_values(by="1985–1990", ascending=False)
    .head()
    .style.hide(axis="index")
)

In [None]:
# Fertility rate per country for 1985–1990, sorted in descending order (last rows only).
# NOTE(review): if this column still has object dtype at this point, the sort
# is lexicographic rather than numeric — confirm the dtype.
(
    fertility_rate.loc[:, ["Country", "1985–1990"]]
    .sort_values(by="1985–1990", ascending=False)
    .tail()
    .style.hide(axis="index")
)

In [None]:
# Convert the table to numeric values; anything that cannot be parsed becomes NaN.
# NOTE(review): iloc[1:] drops the first ROW (it does not skip the first column),
# and to_numeric is applied to every column, so a text column such as "Country"
# presumably ends up all-NaN. Re-running this cell removes one more row each time.
fertility_rate = fertility_rate.iloc[1:].apply(pd.to_numeric, errors='coerce')

In [None]:
# Column-wise mean of the table; iloc[1:] leaves out the first entry of the result
fertility_rate.mean().iloc[1:]

Con `Matplotlib`, que aprenderemos en la próxima sección, podemos graficar estos resultados fácilmente:

In [None]:
import matplotlib.pyplot as plt

# Default figure size for the plots that follow: 10 wide by 5 high.
plt.rcParams["figure.figsize"] = (10, 5)

# Line chart of the column-wise means, leaving out the first entry of the result.
fertility_rate.mean().iloc[1:].plot.line(xlabel="Períodos", ylabel="Media de natalidad mundial")

La práctica hace al maestro; os animo a hacer vuestras propias pruebas y a googlear cualquier cosa que se os ocurra para aprender mucho más.