# Introducción al análisis de datos
# EDA (Análisis exploratorio de datos)
1. Cargar, guardar y manejar los datos (dataset) -> Pandas (dataframes)
2. Numpy (Trabajar de forma eficiente los arrays)
3. Visualización -> matplotlib

# Pandas
Basado en dataframes

In [9]:
import pandas as pd

### Fundamentos 
Tiene dos tipos de datos:
1. Serie
2. Dataframe

In [10]:
# Serie: Vector Unidimensional
paises = pd.Series(['Ecuador','Paraguay','Perú','Argentina','Colombia'])
paises

0      Ecuador
1     Paraguay
2         Perú
3    Argentina
4     Colombia
dtype: object

In [11]:
capitales = pd.Series(['Quito','Asunción','Lima','BA','Bogota'])
capitales

0       Quito
1    Asunción
2        Lima
3          BA
4      Bogota
dtype: object

In [12]:
# 2. Dataframes
# Vector de varias dimensiones (matriz)
paises_capitales = pd.DataFrame({'País': paises, 'Capital': capitales})
paises_capitales

Unnamed: 0,País,Capital
0,Ecuador,Quito
1,Paraguay,Asunción
2,Perú,Lima
3,Argentina,BA
4,Colombia,Bogota


Guardar un dataframe en un archivo csv

In [22]:
paises_capitales.to_csv('paises.csv', index=True)

Cargar los datos

In [28]:
dataPaises = pd.read_csv('paises.csv', index_col=0)
dataPaises

Unnamed: 0,País,Capital
0,Ecuador,Quito
1,Paraguay,Asunción
2,Perú,Lima
3,Argentina,BA
4,Colombia,Bogota


# Métodos más importantes de pandas

In [30]:
dataCars = pd.read_csv('cars.csv')
dataCars

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


#### Descripción de datos

In [31]:
dataCars.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [34]:
# Columnas del csv
dataCars.columns
# En formato de lista
#dataCars.columns.to_list()

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [36]:
# Index -> numero de filas de datos en el csv
dataCars.index

RangeIndex(start=0, stop=10, step=1)

In [38]:
# información
dataCars.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes


In [39]:
dataCars.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [40]:
dataCars.sum()

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

#### Seleccionar y visualizar datos

In [41]:
dataCars.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [42]:
dataCars.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [44]:
# Acceder a los datos de una columna específica
fabricante =dataCars.Make
fabricante

0    Toyota
1     Honda
2    Toyota
3       BMW
4    Nissan
5    Toyota
6     Honda
7     Honda
8    Toyota
9    Nissan
Name: Make, dtype: object

In [45]:
fabricantes2 = dataCars['Make']
fabricantes2

0    Toyota
1     Honda
2    Toyota
3       BMW
4    Nissan
5    Toyota
6     Honda
7     Honda
8    Toyota
9    Nissan
Name: Make, dtype: object

In [49]:
universidades = pd.DataFrame({'Universidad': ['EPN', 'UCE', 'ESPE', 'PUCE', 'UTE'], 
                            'Estudiantes': [85215,7800,5000,6000,780],                        
                            }, index=[0,1,2,3,3])
universidades

Unnamed: 0,Universidad,Estudiantes
0,EPN,85215
1,UCE,7800
2,ESPE,5000
3,PUCE,6000
3,UTE,780


Localización de los datos

La función loc, trabaja con el índice de los datos

In [50]:
universidades.loc[3]

Unnamed: 0,Universidad,Estudiantes
3,PUCE,6000
3,UTE,780


Localizar bajo posición, iloc

Por ejemplor encontrar el tercer registro, sin importar su indice

In [53]:
universidades.iloc[3]

Universidad    PUCE
Estudiantes    6000
Name: 3, dtype: object

In [58]:
# Sacar un conjunto de filas
universidades.iloc[:3]

Unnamed: 0,Universidad,Estudiantes
0,EPN,85215
1,UCE,7800
2,ESPE,5000


#### Obtener datos de filas y columnas en específico

In [66]:
registros = [1,3,4]
universidades.iloc[registros,1]

1    7800
3    6000
3     780
Name: Estudiantes, dtype: int64

#### Filtrado de datos

In [69]:
dataCars[dataCars['Make']=='Toyota']

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
8,Toyota,White,60000,4,"$6,250.00"


In [70]:
dataCars[dataCars['Odometer (KM)'] <= 78601]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [76]:
dataCars[(dataCars['Make']=='Toyota') & (dataCars['Odometer (KM)'] <= 78601)]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"


In [77]:
# Cruza de variables, ver el conteo de la cruza
pd.crosstab(dataCars['Make'], dataCars['Doors'])


Doors,3,4,5
Make,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
BMW,0,0,1
Honda,0,3,0
Nissan,0,2,0
Toyota,1,3,0


In [80]:
dataCars.groupby(['Make']).mean()

Unnamed: 0_level_0,Odometer (KM),Doors
Make,Unnamed: 1_level_1,Unnamed: 2_level_1
BMW,11179.0,5.0
Honda,62778.333333,4.0
Nissan,122347.5,4.0
Toyota,85451.25,3.75


In [96]:
dataCars.sample(frac=0.2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
2,Toyota,Blue,32549,3,"$7,000.00"
6,Honda,Blue,45698,4,"$7,500.00"


In [None]:
# Programación
