# INSTALLING AND RUNNING PANDAS

In [1]:
import pandas as pd

In [2]:
# Check the installed pandas version
pd.__version__

'1.1.4'

In [4]:
# Plain Python list of product labels; it is wrapped in a pandas Series further down
products = list('ABCD')
products

['A', 'B', 'C', 'D']

In [5]:
# Confirm that products is a built-in list
type(products)

list

In [8]:
# Build a Series from the list
product_categories = pd.Series(products)
product_categories

0    A
1    B
2    C
3    D
dtype: object

In [9]:
# The list wrapped by pd.Series is now a pandas Series object
type(product_categories)

pandas.core.series.Series

In [10]:
# Series of integer daily rates on the default 0..3 index
daily_rates_dollars = pd.Series(data=[45, 59, 69, 79])
daily_rates_dollars

0    45
1    59
2    69
3    79
dtype: int64

## Working with Attributes

In [11]:
# Number of elements stored in the Series
product_categories.size

4

In [12]:
# Give the Series object a name (shown as "Name:" in its repr)
product_categories.name = 'Products Categories'
product_categories

0    A
1    B
2    C
3    D
Name: Products Categories, dtype: object

## Using an Index

In [16]:
# Create a dictionary mapping category codes to prices
prices_per_category = {'PA': 13455, 'PB':34562, 'PC':64729, 'PD':44721}
prices_per_category

{'PA': 13455, 'PB': 34562, 'PC': 64729, 'PD': 44721}

In [14]:
# Before the conversion, prices_per_category is a plain dict
type(prices_per_category)

dict

In [17]:
# Convert the dict into a Series; the dict keys become the index labels.
# Note: this rebinds the same name from a dict to a Series.
prices_per_category = pd.Series(prices_per_category)
prices_per_category

PA    13455
PB    34562
PC    64729
PD    44721
dtype: int64

In [18]:
# Inspect the index labels
prices_per_category.index

Index(['PA', 'PB', 'PC', 'PD'], dtype='object')

## Label-based vs position-based indexing

In [4]:
# Four values and no explicit index, so pandas supplies the default one
series_a = pd.Series(data=[10, 20, 30, 40])
series_a

0    10
1    20
2    30
3    40
dtype: int64

In [5]:
# With no explicit index, the Series gets a default RangeIndex
series_a.index

RangeIndex(start=0, stop=4, step=1)

In [7]:
# Expand the RangeIndex into a list to see the individual labels
list(series_a.index)

[0, 1, 2, 3]

## More on working with indices in Python

In [2]:
# Rebuild series_a with five values, still on the default integer index
series_a = pd.Series(data=list(range(10, 60, 10)))
series_a

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
# On the default integer index, 0 is the label of the first element
series_a[0]

10

In [7]:
# Prices keyed by category code; the dict keys become the Series labels
prices_per_category = pd.Series(
    {
        'PA': 13455,
        'PB': 34562,
        'PC': 64729,
        'PD': 44721,
    }
)
prices_per_category

PA    13455
PB    34562
PC    64729
PD    44721
dtype: int64

In [8]:
# Label-based access using the index label 'PA'
prices_per_category['PA']

13455

In [9]:
# Position-based access: .iloc states the intent explicitly. Plain [] with an
# integer on a string-labelled index only works through a positional fallback
# that recent pandas releases deprecate (FutureWarning since pandas 2.1).
prices_per_category.iloc[0]

13455

In [13]:
# Change the index: instead of 0 to 4, the labels now run from 1 to 5
series_a = pd.Series([10,20,30,40,50], index= [1,2,3,4,5])
series_a

1    10
2    20
3    30
4    40
5    50
dtype: int64

In [17]:
# Check: 1 is now a label, so this returns the first element
series_a[1]

10

In [19]:
# Index labels given as strings
series_b = pd.Series([15,25,35,45,55], index = ["1","2","3","4","5"])
series_b

1    15
2    25
3    35
4    45
5    55
dtype: int64

In [20]:
# The labels are strings, so the integer 1 is a position, not a label:
# .iloc[1] returns the second element. Plain series_b[1] relies on a
# positional fallback that is deprecated in recent pandas (2.1+).
series_b.iloc[1]

25

In [21]:
# Label-based access: the string "1" is the label of the first element
series_b["1"]

15

In [22]:
# Position-based access to the first element; .iloc avoids the deprecated
# integer fallback of [] on a string-labelled index (pandas 2.1+ warns).
series_b.iloc[0]

15

######  Las etiquetas son útiles cuando se trabaja con visualización de datos, ya que permiten identificar los elementos por nombre; los índices basados en posición resultan útiles cuando necesitamos las posiciones de los elementos en una serie.

## Using Methods in Python

In [25]:
# Deposit amounts labelled by start date (day/month/year strings).
# Labels and values are listed in the same order, one pair per position.
start_date_deposits = pd.Series(
    data=[2000, 3500, 2300, 3200, 3100, 2400, 3000, 2300, 2700, 2500, 2300],
    index=[
        '4/07/2014',
        '24/04/2015',
        '02/05/2014',
        '19/02/2016',
        '13/06/2014',
        '23/09/2016',
        '03/12/2016',
        '12/07/2014',
        '27/07/2016',
        '13/07/2015',
        '10/04/2014',
    ],
)

In [26]:
# Display every deposit with its date label
start_date_deposits

4/07/2014     2000
24/04/2015    3500
02/05/2014    2300
19/02/2016    3200
13/06/2014    3100
23/09/2016    2400
03/12/2016    3000
12/07/2014    2300
27/07/2016    2700
13/07/2015    2500
10/04/2014    2300
dtype: int64

In [27]:
# Total of all deposits
start_date_deposits.sum()

29300

In [28]:
# Smallest deposit
start_date_deposits.min()

2000

In [29]:
# Largest deposit
start_date_deposits.max()

3500

In [30]:
# idxmax() returns the index label of the maximum value
start_date_deposits.idxmax()

'24/04/2015'

In [32]:
# idxmin() returns the index label of the minimum value
start_date_deposits.idxmin()

'4/07/2014'

In [33]:
# head() with no argument shows the first five rows
start_date_deposits.head()

4/07/2014     2000
24/04/2015    3500
02/05/2014    2300
19/02/2016    3200
13/06/2014    3100
dtype: int64

In [34]:
# tail() with no argument shows the last five rows
start_date_deposits.tail()

03/12/2016    3000
12/07/2014    2300
27/07/2016    2700
13/07/2015    2500
10/04/2014    2300
dtype: int64

## Parameters vs Arguments

In [37]:
# Passing 3 limits the preview to the first three rows
start_date_deposits.head(3)

4/07/2014     2000
24/04/2015    3500
02/05/2014    2300
dtype: int64

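`n` es el parámetro que define el método `head(n=5)`; en `head(3)`, el valor `3` es el argumento (nuestra elección para `n`). Si se llama `head()` sin argumento, `n` toma su valor por defecto (5).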

## Creating DataFrames from Scratch

#### #1: Construct a DataFrame from dictionary of lists

In [49]:
# Column-oriented input: each key is a column name, each list holds that column's values
data = dict(
    ProductName=['Product A', 'Product B', 'Product C'],
    ProductPrice=[2345, 4553, 2343],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,ProductName,ProductPrice
0,Product A,2345
1,Product B,4553
2,Product C,2343


#### #2: Construct a DataFrame from dictionary of lists + specify an index

In [50]:
# Same column-oriented dict, now with explicit row labels instead of 0, 1, 2
data = dict(
    ProductName=['Product A', 'Product B', 'Product C'],
    ProductPrice=[2345, 4553, 2343],
)
df = pd.DataFrame(data=data, index=list('ABC'))
df

Unnamed: 0,ProductName,ProductPrice
A,Product A,2345
B,Product B,4553
C,Product C,2343


#### #3: Construct DataFrames from a list of dictionaries

In [56]:
# Row-oriented input: one dict per row, with the column names as keys
data = [
    {'ProductName': name, 'ProductPrice': price}
    for name, price in (('Product A', 2332), ('Product B', 2345), ('Product C', 2235))
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,ProductName,ProductPrice
0,Product A,2332
1,Product B,2345
2,Product C,2235


#### #4: Construct a DataFrame from dictionary of pandas Series

In [57]:
# Two Series on the default integer index, one per future DataFrame column
product = pd.Series(data=['Product A', 'Product B', 'Product C'])
prices = pd.Series(data=[2344, 2322, 2335])

In [58]:
# Each Series becomes one column, named after its dict key
data = dict(ProductName=product, ProductPrice=prices)
df = pd.DataFrame(data=data)
df

Unnamed: 0,ProductName,ProductPrice
0,Product A,2344
1,Product B,2322
2,Product C,2335


In [61]:
# Both Series carry the labels A, B, C, which become the DataFrame's row index
product = pd.Series(data=['Product A', 'Product B', 'Product C'], index=list('ABC'))
prices = pd.Series(data=[2344, 2322, 2335], index=list('ABC'))

data = dict(ProductName=product, ProductPrice=prices)
df = pd.DataFrame(data=data)
df

Unnamed: 0,ProductName,ProductPrice
A,Product A,2344
B,Product B,2322
C,Product C,2335


#### #5: Construct a DataFrame from list of lists

In [62]:
# Row-oriented input: every inner list is one row; the columns get default labels 0 and 1
data = [
    ['Product A', 2344],
    ['Product B', 2432],
    ['Product C', 2453],
]
df = pd.DataFrame(data=data)
df

Unnamed: 0,0,1
0,Product A,2344
1,Product B,2432
2,Product C,2453


In [63]:
# Assign column labels: this renames the default 0 and 1 columns, it does not add columns

df.columns = ['ProductName', 'ProductPrice']
df

Unnamed: 0,ProductName,ProductPrice
0,Product A,2344
1,Product B,2432
2,Product C,2453


In [64]:
# Assign row labels, replacing the default 0, 1, 2 index
df.index = ['A', 'B', 'C']
df

Unnamed: 0,ProductName,ProductPrice
A,Product A,2344
B,Product B,2432
C,Product C,2453


#### #6: Construct a DataFrame in a professional way

In [66]:
# A complete constructor call states all three pieces: the data, the column labels and the row index
df = pd.DataFrame(
    data=[
        ['Product A', 23453],
        ['Product B', 34311],
        ['Product C', 43223],
    ],
    columns=['ProductName', 'ProductPrice'],
    index=['A', 'B', 'C'],
)
df

Unnamed: 0,ProductName,ProductPrice
A,Product A,23453
B,Product B,34311
C,Product C,43223


In [68]:
# Number of rows and columns in the DataFrame, as a (rows, columns) tuple
df.shape

(3, 2)

##### Data Cleaning: elimina inconsistencias - - - Data Preprocessing: aplica métodos estadísticos

## Pandas Series: .unique() & .nunique()

In [69]:
# Load the CSV and collapse its single column into a Series.
# read_csv(..., squeeze=True) was deprecated in pandas 1.4 and removed in 2.0;
# calling .squeeze('columns') on the resulting DataFrame is the supported
# equivalent and also works on older pandas versions.
data = pd.read_csv('Location.csv').squeeze('columns')
# copy() returns a new, independent Series (not a list), so the loaded data stays untouched
location_data = data.copy()
location_data.head()

0     Location 3
1     Location 6
2     Location 8
3    Location 26
4    Location 34
Name: Location, dtype: object

In [71]:
# Confirm that location_data is a Series
type(location_data)

pandas.core.series.Series

In [75]:
location_data.describe() # Count of values, number of unique values, the most frequent value (top) and its frequency

count            1043
unique            296
top       Location 25
freq               31
Name: Location, dtype: object

In [76]:
# Total number of values
len(location_data)

1043

In [77]:
# Returns the NUMBER of distinct values (a count, not the values themselves)
location_data.nunique()

296

In [78]:
# Returns an array containing each distinct value
location_data.unique()

array(['Location 3', 'Location 6', 'Location 8', 'Location 26',
       'Location 34', 'Location 25', 'Location 46', 'Location 156',
       'Location 21', 'Location 13', 'Location 579', 'Location 602',
       'Location 10', 'Location 44', 'Location 30', 'Location 48',
       'Location 196', 'Location 64', 'Location 91', 'Location 62',
       'Location 75', 'Location 42', 'Location 233', 'Location 95',
       'Location 78', 'Location 61', 'Location 87', 'Location 19',
       'Location 115', 'Location 350', 'Location 377', 'Location 17',
       'Location 113', 'Location 81', 'Location 58', 'Location 212',
       'Location 53', 'Location 337', 'Location 41', 'Location 632',
       'Location 73', 'Location 214', 'Location 218', 'Location 38',
       'Location 172', 'Location 197', 'Location 101', 'Location 185',
       'Location 129', 'Location 235', 'Location 142', 'Location 50',
       'Location 76', 'Location 11', 'Location 33', 'Location 22',
       'Location 145', 'Location 203', 'Loca

## Converting Series into Arrays

In [79]:
import pandas as pd
import numpy as np

In [80]:
# Integer prices labelled by product name
prices_per_category = pd.Series(
    data=[2443, 2332, 2422],
    index=['Product A', 'Product B', 'Product C'],
)
prices_per_category

Product A    2443
Product B    2332
Product C    2422
dtype: int64

In [82]:
# Get the values of 'prices_per_category' (without the index labels)
prices_per_category.values

array([2443, 2332, 2422], dtype=int64)

In [84]:
# Type of the object returned by .values
# Series.array or Series.to_numpy() are recommended instead of .values,
# depending on whether you need a reference to the underlying data or a NumPy array.
# Note: .array is a property (no parentheses), while to_numpy() is a method.
type(prices_per_category.values)

numpy.ndarray

In [91]:
# .array is a property that exposes the values as a pandas array object
prices_per_category.array

<PandasArray>
[2443, 2332, 2422]
Length: 3, dtype: int64

In [87]:
# Type of the object returned by .array
type(prices_per_category.array)

pandas.core.arrays.numpy_.PandasArray

In [89]:
# to_numpy() returns the values as a NumPy array
prices_per_category.to_numpy()

array([2443, 2332, 2422], dtype=int64)

In [94]:
# Select two products by label and convert their values to float
test_array = prices_per_category[['Product A', 'Product B']].to_numpy(dtype = 'float')
test_array

array([2443., 2332.])

In [98]:
# Check the type of a single element of the converted array
type(test_array[0])

numpy.float64

In [99]:
# First element, read by position from the .values array
prices_per_category.values[0]

2443

In [101]:
# The element is a NumPy integer scalar, not a built-in int
type(prices_per_category.values[0])

numpy.int64

In [102]:
# First element, read by position from .array
prices_per_category.array[0]

2443

In [104]:
# First element, read by position from the to_numpy() result
prices_per_category.to_numpy()[0]

2443

## Sort Values

In [106]:
# Unsorted integers used to demonstrate sort_values()
numbers = pd.Series(data=[12, 564, 2454, 24578, 43])
numbers

0       12
1      564
2     2454
3    24578
4       43
dtype: int64

In [107]:
# The default sort is ascending; each value keeps its original index label
numbers.sort_values()

0       12
4       43
1      564
2     2454
3    24578
dtype: int64

In [108]:
# ascending=True spelled out explicitly: smallest value first
numbers.sort_values(ascending=True)

0       12
4       43
1      564
2     2454
3    24578
dtype: int64

In [109]:
# ascending=False sorts from largest to smallest
numbers.sort_values(ascending=False)

3    24578
2     2454
1      564
4       43
0       12
dtype: int64

In [110]:
# Reload the CSV and collapse its single column into a Series.
# read_csv(..., squeeze=True) was deprecated in pandas 1.4 and removed in 2.0;
# .squeeze('columns') on the resulting DataFrame is the supported equivalent.
data = pd.read_csv('Location.csv').squeeze('columns')
# copy() returns a new, independent Series (not a list)
location_data = data.copy()
location_data.head()

0     Location 3
1     Location 6
2     Location 8
3    Location 26
4    Location 34
Name: Location, dtype: object

In [111]:
# The values are strings, so the sort is lexicographic rather than numeric:
# 'Location 10' comes right after 'Location 1', and 'Location 99' ends up last.
location_data.sort_values()

637     Location 1
884     Location 1
465     Location 1
716    Location 10
623    Location 10
          ...     
482    Location 97
128    Location 97
669    Location 97
757    Location 98
372    Location 99
Name: Location, Length: 1043, dtype: object

In [112]:
# Descending order of the same string (lexicographic) comparison
location_data.sort_values(ascending=False)

372    Location 99
757    Location 98
669    Location 97
128    Location 97
482    Location 97
          ...     
623    Location 10
716    Location 10
465     Location 1
884     Location 1
637     Location 1
Name: Location, Length: 1043, dtype: object