# Pandas

In [None]:
!pip install pandas 

# import sys
# sys.executable
# ! path/to/python -m pip install pandas

In [None]:
%%capture
!pip install jupyter_contrib_nbextensions;

In [None]:
%%capture
# Run the contrib nbextension installer; its output is hidden by %%capture.
!jupyter contrib nbextension install;

In [None]:
%%capture
!jupyter install jupyterthemes

In [None]:
# List the themes offered by jupyterthemes.
!jt -l

In [None]:
# Apply the 'monokai' theme to the notebook interface.
!jt -t monokai

In [None]:
%%capture
# Enable the code-folding notebook extension; output is hidden by %%capture.
!jupyter nbextension enable codefolding/main;

In [None]:
import numpy as np
import pandas as pd

# What is Pandas?

Pandas can be thought of as an enhanced version of numpy arrays: rows and columns can be identified with labels instead of just simple integer indices.


* [Documentation](https://pandas.pydata.org/docs/reference/index.html#api)
* [GitHub](https://github.com/pandas-dev/pandas/blob/master/pandas/core/base.py)

There are **three** main pandas elements we **need** to understand.
1. Pandas Series
2. Pandas DataFrame
3. Index
----

# 1. The Pandas Series

**The primary building block of Pandas**. A pandas series is a one-dimensional (**1-D**) indexed array.

In [None]:
import pandas as pd
#from pandas import Series

In [None]:
# pd.Series is itself a class, so asking for its type shows the class's metaclass.
type(pd.Series)

In [None]:
# Display the Series class object itself (no instance is created here).
pd.Series

In [None]:
# Create an empty Series, stating the dtype explicitly since there is no data to infer it from.
pd.Series(dtype='int')

## 1.1 Creating a Pandas Series

In [None]:
# Build a Series from a plain Python list (no index given, so the default one is used).
values = list(range(1, 5))
series1 = pd.Series(data=values)
print(series1)

In [None]:
# Same values as before, now labelled with custom row names instead of the default index.
nome_linhas = ['I', 'II', 'III', 'IV']
series2 = pd.Series(values, index=nome_linhas)
print(series2)

In [None]:
# Here the first label is itself a list (['I', 'II']) rather than a string.
# NOTE(review): this overwrites nome_linhas and series2 from the previous cell — confirm the
# nested list is an intentional demonstration and not a typo.
nome_linhas = [['I', 'II'], 'II', 'III', 'IV']
series2 = pd.Series(data = values, 
                    index = nome_linhas)
print(series2)

In [None]:
# Build a Series from a dict: the keys become the index labels, the values become the data.
dict_notas = {
    'Titanic': 7.8,
    'Dune': 8.2,
    'Dune (David Lynch)': 6.4,
    'House of Gucci': 7.0,
    'Joker': 8.4,
    'Alien': 8.4,
}
notas_imdb = pd.Series(data=dict_notas, name='nota_imdb')
print(notas_imdb)

In [None]:
# Build a Series from a dict while also passing an explicit index.
# Only 'Dune' and 'Titanic' in nomes_errados match a dict key; labels without a matching key get NaN.
dict_notas = {
    'Titanic': 7.8,
    'Dune': 8.2,
    'Dune (David Lynch)': 6.4,
    'House of Gucci': 7.0,
    'Joker': 8.4,
    'Alien': 8.4,
}
nomes_errados = ['Dune', 'Dune DL', 'Titanic', 'Coringa', 'Gucci']

pd.Series(data=dict_notas, index=nomes_errados, name='nota_imdb')

In [None]:
# Cast per film: every value is a list of actor names, so each Series element is a list.
dict_cast = {
    'Titanic': ['Kate Winslet', 'Leonardo DiCaprio'],
    'Dune': ['Timothée Chalamet', 'Zendaya'],
    'Dune (David Lynch)': ['Sting'],
    'House of Gucci': ['Lady Gaga', 'Adam Driver', 'Al Pacino'],
    'Joker': ['Joaquin Phoenix'],
    'Alien': ['Sigourney Weaver', 'Ian Holm'],
    'Aliens': ['Sigourney Weaver', 'Paul Reiser'],
}

elenco = pd.Series(data=dict_cast, name='elenco')
print(elenco)

In [None]:
# Director per film, stored as plain strings.
dict_diretor = {
    'Titanic': 'James Cameron',
    'Dune': 'Denis Villeneuve',
    'Dune (David Lynch)': 'David Lynch',
    'House of Gucci': 'Ridley Scott',
    'Joker': 'Todd Phillips',
    'Alien': 'Ridley Scott',
    'Aliens': 'James Cameron',
}
# Alternative: pass dtype=pd.StringDtype() to store the names with the dedicated string dtype.
diretor = pd.Series(data=dict_diretor, name='diretor')
print(diretor)

## 1.2 Some methods and attributes
* Check Type
* Check Size
* `.describe()`
* `.values`
* `.index`

In [None]:
# Show the Python type of the object holding the ratings.
print(type(notas_imdb))

So, the `type` of `notas_imdb` is `pandas...Series`, and the type of the data inside that `pandas.Series` (its `dtype`) is `float64`.

In [None]:
# Print the Series: index labels, values, and a footer with its name and dtype.
print(notas_imdb)

When you see `dtype: object` (for example, in `diretor` above), it usually means the `Series` holds `str` values — you should always try to declare string `Series` explicitly with `pd.StringDtype()`!

In [None]:
# Size check: number of elements in the Series.
print(len(notas_imdb))

In [None]:
# Summary statistics for the numeric ratings.
print(notas_imdb.describe())

In [None]:
# describe() on non-numeric data counts unique values, which requires hashable elements.
# Each element of `elenco` is a list (unhashable), so the casts are converted to tuples first.
print(elenco.map(tuple).describe())

In [None]:
# Summary of the director names (string data, so this is not a numeric summary).
print(diretor.describe())

In [None]:
# The stored values only, without the index labels.
print(notas_imdb.values)

In [None]:
# The index labels only (here, the film titles used as dict keys).
print(notas_imdb.index)

In [None]:
# The index is its own pandas object; show its type.
type(notas_imdb.index)

## 1.3 Accessing elements 

Elements can be accessed by position, like in a numpy array, or by their index label.

In [None]:
# Positional access: use .iloc so the integer can never be mistaken for an index label.
# Plain `series[1]` on a string-labelled Series is a deprecated fallback in pandas 2.x.
notas_imdb.iloc[1]

In [None]:
# Integer slice: everything from the second element onwards.
notas_imdb[1:]

In [None]:
# Label-based access: look the value up by its index label.
notas_imdb['Dune']

In [None]:
# The same element reached three ways: by position (.iloc), by label with [], and by label
# with .loc. The deprecated integer fallback `notas_imdb[1]` is avoided.
notas_imdb.iloc[1] == notas_imdb['Dune'] == notas_imdb.loc['Dune']

# 2. The Pandas DataFrame


A Pandas DataFrame can be thought of as
* A group of Pandas Series
* A generalization of **2-D** numpy arrays (However, again, they bring flexibility on both the indices and column names)
* A tabular, spreadsheet-like data structure

In [None]:
# Display the DataFrame class object itself (no instance is created here).
pd.DataFrame

In [None]:
# pd.DataFrame is itself a class, so asking for its type shows the class's metaclass.
type(pd.DataFrame)

In [None]:
# Create an empty DataFrame (no data, no columns, no index passed).
pd.DataFrame()

## 2.1 Creating a Pandas DataFrame
* A DataFrame can be created from a .csv file, an Excel file, a query, JSON, an API...

Today we'll see how to create one from a 1-D array, a 2-D array and a dictionary.

In [None]:
# DataFrame from a dict of Series: each dict key becomes a column name.
dict_series = dict(notas=notas_imdb, elenco=elenco, diretor=diretor)
pd.DataFrame(data=dict_series)

In [None]:
# DataFrame from a dict of plain dicts: outer keys name the columns.
dict_dicts = dict(notas=dict_notas, elenco=dict_cast, diretor=dict_diretor)
pd.DataFrame(data=dict_dicts)

In [None]:
# Keep only the values of each dict, as plain lists (the film titles are dropped).
lista_notas = [*dict_notas.values()]
lista_elenco = [*dict_cast.values()]
lista_diretor = [*dict_diretor.values()]
print(lista_notas)

In [None]:
# The cast lists, one inner list per film.
print(lista_elenco)

In [None]:
# The director names, one per film.
print(lista_diretor)

In [None]:
# DataFrame from a dict of lists: every list must have the same length.
# `lista_notas` has one entry fewer than the other two (dict_notas has no 'Aliens' rating),
# so pandas refuses to build the frame. The error is caught and shown so that the notebook
# still runs from top to bottom.
dict_listas = {'notas' : lista_notas,
               'elenco' : lista_elenco,
               'diretor' : lista_diretor}
try:
    display(pd.DataFrame(dict_listas))
except ValueError as erro:
    print(f'ValueError: {erro}')

In [None]:
# DataFrame from a dict of lists, using only the cast and director lists.
dict_listas = dict(elenco=lista_elenco, diretor=lista_diretor)
pd.DataFrame(data=dict_listas)

## 2.2 Some methods and attributes
* `describe()`
* `info()`
* `transpose()`

In [None]:
# Films table used in the rest of the notebook, built from the dict of Series.
tb_filmes = pd.DataFrame(data=dict_series)
tb_filmes

In [None]:
# Summary statistics of the table.
tb_filmes.describe()

In [None]:
# Summary statistics again, this time asking for the 1st and 99th percentiles.
tb_filmes.describe(percentiles = [0.01, 0.99])

In [None]:
# Print a concise overview of the table's structure.
tb_filmes.info()

In [None]:
# Swap rows and columns.
tb_filmes.transpose()

## 2.3 Accessing elements 

This is the correct way to access data in a DataFrame. You can specify both the row and the column, or only the row.

`dataframe.loc[row_name, col_name]`

In [None]:
# Single cell: row label 'Alien', column label 'notas'.
tb_filmes.loc['Alien', 'notas']

In [None]:
# Whole row: row label 'Alien', all columns (`:`).
tb_filmes.loc['Alien', :]

In [None]:
# Whole column: all rows (`:`), column label 'notas'.
tb_filmes.loc[:, 'notas']

In [None]:
# Several rows at once: pass a list of row labels.
tb_filmes.loc[['Alien', 'Aliens'], :]

In [None]:
# Label slice over the rows, from 'Alien' to 'Dune'; with .loc the end label is included.
tb_filmes.loc['Alien':'Dune', :]

In [None]:
# Label slices on both axes: rows 'Alien' to 'Dune', columns 'notas' to 'elenco'.
tb_filmes.loc['Alien':'Dune', 'notas':'elenco']

In [None]:
# .loc is label-based: the integer 1 is not a column label of tb_filmes, so this lookup fails.
# The KeyError is caught and shown on purpose so the notebook keeps running; use a column
# name here, or .iloc for positions.
try:
    display(tb_filmes.loc[['Alien', 'Titanic'], 1])
except KeyError as erro:
    print(f'KeyError: {erro}')

`dataframe.iloc[row_number, col_number]`

In [None]:
# Single cell by position: first row, first column.
tb_filmes.iloc[0, 0]

In [None]:
# First row, all columns.
tb_filmes.iloc[0, :]

In [None]:
# All rows, first column.
tb_filmes.iloc[:, 0]

In [None]:
# NOTE(review): identical to the previous cell — possibly a different column position was
# intended here; confirm.
tb_filmes.iloc[:, 0]

In [None]:
# All rows, first two columns selected with a list of positions.
tb_filmes.iloc[:, [0, 1]]

In [None]:
# Positional slice over the rows: positions 0, 1 and 2 (the end position is excluded).
tb_filmes.iloc[0:3, :]

In [None]:
# .iloc is position-based: a row label such as 'Alien' is rejected.
# The error is caught and shown on purpose so the notebook keeps running; use .loc for labels.
try:
    display(tb_filmes.iloc['Alien', 1])
except (ValueError, TypeError, IndexError) as erro:
    print(f'{type(erro).__name__}: {erro}')

What is the difference between selecting a column via `dataframe['column']` and via `dataframe.loc[:, 'column']`?

In [None]:
# Type of the object returned by bracket column selection.
type(tb_filmes['notas'])

In [None]:
# Type of the object returned by .loc column selection, for comparison with the cell above.
type(tb_filmes.loc[:,'notas'])

Selecting a random sample of rows

In [None]:
# Draw two rows at random. A fixed random_state makes the draw reproducible, so the same
# rows are shown every time the notebook is re-run.
tb_filmes.sample(n=2, random_state=42)

## 2.4 Filtering

In [None]:
# Element-wise comparison: one boolean per row, True where the rating is above 8.
tb_filmes['notas'] > 8

In [None]:
# Element-wise comparison: one boolean per row, True where the director is 'Ridley Scott'.
tb_filmes['diretor'] == 'Ridley Scott'

In [None]:
# Keep the boolean mask (rating above 8) in a variable so it can be reused as a filter.
filmes_bons = tb_filmes['notas'] > 8

In [None]:
# Inspect the stored mask.
filmes_bons

In [None]:
# Filter the table with the stored mask: only rows where the mask is True are kept.
tb_filmes[filmes_bons]

In [None]:
# Same filter written inline, without the intermediate variable.
tb_filmes[tb_filmes['notas'] > 8]

In [None]:
# Two conditions combined with `&` (element-wise AND); each condition is wrapped in parentheses.
tb_filmes[(tb_filmes['notas'] > 8) & (tb_filmes['diretor'] == 'Ridley Scott')]

In [None]:
# Same row filter through .loc, additionally picking only the 'elenco' column.
tb_filmes.loc[(tb_filmes['notas'] > 8) & (tb_filmes['diretor'] == 'Ridley Scott'), 'elenco']

In [None]:
# Compare every index label with 'Alien'; gives one boolean per row.
tb_filmes.index == 'Alien'

In [None]:
# Check, for every index label, whether it is one of the listed titles; gives one boolean per row.
tb_filmes.index.isin(['Alien', 'Aliens'])