# <font color='blue'>Curso de Extensão: Introdução à Linguagem Python para Ciência de Dados - Exemplos Dataframe</font>

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes

## Create a dataframe

In [None]:
# Build a DatetimeIndex holding 10 consecutive quarter-end dates, starting at 2021-03-31.
# An explicit QuarterEnd offset is used instead of the 'Q' string alias: pandas 2.2
# deprecated 'Q' in favour of 'QE', so neither string is portable across versions,
# while the offset object is. startingMonth=12 keeps the December anchor that 'Q' implies.
months = pd.date_range('2021-03-31', periods=10, freq=pd.offsets.QuarterEnd(startingMonth=12))
months

DatetimeIndex(['2021-03-31', '2021-06-30', '2021-09-30', '2021-12-31',
               '2022-03-31', '2022-06-30', '2022-09-30', '2022-12-31',
               '2023-03-31', '2023-06-30'],
              dtype='datetime64[ns]', freq='Q-DEC')

In [None]:
# Build a 10-row x 5-column dataframe of standard-normal random values,
# indexed by the quarter-end DatetimeIndex `months` and with columns A..E.
# A seeded generator is used so that re-running the notebook reproduces the same values.
df = pd.DataFrame(
    np.random.default_rng(seed=42).standard_normal((10, 5)),
    index=months,
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
2021-03-31,1.041977,-1.50622,-3.265068,-0.114178,-1.073944
2021-06-30,1.533097,0.387277,0.342867,-0.552404,0.678846
2021-09-30,1.20115,0.659578,-1.535718,-2.171045,-0.376175
2021-12-31,-1.877862,1.599627,0.495952,1.599649,0.168782
2022-03-31,0.957262,-0.000644,-0.284366,-1.502305,1.605213
2022-06-30,0.941425,-0.525465,-0.351474,-0.685999,0.142392
2022-09-30,-1.474353,0.450535,-0.901095,-0.145378,-0.459342
2022-12-31,-1.315804,0.507371,1.413869,2.369116,-0.371359
2023-03-31,0.208246,-0.285272,0.826708,-0.443004,-2.586211
2023-06-30,0.350065,-0.226154,1.554786,-0.660589,0.760236


In [None]:
# Dictionary with income and age samples. NaN marks missing entries; the trailing
# 999 / 9999 entries are the invalid values that get replaced by NaN further down.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
mydict = {
    "Income": (50, 25, 15, 10, np.nan, np.nan, 20, 10, np.nan, 999),
    "Age": (19, 20, 30, np.nan, np.nan, 25, 20, 52, 46, 9999),
}
mydict

{'Income': (50, 25, 15, 10, nan, nan, 20, 10, nan, 999),
 'Age': (19, 20, 30, nan, nan, 25, 20, 52, 46, 9999)}

In [None]:
# Turn the dictionary into a dataframe: each key becomes a column.
mydata = pd.DataFrame(data=mydict)
mydata

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
3,10.0,
4,,
5,,25.0
6,20.0,20.0
7,10.0,52.0
8,,46.0
9,999.0,9999.0


In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple
mydata.shape

(10, 2)

In [None]:
# Per-column summary statistics; the 999 / 9999 entries are still part of the data here
mydata.describe()

Unnamed: 0,Income,Age
count,7.0,8.0
mean,161.285714,1276.375
std,369.652862,3524.494733
min,10.0,19.0
25%,12.5,20.0
50%,20.0,27.5
75%,37.5,47.5
max,999.0,9999.0


In [None]:
# Replace the invalid values 9999 and 999 with NaN so they are treated as missing.
# np.nan is used because the np.NAN alias was removed in NumPy 2.0.
mydata = mydata.replace([9999, 999], np.nan)
mydata

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
3,10.0,
4,,
5,,25.0
6,20.0,20.0
7,10.0,52.0
8,,46.0
9,,


In [None]:
# Per-column summary statistics of the current contents of mydata
mydata.describe()

Unnamed: 0,Income,Age
count,6.0,7.0
mean,21.666667,30.285714
std,15.055453,13.450084
min,10.0,19.0
25%,11.25,20.0
50%,17.5,25.0
75%,23.75,38.0
max,50.0,52.0


In [None]:
# Boolean mask of the dataframe: True wherever a value is missing
mydata.isna()

Unnamed: 0,Income,Age
0,False,False
1,False,False
2,False,False
3,False,True
4,True,True
5,True,False
6,False,False
7,False,False
8,True,False
9,True,True


In [None]:
# Replace specific values per column: Age 20 -> 15 and Income 25 -> 33
new = mydata.replace(
    to_replace={
        'Age': {20: 15},
        'Income': {25: 33},
    }
)
new

Unnamed: 0,Income,Age
0,50.0,19.0
1,33.0,15.0
2,15.0,30.0
3,10.0,
4,,
5,,25.0
6,20.0,15.0
7,10.0,52.0
8,,46.0
9,,


In [None]:
# Drop every row that contains at least one missing value
new = mydata.dropna(axis='index', how='any')
new

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
6,20.0,20.0
7,10.0,52.0


In [None]:
# Drop only the rows in which every column is missing (how='all');
# rows that still have at least one non-missing value are kept.
new=mydata.dropna(how='all')
new

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
3,10.0,
5,,25.0
6,20.0,20.0
7,10.0,52.0
8,,46.0


In [None]:
# Fill missing values with the mean of the respective column, then round the
# result to 2 decimal places.
new = mydata.fillna(mydata.mean()).round(2)
new

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
3,10.0,30.29
4,21.67,30.29
5,21.67,25.0
6,20.0,20.0
7,10.0,52.0
8,21.67,46.0
9,21.67,30.29


In [None]:
# Fill missing values with a constant chosen per column
new = mydata.fillna(value={'Income': 15, 'Age': 30})
new

Unnamed: 0,Income,Age
0,50.0,19.0
1,25.0,20.0
2,15.0,30.0
3,10.0,30.0
4,15.0,30.0
5,15.0,25.0
6,20.0,20.0
7,10.0,52.0
8,15.0,46.0
9,15.0,30.0
