# Python Para Analise de Dados - Pandas

In [1]:
# Importando as Bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

# Warnings
import sys
import warnings
# Silence every warning unless the interpreter was started with explicit -W
# options (sys.warnoptions is empty when none were given).
# NOTE(review): this also hides pandas deprecation warnings for the whole
# notebook — confirm that is intended.
if not sys.warnoptions:
    warnings.simplefilter('ignore')

In [2]:
# Lendo uma base de dados no formato .csv.
# o parâmetro sep é usado para definir qual o separador entre os dados.
# o parâmetro header informo em qual linha está minhas colunas ou se elas não existem.
# Se não existe colunas (header=None) o pandas dará um número para cada atributo da base.

arquivo = 'dados/kc_house_data.csv'
dataset = pd.read_csv(arquivo, sep=',', header=0)

In [3]:
# Imprimindo o tipo da variável dataset
# Dataframe é um estrutura de dados onde linhas podem ter colunas de diferentes tipos.

type(dataset)

pandas.core.frame.DataFrame

In [4]:
# Metodo head() imprime as 5 linhas iniciais do dataframe.

dataset.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [5]:
# O parâmetro index_col informa a coluna na qual o dataframe será indexado

dataset = pd.read_csv(arquivo, sep=',', index_col='date')
dataset.head(3)

Unnamed: 0_level_0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
20141013T000000,7129300520,221900.0,3.0,1.0,1180,5650,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
20141209T000000,6414100192,538000.0,3.0,2.25,2570,7242,2.0,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
20150225T000000,5631500400,180000.0,2.0,1.0,770,10000,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062


In [6]:
# The usecols parameter restricts the read to the listed columns only.

dataset = pd.read_csv(arquivo, sep=',', usecols=['id', 'date', 'price', 'bedrooms'])
dataset.head(3)

Unnamed: 0,id,date,price,bedrooms
0,7129300520,20141013T000000,221900.0,3.0
1,6414100192,20141209T000000,538000.0,3.0
2,5631500400,20150225T000000,180000.0,2.0


In [7]:
# Chamando novamente o dataset

dataset = pd.read_csv(arquivo, sep=',')
dataset.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [8]:
# Imprimindo a 100 primeiras linhas do dataframe.

dataset.head(100)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1483300570,20140908T000000,905000.0,4.0,2.50,3300,10250,1.0,0,0,...,7,2390,910,1946,1991,98040,47.5873,-122.249,1950,6045
96,3422049190,20150330T000000,247500.0,3.0,1.75,1960,15681,1.0,0,0,...,7,1960,0,1967,0,98032,47.3576,-122.277,1750,15616
97,1099611230,20140912T000000,199000.0,4.0,1.50,1160,6400,1.0,0,0,...,7,1160,0,1975,0,98023,47.3036,-122.378,1160,6400
98,722079104,20140711T000000,314000.0,3.0,1.75,1810,41800,1.0,0,0,...,7,1210,600,1980,0,98038,47.4109,-121.958,1650,135036


In [9]:
# Imprimindo as 5 Ultimas Linhas do DataFrame

dataset.tail()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
21608,263000018,20140521T000000,360000.0,3.0,2.5,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4.0,2.5,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3.0,2.5,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287
21612,1523300157,20141015T000000,325000.0,2.0,0.75,1020,1076,2.0,0,0,...,7,1020,0,2008,0,98144,47.5941,-122.299,1020,1357


In [10]:
# Imprimindo uma Amostra aleatoria do dataset

dataset.sample(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
16450,7016100380,20150423T000000,515000.0,4.0,2.5,1910,8947,1.0,0,0,...,8,1160,750,1970,0,98011,47.7374,-122.183,1920,7350
6639,6662410020,20150319T000000,471000.0,3.0,1.75,1640,10123,1.0,0,0,...,8,1340,300,1977,0,98011,47.7698,-122.167,2210,10852
563,2725069050,20140613T000000,863000.0,4.0,2.5,4120,22370,2.0,0,0,...,10,4120,0,1997,0,98074,47.6239,-122.023,3180,7257
20954,9826700707,20141028T000000,492000.0,3.0,2.5,1690,1479,3.0,0,0,...,8,1420,270,2005,0,98122,47.6022,-122.311,1280,1253
5691,2025600280,20150406T000000,241400.0,3.0,2.0,1420,9828,1.0,0,0,...,7,1420,0,1990,0,98010,47.3287,-122.011,1550,7227
8205,7899800450,20140828T000000,107000.0,2.0,1.0,670,4720,1.0,0,0,...,6,670,0,1948,0,98106,47.5243,-122.358,1480,4720
12447,8651442440,20141023T000000,164000.0,4.0,1.0,1530,4875,2.0,0,0,...,7,1530,0,1977,0,98042,47.3638,-122.091,1470,4875
14265,7683800212,20141218T000000,229000.0,3.0,1.0,1010,12705,1.0,0,0,...,7,1010,0,1959,0,98003,47.3348,-122.303,1490,10200
20458,7237501380,20150507T000000,1267500.0,4.0,3.5,4640,13404,2.0,0,0,...,10,4640,0,2007,0,98059,47.531,-122.134,4690,13590
20369,4385700250,20150407T000000,1800000.0,4.0,3.5,3480,4000,2.0,0,0,...,9,2460,1020,2015,0,98112,47.6356,-122.281,2620,4000


In [11]:
# Atributo columns retorna o nome das colunas do dataframe.

dataset.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [12]:
# count() returns the number of non-null values in each column, so columns
# with missing data report fewer entries than the total number of rows.

dataset.count()

id               21613
date             21613
price            21613
bedrooms         21609
bathrooms        21613
sqft_living      21613
sqft_lot         21613
floors           21612
waterfront       21613
view             21613
condition        21613
grade            21613
sqft_above       21613
sqft_basement    21613
yr_built         21613
yr_renovated     21613
zipcode          21613
lat              21613
long             21613
sqft_living15    21613
sqft_lot15       21613
dtype: int64

In [13]:
# Método describe() exibe informações estatísticas da base de dados. 
# Várias informações como desvio padrão, média, valor mínimo e valor máximo de colunas.

dataset.describe().round(3)

Unnamed: 0,id,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
count,21613.0,21613.0,21609.0,21613.0,21613.0,21613.0,21612.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0,21613.0
mean,4580302000.0,540088.142,3.371,2.115,2079.9,15106.968,1.494,0.008,0.234,3.409,7.657,1788.391,291.509,1971.005,84.402,98077.94,47.56,-122.214,1986.552,12768.456
std,2876566000.0,367127.196,0.93,0.77,918.441,41420.512,0.54,0.087,0.766,0.651,1.175,828.091,442.575,29.373,401.679,53.505,0.139,0.141,685.391,27304.18
min,1000102.0,75000.0,0.0,0.0,290.0,520.0,1.0,0.0,0.0,1.0,1.0,290.0,0.0,1900.0,0.0,98001.0,47.156,-122.519,399.0,651.0
25%,2123049000.0,321950.0,3.0,1.75,1427.0,5040.0,1.0,0.0,0.0,3.0,7.0,1190.0,0.0,1951.0,0.0,98033.0,47.471,-122.328,1490.0,5100.0
50%,3904930000.0,450000.0,3.0,2.25,1910.0,7618.0,1.5,0.0,0.0,3.0,7.0,1560.0,0.0,1975.0,0.0,98065.0,47.572,-122.23,1840.0,7620.0
75%,7308900000.0,645000.0,4.0,2.5,2550.0,10688.0,2.0,0.0,0.0,4.0,8.0,2210.0,560.0,1997.0,0.0,98118.0,47.678,-122.125,2360.0,10083.0
max,9900000000.0,7700000.0,33.0,8.0,13540.0,1651359.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,98199.0,47.778,-121.315,6210.0,871200.0


In [14]:
# Método describe() exibe informações estatísticas da base de dados. 
# Várias informações como desvio padrão, média, valor mínimo e valor máximo de colunas.
# Vendo apenas os formatos de string object

dataset.describe(include='O')

Unnamed: 0,date
count,21613
unique,372
top,20140623T000000
freq,142


In [15]:
# Retorna em formato de tupla a quantidade de linhas e colunas do dataset.

dataset.shape

(21613, 21)

In [16]:
# Imprime informações sobre colunas e uso de memória.

dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21609 non-null  float64
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21612 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

# Analisando um DataFrame com Profiling

In [17]:
# Instalando o Profiling

# !pip install pandas-profiling

In [18]:
# !pip install markupsafe

In [19]:
# Importando o Pandas-profiling

import pandas_profiling

In [95]:
# Lendo o arquivo de dados e Construindo o DataFrame chamado df

arquivo = 'dados/kc_house_data.csv'
df = pd.read_csv(arquivo, sep=',', header=0)

### Usando o Profiling no Jupyter Notebook

In [21]:
# pandas_profiling.ProfileReport(df)

In [22]:
# Importando Como Relatorio HTML
'''profile = pandas_profiling.ProfileReport(df)
profile.to_file('report.html')'''

"profile = pandas_profiling.ProfileReport(df)\nprofile.to_file('report.html')"

# Trabalhando com Grandes Arquivos

+ Quando estamos trabalhando com _**Grandes Arquivos**_ temos um grande desafio, que é gerenciar a memória.
+ Às vezes precisamos manipular uma base de dados muito grande e por isso precisamos trabalhar com os arquivos de forma diferente.
* Uma forma é ler esses arquivos de forma limitada para não consumir toda a _**memória**_ do servidor.

In [23]:
# Lendo as 5 primeiras linhas do arquivo.
# O Parametro nrows especifico ao DataFrame que quero ler apenas a quantidade de linhas desejada

dataset = pd.read_csv(arquivo, sep=',', nrows=5)
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [24]:
# Reading the whole file (no nrows limit), so every row is loaded.

dataset = pd.read_csv(arquivo, sep=',')
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [25]:
# O parâmetro chunksize define em quantas linhas cada bloco irá conter.

chunk = pd.read_csv(arquivo, chunksize=10000)
type(chunk)

pandas.io.parsers.readers.TextFileReader

In [26]:
# Imprimindo tamanho das partes do arquivo chunk.

for parte in chunk:
    print(len(parte))

10000
10000
1613


In [27]:
# O parâmetro chunksize define em quantas linhas cada bloco irá conter.
chunk = pd.read_csv(arquivo, chunksize=10000)

In [28]:
# Iterate over the file in chunks, double 'bedrooms' in each chunk and store
# the combined result as a new column of `dataset`.
# The reader is opened inside this cell because a TextFileReader can be
# consumed only once: reusing an already exhausted reader would leave the list
# empty and make pd.concat raise "No objects to concatenate" on a re-run.

with pd.read_csv(arquivo, chunksize=10000) as leitor:
    lista = [parte['bedrooms'] * 2 for parte in leitor]

# pd.concat keeps the row labels of every chunk, and the column assignment
# aligns the resulting Series with `dataset` by index.
dataset['bedrooms_size'] = pd.concat(lista)

In [29]:
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,8.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,4.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,6.0


## Mais Recursos para trabalhar com grandes bases de dados

1.   Tente trabalhar apenas com as colunas que você vai realmente precisar.
2.   Atente para o tipo de dado de cada coluna.
3.   Visualize qual o separador usado para separar os dados.

In [102]:
# Usando o Metodo para descobrir o separador dos dados

#!head -n 5 dados/kc_house_data.csv

In [31]:
# Ler a base de Dados com o parametro nrows

df = pd.read_csv('dados/kc_house_data.csv', sep=',', nrows=5)
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900,3,1.0,1180,5650,1,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000,3,2.25,2570,7242,2,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000,2,1.0,770,10000,1,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000,4,3.0,1960,5000,1,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000,3,2.0,1680,8080,1,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [32]:
# Importando o nome das colunas com o parametro tolist

df.columns.tolist()

['id',
 'date',
 'price',
 'bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [33]:
# Reading from the data file only the columns we intend to use,
# via the usecols parameter

df = pd.read_csv(arquivo, usecols=['id',
                                 'date', 'price', 'bedrooms',
                                 'bathrooms', 'sqft_living',
                                 'sqft_lot', 'floors'])

df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0
...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0


In [34]:
# Lendo as colunas por posições

df = pd.read_csv(arquivo, usecols=[0, 1, 2, 3, 4, 5])
df

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570
2,5631500400,20150225T000000,180000.0,2.0,1.00,770
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680
...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600


In [35]:
# Lendo o arquivo completo e vendo o uso de memoria

df = pd.read_csv(arquivo, sep=',')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21609 non-null  float64
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21612 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [36]:
# Reading only the first six columns (selected by position) and checking
# the resulting memory usage.

df = pd.read_csv(arquivo, usecols=[0, 1, 2, 3, 4, 5])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           21613 non-null  int64  
 1   date         21613 non-null  object 
 2   price        21613 non-null  float64
 3   bedrooms     21609 non-null  float64
 4   bathrooms    21613 non-null  float64
 5   sqft_living  21613 non-null  int64  
dtypes: float64(3), int64(2), object(1)
memory usage: 1013.2+ KB


In [37]:
# Lendo todas as colunas exceto algumas
# Usando  a expressao lambda

df = pd.read_csv(arquivo, usecols = lambda column: column not in ['sqft_living', 'sqft_lot', 'floors'])
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.0,0,0,3,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,0,0,3,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.0,0,0,3,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.0,0,0,5,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.0,0,0,3,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


# Trabalhando com os tipos de dados adequados

- Atenção para os tipos de dados **object**
- Dados que são categóricos podem receber o tipo de dados *category*

In [38]:
# Importando a base de dados do titanic

df = pd.read_csv('dados/train.csv')
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [39]:
# Informaçoes

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [40]:
# Convert the column dtypes in a single astype() call: categorical columns
# become 'category' and the integer columns listed below are narrowed to int32.
# A column-name mapping is used instead of attribute-style assignment
# (df.Sex = ...), which only works when the attribute name already is a column
# and would not create or convert a column if the name were misspelled.

df = df.astype({
    'Sex': 'category',
    'Embarked': 'category',
    'Survived': 'category',
    'Pclass': 'category',
    'PassengerId': 'int32',
    'Parch': 'int32',
    'SibSp': 'int32',
})

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int32   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    category
 3   Name         891 non-null    object  
 4   Sex          891 non-null    category
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int32   
 7   Parch        891 non-null    int32   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(4), float64(2), int32(3), object(3)
memory usage: 49.4+ KB


**Tivemos uma redução de cerca de 40% no uso de memória (de 83.7 KB para 49.4 KB)**

In [41]:
# Convertendo as colunas na hora da leitura
data = 'dados/train.csv'
df = pd.read_csv(data, dtype={'Embarked':'category', 'Survived':'category', 'Parch':'int32'})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    category
 2   Pclass       891 non-null    int64   
 3   Name         891 non-null    object  
 4   Sex          891 non-null    object  
 5   Age          714 non-null    float64 
 6   SibSp        891 non-null    int64   
 7   Parch        891 non-null    int32   
 8   Ticket       891 non-null    object  
 9   Fare         891 non-null    float64 
 10  Cabin        204 non-null    object  
 11  Embarked     889 non-null    category
dtypes: category(2), float64(2), int32(1), int64(3), object(4)
memory usage: 68.1+ KB


# Consultando um Dataset

* Podemos fazer _**consultas**_ em um Dataframe, de forma semelhante à linguagem SQL.

* Existem métodos interessantes para fazer consultas usando operadores lógicos (>,<,== ).

* Além disso podemos fazer consultas usando instruções de agrupamento, por exemplo. 

* Isso dá muita flexibilidade para o Cientista de Dados na hora de explorar a base de dados.

In [42]:
# Count how many rows hold each distinct value of 'bedrooms'.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().

dataset['bedrooms'].value_counts()

3.0     9822
4.0     6881
2.0     2759
5.0     1601
6.0      272
1.0      199
7.0       38
0.0       13
8.0       13
9.0        6
10.0       3
11.0       1
33.0       1
Name: bedrooms, dtype: int64

In [43]:
# The .loc indexer selects rows (and optionally columns) by label or by
# boolean mask. Here it receives a boolean Series and returns only the rows
# where that Series is True.
# Query: houses with 3 bedrooms

dataset.loc[dataset['bedrooms'] == 3]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0
6,1321400060,20140627T000000,257500.0,3.0,2.25,1715,6819,2.0,0,0,...,1715,0,1995,0,98003,47.3097,-122.327,2238,6819,6.0
7,2008000270,20150115T000000,291850.0,3.0,1.50,1060,9711,1.0,0,0,...,1060,0,1963,0,98198,47.4095,-122.315,1650,9711,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21603,7852140040,20140825T000000,507250.0,3.0,2.50,2270,5536,2.0,0,0,...,2270,0,2003,0,98065,47.5389,-121.881,2270,5731,6.0
21604,9834201367,20150126T000000,429000.0,3.0,2.00,1490,1126,3.0,0,0,...,1490,0,2014,0,98144,47.5699,-122.288,1400,1230,6.0
21607,2997800021,20150219T000000,475000.0,3.0,2.50,1310,1294,2.0,0,0,...,1180,130,2008,0,98116,47.5773,-122.409,1330,1265,6.0
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0


In [44]:
# Usando o método loc() junto com o operador &
# Consulta imóveis com 3 quartos e com o número de banheiros maior que 2

dataset.loc[(dataset['bedrooms'] == 3) & (dataset['bathrooms'] > 2)]

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
6,1321400060,20140627T000000,257500.0,3.0,2.25,1715,6819,2.0,0,0,...,1715,0,1995,0,98003,47.3097,-122.327,2238,6819,6.0
9,3793500160,20150312T000000,323000.0,3.0,2.50,1890,6560,2.0,0,0,...,1890,0,2003,0,98038,47.3684,-122.031,2390,7570,6.0
10,1736800520,20150403T000000,662500.0,3.0,2.50,3560,9796,1.0,0,0,...,1860,1700,1965,0,98007,47.6007,-122.145,2210,8925,6.0
21,2524049179,20140826T000000,2000000.0,3.0,2.75,3050,44867,1.0,0,4,...,2330,720,1968,0,98040,47.5316,-122.233,4110,20336,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21601,5100403806,20150407T000000,467000.0,3.0,2.50,1425,1179,3.0,0,0,...,1425,0,2008,0,98125,47.6963,-122.318,1285,1253,6.0
21603,7852140040,20140825T000000,507250.0,3.0,2.50,2270,5536,2.0,0,0,...,2270,0,2003,0,98065,47.5389,-121.881,2270,5731,6.0
21607,2997800021,20150219T000000,475000.0,3.0,2.50,1310,1294,2.0,0,0,...,1180,130,2008,0,98116,47.5773,-122.409,1330,1265,6.0
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0


In [45]:
# O método sort_values() ordena o dataset pela coluna 'price' em ordem descrescente.
# Apenas o retorno da query será ordenado, não a organização do dataset.

dataset.sort_values(by='price', ascending=False)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
7252,6762700020,20141013T000000,7700000.0,6.0,8.00,12050,27600,2.5,0,3,...,8570,3480,1910,1987,98102,47.6298,-122.323,3940,8800,12.0
3914,9808700762,20140611T000000,7062500.0,5.0,4.50,10040,37325,2.0,1,2,...,7680,2360,1940,2001,98004,47.6500,-122.214,3930,25449,10.0
9254,9208900037,20140919T000000,6885000.0,6.0,7.75,9890,31374,2.0,0,4,...,8860,1030,2001,0,98039,47.6305,-122.240,4540,42730,12.0
4411,2470100110,20140804T000000,5570000.0,5.0,5.75,9200,35069,2.0,0,0,...,6200,3000,2001,0,98039,47.6289,-122.233,3560,24345,10.0
1448,8907500070,20150413T000000,5350000.0,5.0,5.00,8000,23985,2.0,0,4,...,6720,1280,2009,0,98004,47.6232,-122.220,4600,21750,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8274,3883800011,20141105T000000,82000.0,3.0,1.00,860,10426,1.0,0,0,...,860,0,1954,0,98146,47.4987,-122.341,1140,11250,6.0
16198,3028200080,20150324T000000,81000.0,2.0,1.00,730,9975,1.0,0,0,...,730,0,1943,0,98168,47.4808,-122.315,860,9000,4.0
465,8658300340,20140523T000000,80000.0,1.0,0.75,430,5050,1.0,0,0,...,430,0,1912,0,98014,47.6499,-121.909,1200,7500,2.0
15293,40000362,20140506T000000,78000.0,2.0,1.00,780,16344,1.0,0,0,...,780,0,1942,0,98168,47.4739,-122.280,1700,10387,4.0


In [46]:
# count() on a filtered DataFrame returns, for each column, the number of
# non-null values among the rows that match the query (here: 4 bedrooms).

dataset[dataset['bedrooms'] == 4].count()

id               6881
date             6881
price            6881
bedrooms         6881
bathrooms        6881
sqft_living      6881
sqft_lot         6881
floors           6881
waterfront       6881
view             6881
condition        6881
grade            6881
sqft_above       6881
sqft_basement    6881
yr_built         6881
yr_renovated     6881
zipcode          6881
lat              6881
long             6881
sqft_living15    6881
sqft_lot15       6881
bedrooms_size    6881
dtype: int64

# Alterando o Dataset

In [47]:
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,8.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,4.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,6.0


In [48]:
# Adicionando uma coluna ao Dataframe.

dataset['size'] = (dataset['bedrooms'] * 20)
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size,size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0,60.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0,60.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0,40.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0,80.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0,60.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0,60.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,0,2014,0,98146,47.5107,-122.362,1830,7200,8.0,80.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,0,2009,0,98144,47.5944,-122.299,1020,2007,4.0,40.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,0,2004,0,98027,47.5345,-122.069,1410,1287,6.0,60.0


In [49]:
# Helper that turns a numeric size into a categorical label.

def categoriza(s):
    """Map a numeric size to a size label.

    Parameters
    ----------
    s : number
        Value compared against the thresholds 80, 60 and 40.

    Returns
    -------
    str or None
        'Big' when s >= 80, 'Medium' when s >= 60, 'Small' when s >= 40.
        None is returned for anything below 40, and for values such as
        NaN for which every comparison is false.
    """
    # Thresholds are checked from the largest down, so the first match wins.
    faixas = ((80, 'Big'), (60, 'Medium'), (40, 'Small'))
    for limite, rotulo in faixas:
        if s >= limite:
            return rotulo
    return None

In [50]:
# Build a new column by applying categoriza to every value of 'size',
# then display the resulting column.

dataset['cat_size'] = dataset['size'].apply(categoriza)
dataset['cat_size']

0        Medium
1        Medium
2         Small
3           Big
4        Medium
          ...  
21608    Medium
21609       Big
21610     Small
21611    Medium
21612     Small
Name: cat_size, Length: 21613, dtype: object

In [51]:
# Show how many rows fall into each category with value_counts.
# The Series method is used rather than the top-level pd.value_counts helper,
# which is deprecated in recent pandas releases; the result is the same.

dataset['cat_size'].value_counts()

Medium    9822
Big       8816
Small     2759
Name: cat_size, dtype: int64

In [52]:
# drop removes data from the DataFrame.
# axis=1 says that a column, not a row, is being removed.
# inplace=True applies the change to the existing object instead of returning a new one.

dataset.drop(['cat_size'], axis=1, inplace=True)

In [53]:
# Show the first 3 rows of the DataFrame.
dataset.head(3)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size,size
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0,60.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,400,1951,1991,98125,47.721,-122.319,1690,7639,6.0,60.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0,40.0


In [54]:
# Remove the 'size' column (axis=1 selects columns), modifying dataset in place.

dataset.drop(['size'], axis=1, inplace=True)

In [55]:
# Display the first rows of the DataFrame.
dataset.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.0,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.721,-122.319,1690,7639,6.0
2,5631500400,20150225T000000,180000.0,2.0,1.0,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0
3,2487200875,20141209T000000,604000.0,4.0,3.0,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0
4,1954400510,20150218T000000,510000.0,3.0,2.0,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0


### Apagando linhas com base em Condições Lógicas

In [56]:
# Count how many rows there are for each distinct number of bedrooms.

dataset['bedrooms'].value_counts()

3.0     9822
4.0     6881
2.0     2759
5.0     1601
6.0      272
1.0      199
7.0       38
0.0       13
8.0       13
9.0        6
10.0       3
11.0       1
33.0       1
Name: bedrooms, dtype: int64

In [57]:
# Remove, in place, every row whose 'bedrooms' value is 0, then display the result.

dataset.drop(index=dataset.loc[dataset['bedrooms'] == 0].index, inplace=True)
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,8.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,4.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,6.0


In [58]:
# Remove, in place, every row whose 'bedrooms' value is greater than 30, then display the result.

dataset.drop(index=dataset.loc[dataset['bedrooms'] > 30].index, inplace=True)
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,bedrooms_size
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,1180,0,1955,0,98178,47.5112,-122.257,1340,5650,6.0
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639,6.0
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,770,0,1933,0,98028,47.7379,-122.233,2720,8062,4.0
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,1050,910,1965,0,98136,47.5208,-122.393,1360,5000,8.0
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,1680,0,1987,0,98074,47.6168,-122.045,1800,7503,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,1530,0,2009,0,98103,47.6993,-122.346,1530,1509,6.0
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,2310,0,2014,0,98146,47.5107,-122.362,1830,7200,8.0
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,1020,0,2009,0,98144,47.5944,-122.299,1020,2007,4.0
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,1600,0,2004,0,98027,47.5345,-122.069,1410,1287,6.0


In [59]:
# Count again how many rows there are for each distinct number of bedrooms.

dataset['bedrooms'].value_counts()

3.0     9822
4.0     6881
2.0     2759
5.0     1601
6.0      272
1.0      199
7.0       38
8.0       13
9.0        6
10.0       3
11.0       1
Name: bedrooms, dtype: int64

### Percorrendo linhas de um DataFrame Pandas

- Método iterrows() permite percorrer por todas as linhas de um dataframe.
- Esse método retorna um objeto **iterator** que contém o índice de cada linha e a própria linha como um objeto do tipo Series.

In [61]:
# Check the type of the object returned by iterrows().

type(dataset.iterrows())

generator

In [62]:
# Fetch the first item produced by iterrows(): an (index, row) pair.

next(dataset.iterrows())

(0,
 id                    7129300520
 date             20141013T000000
 price                   221900.0
 bedrooms                     3.0
 bathrooms                    1.0
 sqft_living                 1180
 sqft_lot                    5650
 floors                       1.0
 waterfront                     0
 view                           0
 condition                      3
 grade                          7
 sqft_above                  1180
 sqft_basement                  0
 yr_built                    1955
 yr_renovated                   0
 zipcode                    98178
 lat                      47.5112
 long                    -122.257
 sqft_living15               1340
 sqft_lot15                  5650
 bedrooms_size                6.0
 Name: 0, dtype: object)

In [63]:
# Iterate over the rows returned by head() and print each index followed by its row.

for indice, linha in dataset.head().iterrows():
    print(indice, linha)

0 id                    7129300520
date             20141013T000000
price                   221900.0
bedrooms                     3.0
bathrooms                    1.0
sqft_living                 1180
sqft_lot                    5650
floors                       1.0
waterfront                     0
view                           0
condition                      3
grade                          7
sqft_above                  1180
sqft_basement                  0
yr_built                    1955
yr_renovated                   0
zipcode                    98178
lat                      47.5112
long                    -122.257
sqft_living15               1340
sqft_lot15                  5650
bedrooms_size                6.0
Name: 0, dtype: object
1 id                    6414100192
date             20141209T000000
price                   538000.0
bedrooms                     3.0
bathrooms                   2.25
sqft_living                 2570
sqft_lot                    7242
floors          

In [64]:
# Iterate over the first 10 rows and read individual columns of each row by name.

for indice, linha in dataset.head(10).iterrows():
    print(indice, linha['bedrooms'], linha['floors'], linha['price'])

0 3.0 1.0 221900.0
1 3.0 2.0 538000.0
2 2.0 1.0 180000.0
3 4.0 1.0 604000.0
4 3.0 1.0 510000.0
5 4.0 1.0 1225000.0
6 3.0 2.0 257500.0
7 3.0 1.0 291850.0
8 3.0 1.0 229500.0
9 3.0 2.0 323000.0


### Atualizando o DataFrame ao percorrer linha a linha

In [68]:
# Show the first 5 prices before the update.

dataset['price'].head()

0    221900.0
1    538000.0
2    180000.0
3    604000.0
4    510000.0
Name: price, dtype: float64

In [69]:
# Walk the DataFrame row by row and update it along the way:
# each row's 'price' is replaced by twice its current value.
# The write goes through dataset.at[index, column]; `linha` is only read.
# NOTE(review): re-running this cell doubles the prices again, and a vectorized
# `dataset['price'] * 2` would presumably give the same result without a loop.

for indice, linha in dataset.iterrows():
    dataset.at[indice, 'price'] = linha['price'] * 2

In [70]:
# Show the first 5 prices after the update.

dataset['price'].head()

0     443800.0
1    1076000.0
2     360000.0
3    1208000.0
4    1020000.0
Name: price, dtype: float64

### Percorrendo um dataframe com o método itertuples()

- Retorna as linhas e índice em formato de tuplas.
- Costuma ser mais rápido que o iterrows()

In [77]:
# Iterate over the rows returned by head() with itertuples(),
# printing each row tuple followed by a blank line.

for linha in dataset.head().itertuples():
    print(linha, end='\n\n')

Pandas(Index=0, id=7129300520, date='20141013T000000', price=443800.0, bedrooms=3.0, bathrooms=1.0, sqft_living=1180, sqft_lot=5650, floors=1.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=1180, sqft_basement=0, yr_built=1955, yr_renovated=0, zipcode=98178, lat=47.5112, long=-122.257, sqft_living15=1340, sqft_lot15=5650, bedrooms_size=6.0)

Pandas(Index=1, id=6414100192, date='20141209T000000', price=1076000.0, bedrooms=3.0, bathrooms=2.25, sqft_living=2570, sqft_lot=7242, floors=2.0, waterfront=0, view=0, condition=3, grade=7, sqft_above=2170, sqft_basement=400, yr_built=1951, yr_renovated=1991, zipcode=98125, lat=47.721, long=-122.319, sqft_living15=1690, sqft_lot15=7639, bedrooms_size=6.0)

Pandas(Index=2, id=5631500400, date='20150225T000000', price=360000.0, bedrooms=2.0, bathrooms=1.0, sqft_living=770, sqft_lot=10000, floors=1.0, waterfront=0, view=0, condition=3, grade=6, sqft_above=770, sqft_basement=0, yr_built=1933, yr_renovated=0, zipcode=98028, lat=47.7379, long=

In [78]:
# Print selected fields of each row tuple, accessing the columns by attribute name.

for linha in dataset.head().itertuples():
    print(linha.id, linha.bedrooms, linha.price)

7129300520 3.0 443800.0
6414100192 3.0 1076000.0
5631500400 2.0 360000.0
2487200875 4.0 1208000.0
1954400510 3.0 1020000.0


# Missing Values

* **Missing Values** são valores faltantes em colunas. Eles podem ser oriundos de falhas em cargas de dados, falhas em crawlers ou até mesmo corrupção de dados.

* Missing Values podem ser um problema em várias situações, como por exemplo, algoritmos de machine learning que não trabalham bem com dados faltantes.

* Estes também podem atrapalhar resultados de análises.

* Vamos aprender como encontrar missing values na base de dados e como manipular esses valores.

In [83]:
# Read the CSV file again into dataset (comma-separated, column names taken
# from the first line) and display it.
arquivo = 'dados/kc_house_data.csv'
dataset = pd.read_csv(arquivo, sep=',', header=0)
dataset

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3.0,1.00,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3.0,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.7210,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2.0,1.00,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4.0,3.00,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3.0,2.00,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21608,263000018,20140521T000000,360000.0,3.0,2.50,1530,1131,3.0,0,0,...,8,1530,0,2009,0,98103,47.6993,-122.346,1530,1509
21609,6600060120,20150223T000000,400000.0,4.0,2.50,2310,5813,2.0,0,0,...,8,2310,0,2014,0,98146,47.5107,-122.362,1830,7200
21610,1523300141,20140623T000000,402101.0,2.0,0.75,1020,1350,2.0,0,0,...,7,1020,0,2009,0,98144,47.5944,-122.299,1020,2007
21611,291310100,20150116T000000,400000.0,3.0,2.50,1600,2388,2.0,0,0,...,8,1600,0,2004,0,98027,47.5345,-122.069,1410,1287


In [89]:
# Count the missing values in each column.

dataset.isna().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

In [81]:
# Remove every row that has at least one missing value in any column.

dataset.dropna(inplace=True) # After running this cell, re-run the missing-value count in the cell above

In [85]:
# It is also possible to remove only the rows whose values are missing in every column:

dataset.dropna(how='all', inplace=True) # A row is removed only when all of its columns are missing

In [87]:
# Fill the missing values of the 'floors' column with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on dataset['floors'] is a chained in-place update, which pandas deprecates
# and which does not modify dataset when Copy-on-Write is enabled.

dataset['floors'] = dataset['floors'].fillna(dataset['floors'].mean())

In [88]:
# Fill the missing values of the 'bedrooms' column with 1.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on dataset['bedrooms'] is a chained in-place update, which pandas deprecates
# and which does not modify dataset when Copy-on-Write is enabled.

dataset['bedrooms'] = dataset['bedrooms'].fillna(1)

# Visualizando dados

In [92]:
# Plot the house prices. Series.plot() without a `kind` argument draws a
# line chart, not a bar chart.
# NOTE(review): this switches the matplotlib backend to `notebook` in the middle
# of the notebook — confirm that backend is available in the target environment.

%matplotlib notebook
dataset['price'].plot()

<IPython.core.display.Javascript object>

<AxesSubplot:>

In [93]:
# Scatter plot of price against the number of bedrooms.

dataset.plot.scatter(x='bedrooms', y='price', title='Bedrooms X Price', color='r')

<IPython.core.display.Javascript object>

<AxesSubplot:title={'center':'Bedrooms X Price'}, xlabel='bedrooms', ylabel='price'>

In [94]:
# Scatter plot of price against the number of bathrooms.

dataset.plot.scatter(x='bathrooms', y='price', color='y')

<IPython.core.display.Javascript object>

<AxesSubplot:xlabel='bathrooms', ylabel='price'>