\
**Data Analysis with Python and Pandas**

In [39]:
import pandas as pd

In [40]:
# Read one sales workbook per city; the five frames are combined in a later cell.
df1, df2, df3, df4, df5 = map(
    pd.read_excel,
    (
        '/content/Aracaju.xlsx',
        '/content/Fortaleza.xlsx',
        '/content/Natal.xlsx',
        '/content/Recife.xlsx',
        '/content/Salvador.xlsx',
    ),
)

In [41]:
# Stack the per-city frames row-wise. Each frame keeps its own row labels,
# so index values repeat across cities in the combined frame.
df = pd.concat(objs=(df1, df2, df3, df4, df5))

In [42]:
# Preview the first five rows of the combined frame.
df.head(n=5)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde
0,Aracaju,2018-01-01,142.0,1520,1
1,Aracaju,2018-01-01,14.21,1522,6
2,Aracaju,2018-01-01,71.55,1520,1
3,Aracaju,2018-01-01,3.01,1521,7
4,Aracaju,2018-01-01,24.51,1522,8


In [43]:
# Preview the last five rows of the combined frame.
df.tail(n=5)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde
235,Salvador,2019-01-03,,1034,1
236,Salvador,2019-01-03,,1035,3
237,Salvador,2019-01-03,38.06,1036,3
238,Salvador,2019-01-03,139.64,1035,1
239,Salvador,2019-01-03,161.41,1037,3


In [44]:
# Show four randomly chosen rows. A fixed random_state makes the draw
# repeatable across kernel restarts instead of changing on every run.
df.sample(n=4, random_state=42)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde
63,Natal,2018-02-12,552.0,852,2
120,Fortaleza,2019-03-02,184.9,982,4
137,Salvador,2019-03-02,51.66,1036,3
21,Fortaleza,2019-01-01,40.63,1004,3


In [45]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

Cidade            object
Data      datetime64[ns]
Vendas           float64
LojaID             int64
Qtde               int64
dtype: object

In [46]:
# Store LojaID as object rather than int64: it is an identifier, and the
# conversion is expressed as a column -> dtype mapping on the whole frame.
df = df.astype({'LojaID': 'object'})

In [47]:
# Confirm that LojaID is now stored as object.
df.dtypes

Cidade            object
Data      datetime64[ns]
Vendas           float64
LojaID            object
Qtde               int64
dtype: object

In [48]:
# Preview the first five rows again after the dtype change.
df.head(n=5)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde
0,Aracaju,2018-01-01,142.0,1520,1
1,Aracaju,2018-01-01,14.21,1522,6
2,Aracaju,2018-01-01,71.55,1520,1
3,Aracaju,2018-01-01,3.01,1521,7
4,Aracaju,2018-01-01,24.51,1522,8


**Handling missing values**

In [49]:
# Count the missing values in each column.
df.isna().sum()

Cidade    0
Data      0
Vendas    5
LojaID    0
Qtde      0
dtype: int64

In [50]:
# Replace missing 'Vendas' values with the column mean.
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on the df['Vendas'] selection is a chained
# in-place update: recent pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves df unchanged.
df['Vendas'] = df['Vendas'].fillna(df['Vendas'].mean())

In [51]:
# Re-count the missing values per column to verify the fill.
df.isna().sum()

Cidade    0
Data      0
Vendas    0
LojaID    0
Qtde      0
dtype: int64

In [52]:
# Mean of the 'Vendas' column. Filling gaps with the mean itself leaves this
# value the same as before the fill (up to floating-point rounding).
df['Vendas'].mean()

122.81588301462315

In [53]:
# Alternative strategy: replace missing 'Vendas' values with 0.
# The filled Series is assigned back to the frame. Calling
# fillna(..., inplace=True) on the df['Vendas'] selection is a chained
# in-place update: recent pandas 2.x emits a FutureWarning for it, and with
# Copy-on-Write enabled it leaves df unchanged.
# NOTE: the null count shown earlier in the notebook was already zero for
# 'Vendas', so this cell is expected to change nothing at this point.
df['Vendas'] = df['Vendas'].fillna(0)

In [54]:
# Drop every row that has a missing value in any column. The null count
# shown earlier was zero for all columns, so this is expected to remove
# nothing at this point.
df = df.dropna()

In [55]:
# Drop only the rows whose 'Vendas' value is missing; gaps in any other
# column do not cause a row to be removed.
df = df.dropna(subset=['Vendas'])

In [56]:
# Drop only the rows in which every column is missing.
df = df.dropna(how='all')

**Creating new columns**

In [57]:
# Revenue column: 'Vendas' multiplied element-wise by 'Qtde'.
df['Receita'] = df['Vendas'] * df['Qtde']

In [58]:
# Preview the first five rows, now including the 'Receita' column.
df.head(n=5)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde,Receita
0,Aracaju,2018-01-01,142.0,1520,1,142.0
1,Aracaju,2018-01-01,14.21,1522,6,85.26
2,Aracaju,2018-01-01,71.55,1520,1,71.55
3,Aracaju,2018-01-01,3.01,1521,7,21.07
4,Aracaju,2018-01-01,24.51,1522,8,196.08


In [59]:
# Ratio column: 'Receita' divided element-wise by 'Vendas'. Because 'Receita'
# was built as Vendas * Qtde, the ratio matches 'Qtde' (as floats) in the
# rows previewed in this notebook.
df['Receita/Vendas'] = df['Receita'].div(df['Vendas'])

In [60]:
# Preview the first five rows, now including the 'Receita/Vendas' column.
df.head(n=5)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde,Receita,Receita/Vendas
0,Aracaju,2018-01-01,142.0,1520,1,142.0,1.0
1,Aracaju,2018-01-01,14.21,1522,6,85.26,6.0
2,Aracaju,2018-01-01,71.55,1520,1,71.55,1.0
3,Aracaju,2018-01-01,3.01,1521,7,21.07,7.0
4,Aracaju,2018-01-01,24.51,1522,8,196.08,8.0


In [61]:
# Largest single-row value in the 'Receita' column.
df['Receita'].max()

3544.0

In [62]:
# Smallest single-row value in the 'Receita' column.
df['Receita'].min()

3.34

In [63]:
# nlargest: the 3 rows with the highest 'Receita', largest first.
df.nlargest(n=3, columns='Receita')

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde,Receita,Receita/Vendas
7,Natal,2019-03-18,886.0,853,4,3544.0,4.0
51,Natal,2018-01-21,859.0,852,4,3436.0,4.0
55,Natal,2019-01-08,859.0,854,4,3436.0,4.0


In [67]:
# nsmallest: the 6 rows with the lowest 'Receita', smallest first.
df.nsmallest(n=6, columns='Receita')

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde,Receita,Receita/Vendas
118,Aracaju,2018-01-01,3.34,1522,1,3.34,1.0
65,Recife,2019-01-01,4.01,981,1,4.01,1.0
92,Natal,2019-01-02,4.57,1035,1,4.57,1.0
92,Salvador,2019-01-01,4.57,1035,1,4.57,1.0
95,Natal,2019-01-02,5.13,1035,1,5.13,1.0
95,Salvador,2019-01-01,5.13,1035,1,5.13,1.0


In [65]:
# Total 'Receita' per city.
(
    df
    .groupby('Cidade')['Receita']
    .sum()
)

Cidade
Aracaju       48748.250000
Fortaleza     37913.970000
Natal        167227.520000
Recife        51936.510000
Salvador      40797.072947
Name: Receita, dtype: float64

In [66]:
# Sort the whole frame by 'Receita' from largest to smallest and keep the
# first ten rows.
(
    df
    .sort_values(by='Receita', ascending=False)
    .head(n=10)
)

Unnamed: 0,Cidade,Data,Vendas,LojaID,Qtde,Receita,Receita/Vendas
7,Natal,2019-03-18,886.0,853,4,3544.0,4.0
55,Natal,2019-01-08,859.0,854,4,3436.0,4.0
51,Natal,2018-01-21,859.0,852,4,3436.0,4.0
30,Natal,2018-10-02,856.0,853,4,3424.0,4.0
41,Natal,2018-05-20,835.0,852,4,3340.0,4.0
38,Natal,2018-02-25,828.0,852,4,3312.0,4.0
10,Natal,2018-10-27,828.0,852,4,3312.0,4.0
69,Natal,2019-03-24,817.0,852,4,3268.0,4.0
62,Natal,2018-02-10,793.0,854,4,3172.0,4.0
52,Natal,2018-04-27,778.0,854,4,3112.0,4.0
