In [1]:
import numpy as np
import pandas as pd
from openpyxl import load_workbook
from openpyxl.pivot.fields import Missing

In [2]:
file_path = '../data/vendas-combustiveis-m3.xlsx'

In [3]:
workbook = load_workbook(file_path)
worksheet = workbook['Plan1']

In [5]:
pivot_name = 'Tabela dinâmica3'

In [6]:
# Extract the pivot table object from the worksheet
pivot_table = [p for p in worksheet._pivots if p.name == pivot_name][0]


In [7]:
# Extract a dict of all cache fields and their respective values
fields_map = {}
for field in pivot_table.cache.cacheFields:
    if field.sharedItems.count > 0:
        # take care of cases where f.v returns an AttributeError because the cell is empty
        # fields_map[field.name] = [f.v for f in field.sharedItems._fields]
        l = []
        for f in field.sharedItems._fields:
            try:
                l += [f.v]
            except AttributeError:
                l += [""]
        fields_map[field.name] = l


In [8]:
# Extract all rows from cache records. Each row is initially parsed as a dict
column_names = [field.name for field in pivot_table.cache.cacheFields]


In [9]:
rows = []
for record in pivot_table.cache.records.r:
    # If some field in the record in missing, we replace it by NaN
    record_values = [
        field.v if not isinstance(field, Missing) else np.nan for field in record._fields
    ]

    row_dict = {k: v for k, v in zip(column_names, record_values)}

    # Shared fields are mapped as an Index, so we replace the field index by its value
    for key in fields_map:
        row_dict[key] = fields_map[key][row_dict[key]]

    rows.append(row_dict)


In [10]:
df = pd.DataFrame.from_dict(rows)
df

Unnamed: 0,COMBUSTÍVEL,ANO,REGIÃO,ESTADO,Jan,Fev,Mar,Abr,Mai,Jun,Jul,Ago,Set,Out,Nov,Dez,TOTAL
0,ÓLEO DIESEL S-10 (m3),2013.0,REGIÃO NORTE,RONDÔNIA,81453.67,3517.6,3681.7,4700.67,5339.2,6166.4,6539.65,7283.7,8082.85,7902.55,9383.15,9767.4,9088.8
1,ÓLEO DIESEL S-10 (m3),2013.0,REGIÃO NORTE,ACRE,1483.0,11202.0,363.0,410.0,536.0,607.0,740.0,756.0,971.0,1174.0,1240.0,1439.0,1483.0
2,ÓLEO DIESEL S-10 (m3),2013.0,REGIÃO NORTE,AMAZONAS,6836.3,6784.232,61443.832,3190.585,3305.0,3391.0,3637.0,4250.0,4576.0,5756.879,6228.636,6334.0,7154.2
3,ÓLEO DIESEL S-10 (m3),2013.0,REGIÃO NORTE,RORAIMA,1475.3,1502.7,1531.8,13423.7,795.4,757.2,939.8,1040.6,966.0,992.9,1027.0,1083.8,1311.2
4,ÓLEO DIESEL S-10 (m3),2013.0,REGIÃO NORTE,PARÁ,40913.48,45383.5,44013.219,41975.03,441140.785,30137.8,28146.3,31280.5,33033.05,33519.88,34321.53,37168.16,41248.336
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1075,ÓLEO DIESEL (OUTROS ) (m3),2020.0,REGIÃO SUL,RIO GRANDE DO SUL,0.0,0.0,10.0,10.0,0.0,10.0,,,,60.0,10.0,10.0,10.0
1076,ÓLEO DIESEL (OUTROS ) (m3),2020.0,REGIÃO CENTRO-OESTE,MATO GROSSO DO SUL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,0.0,0.0,0.0
1077,ÓLEO DIESEL (OUTROS ) (m3),2020.0,REGIÃO CENTRO-OESTE,MATO GROSSO,45.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,,90.0,45.0
1078,ÓLEO DIESEL (OUTROS ) (m3),2020.0,REGIÃO CENTRO-OESTE,GOIÁS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.4,,,,41.4


In [68]:
df.columns

Index(['COMBUSTÍVEL', 'ANO', 'REGIÃO', 'ESTADO', 'Jan', 'Fev', 'Mar', 'Abr',
       'Mai', 'Jun', 'Jul', 'Ago', 'Set', 'Out', 'Nov', 'Dez', 'TOTAL'],
      dtype='object')

In [69]:
df.shape

(4536, 17)

In [72]:
df.dtypes

COMBUSTÍVEL     object
ANO            float64
ESTADO          object
Jan             object
Fev             object
Mar             object
Abr             object
Mai             object
Jun             object
Jul             object
Ago             object
Set             object
Out             object
Nov             object
Dez             object
dtype: object

In [71]:
cols_to_drop = ['TOTAL','REGIÃO']
df.drop(columns=cols_to_drop, inplace=True)

In [74]:
dict_m = {"Jan": 1, "Fev": 2, "Mar": 3, "Abr": 4, "Mai": 5,"Jun": 6, "Jul": 7, "Ago": 8, "Set": 9, "Out": 10, "Nov": 11, "Dez": 12}

In [75]:
df = pd.melt(df, id_vars = ["COMBUSTÍVEL", "ANO", "ESTADO"], value_vars = dict_m.keys(), var_name = "month", value_name = "volume")


In [76]:
df

Unnamed: 0,COMBUSTÍVEL,ANO,ESTADO,month,volume
0,GASOLINA C (m3),2000.0,RONDÔNIA,Jan,136073.253
1,GASOLINA C (m3),2000.0,ACRE,Jan,3358.346
2,GASOLINA C (m3),2000.0,AMAZONAS,Jan,20766.918
3,GASOLINA C (m3),2000.0,RORAIMA,Jan,3716.032
4,GASOLINA C (m3),2000.0,PARÁ,Jan,29755.907
...,...,...,...,...,...
54427,GLP (m3),2020.0,RIO GRANDE DO SUL,Dez,64045.161232
54428,GLP (m3),2020.0,MATO GROSSO DO SUL,Dez,16281.139493
54429,GLP (m3),2020.0,MATO GROSSO,Dez,18321.987319
54430,GLP (m3),2020.0,GOIÁS,Dez,46850.585145


In [78]:
df['unit'] = 'm3'

from datetime import datetime
# timestamp = datetime.now()
# timestamp_series = pd.to_datetime(timestamp)
df['created_at'] = datetime.now()

In [82]:
df.rename(columns={'COMBUSTÍVEL': 'product','ESTADO': 'uf'},inplace=True)

In [83]:
df

Unnamed: 0,product,ANO,uf,month,volume,unit,created_at
0,GASOLINA C (m3),2000.0,RONDÔNIA,Jan,136073.253,m3,2023-10-10 13:32:01.584206
1,GASOLINA C (m3),2000.0,ACRE,Jan,3358.346,m3,2023-10-10 13:32:01.584206
2,GASOLINA C (m3),2000.0,AMAZONAS,Jan,20766.918,m3,2023-10-10 13:32:01.584206
3,GASOLINA C (m3),2000.0,RORAIMA,Jan,3716.032,m3,2023-10-10 13:32:01.584206
4,GASOLINA C (m3),2000.0,PARÁ,Jan,29755.907,m3,2023-10-10 13:32:01.584206
...,...,...,...,...,...,...,...
54427,GLP (m3),2020.0,RIO GRANDE DO SUL,Dez,64045.161232,m3,2023-10-10 13:32:01.584206
54428,GLP (m3),2020.0,MATO GROSSO DO SUL,Dez,16281.139493,m3,2023-10-10 13:32:01.584206
54429,GLP (m3),2020.0,MATO GROSSO,Dez,18321.987319,m3,2023-10-10 13:32:01.584206
54430,GLP (m3),2020.0,GOIÁS,Dez,46850.585145,m3,2023-10-10 13:32:01.584206


In [84]:
df.dtypes

product               object
ANO                  float64
uf                    object
month                 object
volume                object
unit                  object
created_at    datetime64[ns]
dtype: object

In [92]:
df['year_month'] = (df['ANO'].astype(int).astype(str) + '-' + df['month'].replace(dict_m).astype(str))

In [94]:
df['year_month'] = pd.to_datetime(df['year_month'], format='%Y-%m')


In [96]:
columns_order = ['year_month','uf','product','unit','volume','created_at']
df = df[columns_order]


In [99]:
df.dtypes

year_month    datetime64[ns]
uf                    object
product               object
unit                  object
volume                object
created_at    datetime64[ns]
dtype: object

In [100]:
df

Unnamed: 0,year_month,uf,product,unit,volume,created_at
0,2000-01-01,RONDÔNIA,GASOLINA C (m3),m3,136073.253,2023-10-10 13:32:01.584206
1,2000-01-01,ACRE,GASOLINA C (m3),m3,3358.346,2023-10-10 13:32:01.584206
2,2000-01-01,AMAZONAS,GASOLINA C (m3),m3,20766.918,2023-10-10 13:32:01.584206
3,2000-01-01,RORAIMA,GASOLINA C (m3),m3,3716.032,2023-10-10 13:32:01.584206
4,2000-01-01,PARÁ,GASOLINA C (m3),m3,29755.907,2023-10-10 13:32:01.584206
...,...,...,...,...,...,...
54427,2020-12-01,RIO GRANDE DO SUL,GLP (m3),m3,64045.161232,2023-10-10 13:32:01.584206
54428,2020-12-01,MATO GROSSO DO SUL,GLP (m3),m3,16281.139493,2023-10-10 13:32:01.584206
54429,2020-12-01,MATO GROSSO,GLP (m3),m3,18321.987319,2023-10-10 13:32:01.584206
54430,2020-12-01,GOIÁS,GLP (m3),m3,46850.585145,2023-10-10 13:32:01.584206


In [104]:
len(df['product'].unique().tolist())

8

In [103]:
12*21

252

In [57]:
df['ESTADO'] = df['ESTADO'].astype('string')

In [58]:
df.dtypes

COMBUSTÍVEL     string
ANO            float64
ESTADO          string
Jan             object
Fev             object
Mar             object
Abr             object
Mai             object
Jun             object
Jul             object
Ago             object
Set             object
Out             object
Nov             object
Dez             object
dtype: object

In [59]:
df['unit'] = 'm3'


In [60]:
df['unit'] = df['unit'].astype('string')

In [61]:
from datetime import datetime
timestamp = datetime.now()
timestamp_series = pd.to_datetime(timestamp)
df['created_at'] = timestamp_series

In [62]:
df.dtypes

COMBUSTÍVEL            string
ANO                   float64
ESTADO                 string
Jan                    object
Fev                    object
Mar                    object
Abr                    object
Mai                    object
Jun                    object
Jul                    object
Ago                    object
Set                    object
Out                    object
Nov                    object
Dez                    object
unit                   string
created_at     datetime64[ns]
dtype: object

In [63]:
MONTH_DICT = {
    "Jan": 1, 
    "Fev": 2, 
    "Mar": 3, 
    "Abr": 4, 
    "Mai": 5,
    "Jun": 6, 
    "Jul": 7, 
    "Ago": 8, 
    "Set": 9, 
    "Out": 10, 
    "Nov": 11, 
    "Dez": 12
}

In [64]:
df = pd.melt(df, id_vars = ["ANO", "ESTADO", "COMBUSTÍVEL", "unit", "created_at"], value_vars = MONTH_DICT.keys(), var_name = "month", value_name = "volume")


In [49]:
df2 = pd.melt(df, id_vars = ["ANO", "ESTADO", "COMBUSTÍVEL", "unit","created_at"], var_name = "month", value_name = "volume")


In [35]:
df2

Unnamed: 0,ANO,ESTADO,COMBUSTÍVEL,unit,month,volume
0,2000.0,RONDÔNIA,GASOLINA C (m3),m3,month,Jan
1,2000.0,ACRE,GASOLINA C (m3),m3,month,Jan
2,2000.0,AMAZONAS,GASOLINA C (m3),m3,month,Jan
3,2000.0,RORAIMA,GASOLINA C (m3),m3,month,Jan
4,2000.0,PARÁ,GASOLINA C (m3),m3,month,Jan
...,...,...,...,...,...,...
108859,2020.0,RIO GRANDE DO SUL,GLP (m3),m3,volume,64045.161232
108860,2020.0,MATO GROSSO DO SUL,GLP (m3),m3,volume,16281.139493
108861,2020.0,MATO GROSSO,GLP (m3),m3,volume,18321.987319
108862,2020.0,GOIÁS,GLP (m3),m3,volume,46850.585145


In [65]:
df

Unnamed: 0,ANO,ESTADO,COMBUSTÍVEL,unit,created_at,month,volume
0,2000.0,RONDÔNIA,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,Jan,136073.253
1,2000.0,ACRE,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,Jan,3358.346
2,2000.0,AMAZONAS,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,Jan,20766.918
3,2000.0,RORAIMA,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,Jan,3716.032
4,2000.0,PARÁ,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,Jan,29755.907
...,...,...,...,...,...,...,...
54427,2020.0,RIO GRANDE DO SUL,GLP (m3),m3,2023-10-10 13:07:50.805680,Dez,64045.161232
54428,2020.0,MATO GROSSO DO SUL,GLP (m3),m3,2023-10-10 13:07:50.805680,Dez,16281.139493
54429,2020.0,MATO GROSSO,GLP (m3),m3,2023-10-10 13:07:50.805680,Dez,18321.987319
54430,2020.0,GOIÁS,GLP (m3),m3,2023-10-10 13:07:50.805680,Dez,46850.585145


In [66]:
df.replace({"month": MONTH_DICT})

Unnamed: 0,ANO,ESTADO,COMBUSTÍVEL,unit,created_at,month,volume
0,2000.0,RONDÔNIA,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,1,136073.253
1,2000.0,ACRE,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,1,3358.346
2,2000.0,AMAZONAS,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,1,20766.918
3,2000.0,RORAIMA,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,1,3716.032
4,2000.0,PARÁ,GASOLINA C (m3),m3,2023-10-10 13:07:50.805680,1,29755.907
...,...,...,...,...,...,...,...
54427,2020.0,RIO GRANDE DO SUL,GLP (m3),m3,2023-10-10 13:07:50.805680,12,64045.161232
54428,2020.0,MATO GROSSO DO SUL,GLP (m3),m3,2023-10-10 13:07:50.805680,12,16281.139493
54429,2020.0,MATO GROSSO,GLP (m3),m3,2023-10-10 13:07:50.805680,12,18321.987319
54430,2020.0,GOIÁS,GLP (m3),m3,2023-10-10 13:07:50.805680,12,46850.585145


In [106]:
df.dtypes

year_month    datetime64[ns]
uf                    object
product               object
unit                  object
volume                object
created_at    datetime64[ns]
dtype: object

In [109]:
df['uf'] = df['uf'].astype('string')
df['product'] = df['product'].astype('string')
df['unit'] = df['unit'].astype('string')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['uf'] = df['uf'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['product'] = df['product'].astype('string')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['unit'] = df['unit'].astype('string')


In [128]:
df.dtypes

year_month    datetime64[ns]
uf                    string
product               string
unit                  string
volume               float64
created_at    datetime64[ns]
dtype: object

In [127]:
# df['volume'] = df['volume'].astype(float)
# df['volume'] = df['volume'].astype(float)
df['volume'] = pd.to_numeric(df['volume'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['volume'] = pd.to_numeric(df['volume'])


In [116]:
import numpy as np
# df[column] = df[column].astype(np.float64)

# df['volume'] = df['volume'].astype(np.float64)

ValueError: could not convert string to float: ''

In [137]:
df

Unnamed: 0,year_month,uf,product,unit,volume,created_at
0,2000-01-01,RONDÔNIA,GASOLINA C (m3),m3,136073.253000,2023-10-10 13:32:01.584206
1,2000-01-01,ACRE,GASOLINA C (m3),m3,3358.346000,2023-10-10 13:32:01.584206
2,2000-01-01,AMAZONAS,GASOLINA C (m3),m3,20766.918000,2023-10-10 13:32:01.584206
3,2000-01-01,RORAIMA,GASOLINA C (m3),m3,3716.032000,2023-10-10 13:32:01.584206
4,2000-01-01,PARÁ,GASOLINA C (m3),m3,29755.907000,2023-10-10 13:32:01.584206
...,...,...,...,...,...,...
54427,2020-12-01,RIO GRANDE DO SUL,GLP (m3),m3,64045.161232,2023-10-10 13:32:01.584206
54428,2020-12-01,MATO GROSSO DO SUL,GLP (m3),m3,16281.139493,2023-10-10 13:32:01.584206
54429,2020-12-01,MATO GROSSO,GLP (m3),m3,18321.987319,2023-10-10 13:32:01.584206
54430,2020-12-01,GOIÁS,GLP (m3),m3,46850.585145,2023-10-10 13:32:01.584206


In [139]:
output_dir = '../data/partitioned_parquet_data'

# Specify the columns to use for partitioning
partition_cols = ['year_month']

# Save the DataFrame to Parquet with partitioning
df.to_parquet(output_dir, partition_cols=partition_cols, engine='pyarrow',compression='snappy')


In [130]:
import pyarrow as pa
import pyarrow.parquet as pq

In [131]:
tab = pa.Table.from_pandas(df)


In [140]:
pq.write_table(tab, file_path, compression='SNAPPY')


In [141]:
pq.write_table(tab, './df_pq_snappy', compression='SNAPPY')

In [143]:
df.to_parquet('./df_to_parquet_p_uf',partition_cols=['uf'],engine='pyarrow')
# df.to_parquet(output_dir, partition_cols=partition_cols, engine='pyarrow')


In [145]:
import pandas as pd

In [146]:
df_oil = pd.read_parquet('../data/oil_parquet')

In [147]:
df_oil

Unnamed: 0,year_month,uf,product,unit,volume,created_at
0,2000-01-01,RONDÔNIA,GASOLINA C (m3),m3,136073.253000,2023-10-10 21:15:12.988322
1,2000-01-01,ACRE,GASOLINA C (m3),m3,3358.346000,2023-10-10 21:15:12.988322
2,2000-01-01,AMAZONAS,GASOLINA C (m3),m3,20766.918000,2023-10-10 21:15:12.988322
3,2000-01-01,RORAIMA,GASOLINA C (m3),m3,3716.032000,2023-10-10 21:15:12.988322
4,2000-01-01,PARÁ,GASOLINA C (m3),m3,29755.907000,2023-10-10 21:15:12.988322
...,...,...,...,...,...,...
54427,2020-12-01,RIO GRANDE DO SUL,GLP (m3),m3,64045.161232,2023-10-10 21:15:12.988322
54428,2020-12-01,MATO GROSSO DO SUL,GLP (m3),m3,16281.139493,2023-10-10 21:15:12.988322
54429,2020-12-01,MATO GROSSO,GLP (m3),m3,18321.987319,2023-10-10 21:15:12.988322
54430,2020-12-01,GOIÁS,GLP (m3),m3,46850.585145,2023-10-10 21:15:12.988322


In [148]:
df_oil = df_oil[['year_month','volume']]

In [149]:
# df_oil = df_oil.groupby('year_month').sum()

In [150]:
df_oil['ano'] = df_oil['year_month'].dt.year

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oil['ano'] = df_oil['year_month'].dt.year


In [136]:
df_oil.drop(columns=['year_month'],inplace=True)
# df_oil_acre.drop(columns=['mês'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_oil.drop(columns=['year_month'],inplace=True)


In [137]:
# df_oil.drop
df_oil = df_oil.pivot_table(columns='ano',aggfunc='sum')

In [138]:
s_oil_parquet = df_oil.iloc[:].squeeze()


In [139]:
s_oil_parquet

ano
2000    1.615858e+08
2001    1.679425e+08
2002    1.635339e+08
2003    1.529466e+08
2004    1.644017e+08
2005    1.651756e+08
2006    1.649343e+08
2007    1.814908e+08
2008    1.966418e+08
2009    2.002764e+08
2010    2.126558e+08
2011    2.305579e+08
2012    2.395704e+08
2013    2.462708e+08
2014    2.707333e+08
2015    2.638810e+08
2016    2.476081e+08
2017    2.513247e+08
2018    2.543917e+08
2019    2.579250e+08
2020    1.777013e+08
Name: volume, dtype: float64

In [104]:
new_index = [i for i in range(0,21)]

In [107]:
new_index

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]

In [110]:
# s_oil_parquet = s_oil_parquet.set_index(pd.Index(new_index))
new_s = pd.Series(s_oil_parquet.values, index=new_index)
new_s


0     1.615858e+08
1     1.679425e+08
2     1.635339e+08
3     1.529466e+08
4     1.644017e+08
5     1.651756e+08
6     1.649343e+08
7     1.814908e+08
8     1.966418e+08
9     2.002764e+08
10    2.126558e+08
11    2.305579e+08
12    2.395704e+08
13    2.462708e+08
14    2.707333e+08
15    2.638810e+08
16    2.476081e+08
17    2.513247e+08
18    2.543917e+08
19    2.579250e+08
20    1.777013e+08
dtype: float64

In [158]:
df_total = pd.read_excel('/home/felipe/Desktop/vendas-combustiveis-m3.xlsx',sheet_name='Plan1',usecols='C:W', nrows=12, skiprows=52)

In [159]:
l = [2000+i for i in range(0,21)]
string_list = []
for year in l:
    string_list.append(str(year))

In [160]:
string_list

['2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020']

In [161]:
df_total.columns = string_list

In [162]:
df_total

Unnamed: 0,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
0,13646370.0,18531200.0,11217900.0,12657560.0,15604470.0,12615950.0,12116560.0,17213900.0,15031180.0,15475390.0,...,22481750.0,14870280.0,21186260.0,29431940.0,18112000.0,18847600.0,23577750.0,21342610.0,18342970.0,15135070.0
1,9841822.0,13767930.0,18522390.0,10736060.0,13241080.0,15652010.0,12836560.0,12773960.0,18193050.0,15429200.0,...,19124450.0,23060040.0,15774360.0,22400240.0,29365840.0,16952710.0,19003800.0,23655850.0,21594680.0,12306950.0
2,13706080.0,9756567.0,13759530.0,17636350.0,11152250.0,13241920.0,15957950.0,13663110.0,13616760.0,18160180.0,...,17642990.0,20353400.0,24629750.0,16692800.0,21677400.0,27846520.0,16688180.0,19260300.0,24271360.0,14853620.0
3,13468110.0,13221150.0,9766751.0,13152390.0,18543580.0,11390770.0,13633600.0,17111340.0,15190610.0,14040940.0,...,17343280.0,18550400.0,21722040.0,26498570.0,15903460.0,20897100.0,28406590.0,16726550.0,19651570.0,16396730.0
4,13706600.0,13419480.0,13367320.0,9368968.0,13847930.0,18589850.0,11547060.0,14291040.0,18265860.0,15687250.0,...,21690140.0,18553050.0,19669340.0,23111420.0,27638800.0,15281840.0,21332980.0,28973930.0,17034170.0,13283730.0
5,12907530.0,13815850.0,13197850.0,12612060.0,9785203.0,13853680.0,18645010.0,12417310.0,15294880.0,18602890.0,...,15892810.0,23360330.0,19825880.0,20668280.0,22022960.0,26268060.0,15436250.0,21141660.0,30013570.0,11214960.0
6,17492550.0,12606800.0,13333810.0,12724190.0,14136940.0,10013640.0,14103800.0,20198890.0,13584890.0,15315830.0,...,18244620.0,17135230.0,24283740.0,21154850.0,20098350.0,20759400.0,25767300.0,15610670.0,21563700.0,20328940.0
7,14516030.0,17465430.0,12903470.0,12680890.0,13622300.0,13991210.0,10227140.0,15147540.0,21896790.0,14100640.0,...,21462190.0,19298590.0,17799620.0,25283470.0,20827530.0,19427410.0,20394850.0,27600640.0,16286330.0,15075440.0
8,12231700.0,14068840.0,16732920.0,12092930.0,13292610.0,13563690.0,15558210.0,11158840.0,16231260.0,22310860.0,...,17634360.0,22476780.0,20532840.0,18631080.0,24088410.0,19241440.0,19193350.0,19943250.0,29613730.0,10702320.0
9,15210190.0,12177920.0,13355960.0,15543120.0,12602810.0,13228670.0,14027790.0,17997270.0,12242860.0,16193900.0,...,15729590.0,18407660.0,23774830.0,21672720.0,18441650.0,23654860.0,19157580.0,19466730.0,20320100.0,20174550.0


In [163]:
df_s =[]

In [164]:
for col in df_total.columns:
    # print(col)
    # print(type(col))
    df_s.append(df_total[col].sum())


In [165]:
df_ss = pd.Series(df_s)
df_ss = df_ss.rename_axis('Year')
df_ss.rename('Volume Total')
df_ss

Year
0     1.615858e+08
1     1.679425e+08
2     1.635339e+08
3     1.529466e+08
4     1.644017e+08
5     1.651756e+08
6     1.649343e+08
7     1.814908e+08
8     1.966418e+08
9     2.002764e+08
10    2.126558e+08
11    2.305579e+08
12    2.395704e+08
13    2.462708e+08
14    2.707333e+08
15    2.638810e+08
16    2.476081e+08
17    2.513247e+08
18    2.543917e+08
19    2.579250e+08
20    1.777013e+08
dtype: float64

In [166]:
relative_dif = (new_s - df_ss)/df_ss

In [167]:
relative_dif

0    -9.221824e-16
1    -7.098220e-16
2     7.289579e-16
3    -3.897090e-16
4    -9.063873e-16
5    -5.412842e-16
6    -1.264845e-15
7     6.568339e-16
8    -1.515564e-16
9     5.952240e-16
10    9.810041e-16
11   -3.877853e-16
12   -4.975960e-16
13   -3.630433e-16
14    0.000000e+00
15   -3.388155e-16
16   -1.203608e-16
17   -2.371619e-16
18    0.000000e+00
19   -6.932789e-16
20   -1.677102e-16
dtype: float64

In [168]:
greater_than_minimum = dif < 0.0000001
greater_than_minimum.sum() == 21


True

In [26]:
df_total = df_total.rename_axis('volume')

In [37]:
13646370.0414861+9841822.23001855+13706076.1982171+13468112.6608275+13706596.1938163+12907529.0797421+17492550.3301299+14516026.2260056+12231702.2010891+15210190.1900501+13756003.61218+11102857.3312356


161585836.29479793

In [38]:
161585836.29479793 == 

False

In [39]:
1.615858e+08	

161585800.0