# PRE-SETTING


In [None]:
# To set the Italian locale:
# 1 Uncomment the last line and run this cell
# 2 Select the it_IT.UTF-8 encoding (entry '282')
# 3 Restart the runtime and run this cell again with the last line commented out

!export LC_ALL="it_IT.UTF-8"
!export LC_CTYPE="it_IT.UTF-8"
#!sudo dpkg-reconfigure locales

In [None]:
import locale
from dask.diagnostics import ProgressBar  
locale.setlocale(locale.LC_ALL, 'it_IT.UTF-8')

'it_IT.UTF-8'

In [None]:
!pip install aiohttp
!pip install requests
!pip install dask[dataframe] --upgrade



Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[?25l[K     |▎                               | 10 kB 23.7 MB/s eta 0:00:01[K     |▋                               | 20 kB 11.9 MB/s eta 0:00:01[K     |▉                               | 30 kB 8.4 MB/s eta 0:00:01[K     |█▏                              | 40 kB 3.7 MB/s eta 0:00:01[K     |█▌                              | 51 kB 3.7 MB/s eta 0:00:01[K     |█▊                              | 61 kB 4.3 MB/s eta 0:00:01[K     |██                              | 71 kB 4.5 MB/s eta 0:00:01[K     |██▎                             | 81 kB 4.8 MB/s eta 0:00:01[K     |██▋                             | 92 kB 5.3 MB/s eta 0:00:01[K     |███                             | 102 kB 4.2 MB/s eta 0:00:01[K     |███▏                            | 112 kB 4.2 MB/s eta 0:00:01[K     |███▌                            | 122 kB 4.2 MB/s eta 0:0

In [None]:
import dask.dataframe as dd
import math

#DEFINING BASE BENCH CLASS

In [None]:
class BaseDfBench(object):
    """Collection of data-preparation helpers around a single dataframe.

    The dataframe lives in ``self.df``. Most methods replace ``self.df`` with
    the transformed dataframe and also return it. Several methods call
    ``dask.dataframe`` (imported in this file as ``dd``) directly, so the
    wrapped object is expected to be a Dask dataframe.
    """

    # Case names accepted by set_header_case / set_content_case. Each one is
    # also the name of the string method that implements it.
    _SUPPORTED_CASES = ("lower", "upper", "title", "capitalize", "swapcase")

    def __init__(self, df):
        """Wrap the provided dataframe."""
        self.df = df

    def load_dataset(self, path, format, conn=None, **kwargs):
        """
        Load a dataset into ``self.df`` and return it.

        format selects the reader: "csv", "json", "xml", "excel", "parquet"
        or "sql". For "sql", path is the query and conn the connection.
        Extra keyword arguments are forwarded to the reader.
        Raises ValueError for any other format.
        """
        if format == "sql":
            return self.read_sql(path, conn, **kwargs)
        file_readers = {
            "csv": self.read_csv,
            "json": self.read_json,
            "xml": self.read_xml,
            "excel": self.read_excel,
            "parquet": self.read_parquet,
        }
        if format not in file_readers:
            # Previously an unknown format silently returned the old dataframe.
            raise ValueError("Unsupported dataset format: %r" % (format,))
        return file_readers[format](path, **kwargs)

    def read_sql(self, query, conn, **kwargs):
        """
        Given a connection and a query
        creates a dataframe from the query output.
        Extra keyword arguments (e.g. index_col) are forwarded to dd.read_sql.
        """
        self.df = dd.read_sql(query, conn, **kwargs)
        return self.df

    def read_json(self, path, **kwargs):
        """
        Read a json file
        """
        self.df = dd.read_json(path, **kwargs)
        return self.df

    def read_csv(self, path, **kwargs):
        """
        Read a csv file
        """
        self.df = dd.read_csv(path, **kwargs)
        return self.df

    def read_xml(self, path, **kwargs):
        """
        Read a xml file.
        dask.dataframe has no XML reader, so the file is parsed with pandas
        (fully in memory) and then wrapped in a single-partition Dask dataframe.
        """
        import pandas as pd

        self.df = dd.from_pandas(pd.read_xml(path, **kwargs), npartitions=1)
        return self.df

    def read_excel(self, path, **kwargs):
        """
        Read an excel file.
        dask.dataframe has no Excel reader, so the file is parsed with pandas
        (fully in memory) and then wrapped in a single-partition Dask dataframe.
        """
        import pandas as pd

        self.df = dd.from_pandas(pd.read_excel(path, **kwargs), npartitions=1)
        return self.df

    def read_parquet(self, path, **kwargs):
        """
        Read a parquet file
        """
        self.df = dd.read_parquet(path, **kwargs)
        return self.df

    def sort(self, columns, ascending=True):
        """
        Sort the dataframe by the provided columns
        Columns is a list of column names
        """
        self.df = self.df.sort_values(columns, ascending=ascending)
        return self.df

    def get_columns(self):
        """
        Return the name of the columns in the dataframe
        """
        return list(self.df.columns.values)

    def is_unique(self, column):
        """
        Check the uniqueness of all values contained in the provided column_name
        """
        return self.df[column].is_unique

    def delete_columns(self, columns):
        """
        Delete the specified columns
        Columns is a list of column names
        """
        self.df = self.df.drop(columns=columns)
        return self.df

    def rename_columns(self, columns):
        """
        Rename the provided columns using the provided names
        Columns is a dictionary: {"column_name": "new_name"}
        """
        self.df = self.df.rename(columns=columns)
        return self.df

    def merge_columns(self, columns, separator, name):
        """
        Create a new column with the provided name combining the two provided columns using the provided separator
        Columns is a list of two column names; separator and name are strings
        """
        left = self.df[columns[0]].astype(str)
        right = self.df[columns[1]].astype(str)
        self.df[name] = left + separator + right
        return self.df

    def fill_nan(self, value):
        """
        Fill nan values in the dataframe with the provided value
        """
        self.df = self.df.fillna(value)
        return self.df

    def one_hot_encoding(self, columns):
        """
        Performs one-hot encoding of the provided columns
        Columns is a list of column names
        """
        dummies = dd.get_dummies(self.df[columns])
        self.df = dd.concat([self.df.drop(columns=columns), dummies], axis=1)
        return self.df

    def locate_null_values(self, column):
        """
        Returns the rows of the dataframe which contains
        null value in the provided column.
        """
        return self.df[self.df[column].isna()]

    def search_by_pattern(self, column, pattern):
        """
        Returns the rows of the dataframe which
        match with the provided pattern
        on the provided column.
        Pattern could be a regular expression.
        """
        # The pattern is handed to str.contains as a regex directly; the
        # previous re.compile() call relied on an `re` import the file lacks.
        return self.df[self.df[column].str.contains(pattern, regex=True)]

    def locate_outliers(self, column, lower_quantile=0.1, upper_quantile=0.99):
        """
        Returns the rows of the dataframe that have values
        in the provided column lower or higher than the values
        of the lower/upper quantile.
        """
        q_low = self.df[column].quantile(lower_quantile)
        q_hi = self.df[column].quantile(upper_quantile)
        return self.df[(self.df[column] < q_low) | (self.df[column] > q_hi)]

    def get_columns_types(self):
        """
        Returns a dictionary with column types
        ({column name: dtype name}).
        """
        return {name: dtype.name for name, dtype in self.df.dtypes.items()}

    def cast_columns_types(self, converterList: list, str_date_time_format='%d %B %Y'):
        """
        Cast the data types of the provided columns.

        converterList is a list of dictionaries with the keys 'col' and
        'correct_dtype' (the shape produced by find_mismatched_dtypes /
        assign_custom_types).
        - correct_dtype == float: the text is treated as an Italian-formatted
          number ('.' thousands separator, ',' decimal separator), the first
          decimal number is extracted and cast to float.
        - correct_dtype == 'datetime': the column is parsed with
          str_date_time_format.
        Entries with any other correct_dtype are left untouched.
        """
        date_columns = []
        for item in converterList:
            target = item['correct_dtype']
            if target == float:
                column = item['col']
                # regex=False: '.' and ',' are literal characters here.
                cleaned = self.df[column].str.replace('.', '', regex=False)
                cleaned = cleaned.str.replace(',', '.', regex=False)
                numbers = cleaned.str.extract(r'([+-]?[0-9]+\.[0-9]+)', expand=False)
                self.df[column] = numbers.astype(float)
            elif target == 'datetime':
                date_columns.append(item['col'])

        # Forward the caller's format (it used to be overridden by a literal).
        self.change_date_time_format(date_columns, str_date_time_format=str_date_time_format)
        return self.df

    def get_stats(self):
        """
        Returns dataframe statistics.
        Only for numeric columns.
        Min value, max value, average value, standard deviation, and standard quantiles.
        """
        return self.df.describe()

    def assign_custom_types(self, mismatched_types, correct_types):
        """
        Fill the 'correct_dtype' entry of every item of mismatched_types with
        the type registered for its column in correct_types.

        mismatched_types is a list of dictionaries with a 'col' key and is
        updated in place; correct_types maps column names to types.
        Raises KeyError if a column is missing from correct_types.
        """
        for mismatch in mismatched_types:
            mismatch['correct_dtype'] = correct_types[mismatch['col']]
        return mismatched_types

    def find_mismatched_dtypes(self):
        """
        Return the columns whose dtype is 'object'.

        The original implementation did not fit this library: Dask reports
        columns with mixed content as object, so checking for that dtype is
        enough to find the columns that need fixing.
        The result is a list of dictionaries
        {'col': name, 'current_dtype': 'object', 'correct_dtype': ''}
        where 'correct_dtype' is a placeholder to be filled in later.
        """
        return [
            {'col': name, 'current_dtype': dtype_name, 'correct_dtype': ''}
            for name, dtype_name in self.get_columns_types().items()
            if dtype_name == 'object'
        ]

    def check_allowed_char(self, column, pattern):
        """
        Return true if all the values of the provided column
        follow the provided pattern.
        For example, if the pattern [a-z] is provided the string
        'ciao' will return true, the string 'ciao123' will return false.
        """
        # Accept either a pattern string or a compiled pattern.
        source = getattr(pattern, "pattern", pattern)
        # The whole value must be made of repetitions of the pattern; a plain
        # contains() would accept 'ciao123' because one character matches.
        return self.df[column].str.fullmatch("(?:" + source + ")*").all()

    def drop_duplicates(self):
        """
        Drop duplicate rows.
        """
        self.df = self.df.drop_duplicates()
        return self.df

    def drop_by_pattern(self, column, pattern):
        """
        Delete the rows where the provided pattern
        occurs in the provided column.
        Rows whose value in the column is null are kept.
        """
        # Filter with the boolean mask instead of dropping by index label:
        # labels may repeat, which would remove non-matching rows too.
        matches = self.df[column].str.contains(pattern, regex=True, na=False)
        self.df = self.df[~matches]
        return self.df

    def change_date_time_format(self, columns: list, str_date_time_format):
        """
        Parse the provided columns into datetime values using the provided
        formatting string; parsing is done with errors='coerce'.
        An example of str_date_time_format is '%m/%d/%Y'
        """
        for col in columns:
            self.df[col] = dd.to_datetime(
                self.df[col],
                format=str_date_time_format,
                dayfirst=True,
                errors='coerce',
            )
        return self.df

    def _check_case(self, case):
        """Raise ValueError unless case is one of the supported case names."""
        if case not in self._SUPPORTED_CASES:
            raise ValueError(
                "Unsupported case %r; expected one of %s"
                % (case, ", ".join(self._SUPPORTED_CASES))
            )

    def set_header_case(self, case):
        """
        Put dataframe headers in the provided case
        Supported cases: "lower", "upper", "title", "capitalize", "swapcase"
        (see definitions in pandas documentation)
        Raises ValueError for any other case.
        """
        self._check_case(case)
        convert = getattr(str, case)
        self.df.columns = [convert(name) for name in self.df.columns]
        return self.df

    def set_content_case(self, columns, case):
        """
        Put dataframe content in the provided case
        Supported cases: "lower", "upper", "title", "capitalize", "swapcase"
        (see definitions in pandas documentation)
        Columns is a list of column names; empty list for the whole dataframe
        Raises ValueError for any other case.
        """
        self._check_case(case)
        if len(columns) == 0:
            columns = list(self.df.columns.values)
        for column in columns:
            self.df[column] = getattr(self.df[column].str, case)()
        return self.df

    def duplicate_columns(self, columns):
        """
        Duplicate the provided columns (add to the dataframe with "_duplicate" suffix)
        Columns is a list of column names
        """
        for column in columns:
            self.df[column + "_duplicate"] = self.df[column]
        return self.df

    def pivot(self, index, columns, values, aggfunc):
        """
        Define the lists of columns to be used as index, columns and values respectively,
        and the dictionary to aggregate ("sum", "mean", "count") the values for each column: {"col1": "sum"}
        (see pivot_table in pandas documentation)
        """
        pivoted = dd.pivot_table(self.df, index=index, values=values, columns=columns, aggfunc=aggfunc)
        self.df = pivoted.reset_index()
        return self.df

    def unpivot(self, columns, var_name, val_name):
        """
        Define the list of columns to be used as values for the variable column,
        the name for variable columns and the one for value column_name
        """
        melted = set(columns)
        # Keep the identifier columns in their original order (a set
        # difference would shuffle them from run to run).
        id_vars = [name for name in self.df.columns.values if name not in melted]
        self.df = dd.melt(self.df, id_vars=id_vars, value_vars=columns, var_name=var_name, value_name=val_name)
        return self.df

    def delete_empty_rows(self, columns):
        """
        Delete the rows with null values in the provided Columns
        Columns is a list of column names
        NOTE(review): dropna's default (how='any') is kept, so a row is removed
        as soon as one of the columns is null - confirm this is intended.
        """
        # No inplace=True: that variant returns None, which used to end up
        # stored in self.df.
        self.df = self.df.dropna(subset=columns)
        return self.df

    def split(self, column, sep, splits, col_names):
        """
        Split the provided column into splits + 1 columns named after col_names
        using the provided sep string as separator
        Col_names is a list of column names
        """
        # n is passed by keyword: it is keyword-only in recent pandas.
        self.df[col_names] = self.df[column].str.split(sep, n=splits, expand=True)
        return self.df

    def strip(self, columns, chars):
        """
        Remove the characters appearing in chars at the beginning/end of the provided columns
        Columns is a list of column names
        """
        for column in columns:
            self.df[column] = self.df[column].str.strip(chars)
        return self.df

    def remove_diacritics(self, columns):
        """
        Remove diacritics from the provided columns
        Columns is a list of column names
        """
        for column in columns:
            decomposed = self.df[column].str.normalize('NFKD')
            ascii_bytes = decomposed.str.encode('ascii', errors='ignore')
            self.df[column] = ascii_bytes.str.decode('utf-8')
        return self.df

    def set_index(self, column):
        """
        Set the provided column as index
        """
        self.df = self.df.set_index(column)
        return self.df

    def change_num_format(self, formats):
        """
        Round one ore more columns to a variable number of decimal places.
        formats is a dictionary with the column names as key and the number of decimal places as value.
        """
        self.df = self.df.round(formats)
        return self.df

    def calc_column(self, col_name, f):
        """
        Calculate the new column col_name by applying
        the function f to every row
        """
        self.df[col_name] = self.df.apply(f, axis=1)
        return self.df

    def join(self, other, left_on=None, right_on=None, how='inner', **kwargs):
        """
        Joins current dataframe (left) with a new one (right).
        left_on/right_on are the keys on which perform the equijoin
        how is the type of join
        **kwargs: additional parameters

        The result is stored in the current dataframe.
        """
        self.df = self.df.merge(other, left_on=left_on, right_on=right_on, how=how, **kwargs)
        return self.df

    def groupby(self, columns, f):
        """
        Aggregate the dataframe by the provided columns
        then applied the function f on every group
        """
        return self.df.groupby(columns).agg(f)

    def categorical_encoding(self, columns):
        """
        Convert the categorical values in these columns into numerical values
        Columns is a list of column names
        """
        for column in columns:
            self.df[column] = self.df[column].astype('category')
            self.df[column] = self.df[column].cat.codes
        return self.df

    def sample_rows(self, frac, num):
        """
        Return a sample of the rows of the dataframe
        Frac is a boolean:
        - if true, num is the percentage of rows to be returned
        - if false, num is the exact number of rows to be returned
        """
        if frac:
            return self.df.sample(frac=num / 100)
        return self.df.sample(n=num)

    def append(self, other, ignore_index):
        """
        Append the rows of another dataframe (other) at the end of the provided dataframe
        All columns are kept, eventually filled by nan
        Ignore index is a boolean: if true, reset row indices
        """
        # DataFrame.append no longer exists in current pandas/dask releases;
        # concatenation is the supported equivalent.
        combined = dd.concat([self.df, other])
        if ignore_index:
            # NOTE(review): on a Dask dataframe the new index presumably
            # restarts in every partition - confirm this is acceptable.
            combined = combined.reset_index(drop=True)
        self.df = combined
        return self.df

    def replace(self, columns, to_replace, value, regex):
        """
        Replace all occurrencies of to_replace (numeric, string, regex, list, dict) in the provided columns using the provided value
        Regex is a boolean: if true, to_replace is interpreted as a regex
        Columns is a list of column names
        """
        self.df[columns] = self.df[columns].replace(to_replace=to_replace, value=value, regex=regex)
        return self.df

    def edit(self, columns, func):
        """
        Edit the values of the cells in the provided columns using the provided expression
        Columns is a list of column names
        """
        self.df[columns] = self.df[columns].apply(func)
        return self.df

    def set_value(self, index, column, value):
        """
        Set the cell identified by index and column to the provided value
        NOTE(review): relies on the `.at` indexer; verify the wrapped
        dataframe type provides it.
        """
        self.df.at[index, column] = value
        return self.df

    def min_max_scaling(self, columns, feature_range=(0.0, 1.0)):
        """
        Independently scale the values in each provided column in the range (0, 1)
        Columns is a list of column names
        feature_range optionally selects a different (low, high) target range.
        A column whose values are all equal is divided by zero.
        """
        low, high = feature_range
        for column in columns:
            values = self.df[column]
            col_min = values.min()
            col_max = values.max()
            unit = (values - col_min) / (col_max - col_min)
            # The old code multiplied by the builtins `max`/`min` here.
            self.df[column] = unit * (high - low) + low
        return self.df

    def round(self, columns, n):
        """
        Round the values in columns using n decimal places
        Columns is a list of column names
        """
        self.df[columns] = self.df[columns].round(n)
        return self.df

    def get_duplicate_columns(self):
        """
        Return a list of duplicate columns, if exists.
        Duplicate columns are those which have same values for each row.
        The result is a list of (first_name, second_name) pairs.
        """
        names = list(self.df.columns.values)
        return [
            (first, second)
            for position, first in enumerate(names)
            for second in names[position + 1:]
            if self.df[first].equals(self.df[second])
        ]

    def to_csv(self, path, **kwargs):
        """
        Export the dataframe in a csv file.
        """
        self.df.to_csv(path, **kwargs)

    def query(self, query):
        """
        Queries the dataframe and returns the corresponding
        result set.
        :param query: a string with the query conditions, e.g. "col1 > 1 & col2 < 10"
        :return: subset of the dataframe that correspond to the selection conditions
        """
        return self.df.query(query)

    def missing_values_percent(self):
        """
        Return, for every column, the percentage (0-100) of null values.
        """
        # The mean of the boolean null mask is the fraction of null rows.
        return self.df.isna().mean() * 100


# EXTRACT CSV FILE

In [None]:
from pathlib import Path
import pandas as pd

path = 'https://dbgroup.ing.unimore.it/invoices/data.zip'
!wget -nc 'https://dbgroup.ing.unimore.it/invoices/data.zip'
!unzip '/content/data.zip'


#dtype={'billing_frequency': 'string','gas_offer': 'float64'}

File 'data.zip' already there; not retrieving.

Archive:  /content/data.zip
replace invoices.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

# SETTING RIGHT TYPE BY DOCUMENTATION

In [None]:
# Target type of every invoice column, grouped by type and then put back in
# alphabetical order (the order of the documentation table).
correctTypes = dict(sorted({
    **dict.fromkeys([
        'F1_kWh', 'F2_kWh', 'F3_kWh',
        'average_gas_bill_cost', 'average_light_bill_cost',
        'average_unit_gas_cost', 'average_unit_light_cost',
        'extra_fees',
        'gas_amount', 'gas_average_cost', 'gas_consumption',
        'gas_material_cost', 'gas_system_charges', 'gas_transport_cost',
        'howmuch_pay',
        'light_amount', 'light_average_cost', 'light_consumption',
        'light_material_cost', 'light_system_charges', 'light_transport_cost',
        'total_amount', 'tv',
    ], float),
    'bill_id': int,
    **dict.fromkeys([
        'bill_type', 'billing_frequency',
        'gas_offer',  # float according to the PDF documentation
        'light_offer', 'light_offer_type', 'supply_type',
    ], str),
    **dict.fromkeys([
        'date', 'emission_date',
        'gas_end_date', 'gas_start_date',
        'light_end_date', 'light_start_date',
    ], 'datetime'),
}.items(), key=lambda entry: entry[0]))

# PIPELINE


## 0) DATA SPLIT

In [None]:
# Read the invoices CSV lazily with Dask.
# Uncomment the first or the second read_csv call depending on the environment
# (hosted notebook vs. local machine).
general = dd.read_csv(
    '/content/invoices.csv',
    dtype={'billing_frequency': 'string', 'gas_offer': 'string', 'city': 'string'},
    low_memory=False,
)  # online
#general = dd.read_csv('invoices.csv', dtype={'billing_frequency': 'string', 'gas_offer': 'float64', 'city':'string'}, low_memory=False) #local

# Split the flat file: utility and customer attributes go to their own frames,
# everything else is invoice data.
utilities = general[['user_code', 'customer_code', 'city', 'address']]
customers = general[['user_code', 'nominative', 'sex', 'age']]
invoices = general.drop(
    columns=['user_code', 'customer_code', 'city', 'address', 'nominative', 'sex', 'age']
)
#invoicesNotDerived = invoices.drop(labels=['total_amount', 'howmuch_pay', 'light'], axis=1)

# Columns (14,15,29,30,31,32,33,36) were reported as mixed type.
myBase = BaseDfBench(invoices)
myBase.df

In [None]:
# Cast data: collect the object-typed columns, attach the documented target
# type to each of them, then convert.
mismatchedTypes = myBase.find_mismatched_dtypes()
mismatchedTypesCorrect = myBase.assign_custom_types(
    mismatched_types=mismatchedTypes,
    correct_types=correctTypes,
)
myBase.df = myBase.cast_columns_types(mismatchedTypesCorrect)
myBase.df

In [None]:
# Split the invoices into three dataframes by supply type.

# Columns removed from the gas-only invoices.
col_gas = [
    'F1_kWh', 'F2_kWh', 'F3_kWh', 'light_start_date', 'light_end_date',
    'light_average_cost', 'light_consumption', 'light_offer_type',
    'light_offer', 'light_amount', 'average_unit_light_cost',
    'average_light_bill_cost', 'light_system_charges', 'light_transport_cost',
    'light_material_cost',
    'supply_type',
]

# Columns removed from the light-only invoices.
col_light = [
    'gas_amount', 'gas_average_cost', 'gas_start_date', 'gas_end_date',
    'gas_consumption', 'gas_offer', 'average_unit_gas_cost',
    'average_gas_bill_cost', 'gas_system_charges', 'gas_material_cost',
    'gas_transport_cost',
    'supply_type',
]

# The combined gas-and-light invoices keep every column.
col_gl = invoices.columns

### invoicesGas
invoicesGas = invoices[invoices['supply_type'] == 'gas'].drop(columns=col_gas)

### invoicesLight
invoicesLight = invoices[invoices['supply_type'] == 'luce'].drop(columns=col_light)

### invoicesLight&Gas
invoicesGL = invoices[invoices['supply_type'] == 'gas e luce']

In [None]:
# take sample of data
# NO!  (attempt marked as rejected by the author)
# Idea: ask for the fraction of rows that corresponds to 5000 rows.
# len(invoicesLight) has to count every row first, which on a Dask dataframe
# presumably triggers a full pass over the data - confirm.
import numpy as np
from dask.diagnostics import ProgressBar  

with ProgressBar():
  invoicesLight_s=invoicesLight.sample(frac=5000/len(invoicesLight), replace=None, random_state=10)





In [None]:
# NO!  (attempt marked as rejected by the author)
# Idea: draw NSAMPLES index labels without replacement and select those rows
# by label.
# NOTE(review): invoicesLight derives from a Dask dataframe, so its index is
# presumably lazy and may not be accepted by np.random.choice - confirm.
NSAMPLES=5000
samples = np.random.choice(invoicesLight.index, size=NSAMPLES, replace=False)
invoicesLight_s=invoicesLight.loc[samples]


In [None]:
#NO!  (attempt marked as rejected by the author)
# Idea: draw NSAMPLES row positions and keep the rows whose index is among them.
# NOTE(review): np.random.sample takes only a size argument, so this call
# likely raises TypeError (random.sample was probably intended) - confirm.
# NOTE(review): `index in sampled_indices` is a single Python membership test,
# not an element-wise filter.
NSAMPLES=5000
sampled_indices = np.random.sample(range(len(invoicesLight)), NSAMPLES)
invoicesLight_s = invoicesLight[invoicesLight.index in sampled_indices]

In [None]:
# Random sampling takes a long time on this dataset. When the accuracy of the
# sample does not matter and the data is not sorted, taking the first
# NSAMPLES rows of each dataframe is good enough.
NSAMPLES = 5000

invoicesLight_s, invoicesGas_s, invoicesGL_s = (
    frame.head(NSAMPLES) for frame in (invoicesLight, invoicesGas, invoicesGL)
)

In [None]:
# Finally create one BaseDfBench instance for every dataset.

# Full dataframes: light, gas, gas-and-light.
myBaseL, myBaseG, myBaseGL = map(
    BaseDfBench, (invoicesLight, invoicesGas, invoicesGL)
)

# Samples of the same three dataframes.
myBaseL_S, myBaseG_S, myBaseGL_S = map(
    BaseDfBench, (invoicesLight_s, invoicesGas_s, invoicesGL_s)
)
#myBaseGL.df


## 1) DATA INGESTION AND DISCOVERY

### 1.1 Read data from source

In [None]:
# The column holds true and false values; it should be a string but it gets cast to bool.
myBase.df['bill_type'].unique().compute() #

0     True
1    False
Name: bill_type, dtype: bool

In [None]:
# Compute and display the summary statistics of the dataframe.
myBase.df.describe().compute()

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,tv,gas_amount,gas_average_cost,light_average_cost,extra_fees,gas_consumption,light_consumption,light_offer,howmuch_pay,total_amount,light_amount,average_unit_light_cost,average_light_bill_cost,average_unit_gas_cost,average_gas_bill_cost,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
count,10497140.0,10496980.0,10497050.0,10496450.0,10497140.0,6810389.0,6209657.0,4529680.0,10497140.0,6810390.0,4810326.0,10497140.0,10497140.0,10497140.0,4810339.0,4529729.0,4530024.0,6209657.0,6211363.0,6734656.0,4755502.0,6741981.0,4756329.0,6735368.0,4756322.0
mean,1043021.0,59.09597,48.92921,53.17689,5.308814,159.1596,0.7418461,48.8066,4.326666,164.3498,381.0993,80626520000.0,161.2744,153.8392,100.9318,0.2953825,0.7787436,0.7418461,1.552363,4.78576,20.2711,63.99663,17.29069,37.48406,48.69274
std,686231.0,1285.331,945.9065,1475.954,7.998176,354.1522,1.934005,16.57033,180.2168,186.9755,2377.521,6.041521e+18,293.77,309.8192,203.887,1.771076,4.856549,1.934005,3.69577,19.46054,50.62049,123.5472,35.90962,67.05752,88.04623
min,0.0,-992349.0,-990054.0,-987312.0,-478.0,-66102.43,0.0,0.0,-86090.44,-999.0,-284395.0,2.557844e+17,-69.86,-84813.11,-49799.01,0.0,0.0,0.0,0.0,-3449.81,-21815.06,-18918.76,-20018.29,-9773.77,-19594.04
25%,615854.5,0.0,0.0,0.0,0.0,75.13,0.32,48.0,0.0,67.0,187.0,1.436218e+19,87.29,81.7,63.9025,0.12,0.24,0.32,0.8,-0.87,11.91,31.93,15.12,22.69,29.98
50%,1308250.0,24.0,20.0,26.0,0.0,139.645,0.44,52.0,0.48,156.0,374.0,1.844674e+19,137.33,129.31,94.785,0.15,0.28,0.44,0.97,4.805,19.52,56.6,17.53,34.81,45.91
75%,2498177.0,142.0,127.0,159.0,18.0,337.595,0.78,56.0,0.67,446.0,586.0,1.844674e+19,278.735,276.515,142.0875,0.19,0.38,0.78,1.58,19.0,29.14,135.73,24.33,64.19,70.09
max,2531054.0,2141161.0,1285834.0,2552774.0,198.0,387143.3,880.44,6785.0,128083.2,999.0,2943120.0,1.844674e+19,308001.5,308001.5,151592.8,2691.67,5958.33,880.44,2379.34,16843.44,38770.83,122542.5,29985.69,54356.24,61903.19


In [None]:
# Compute and display the summary statistics of the gas_amount column only.
myBase.df['gas_amount'].describe().compute()

count    6.810389e+06
mean     1.591596e+02
std      3.541522e+02
min     -6.610243e+04
25%      7.513000e+01
50%      1.396450e+02
75%      3.375950e+02
max      3.871433e+05
Name: gas_amount, dtype: float64

### 1.2 Study how to deal with data that doesn’t fit in memory (!)

### 1.3 Locate missing values


In [None]:
from dask.diagnostics import ProgressBar  
# Number of null cells per column (still lazy at this point).
missing_values = myBase.df.isna().sum()

# Turn the counts into percentages of the total row count; the computation
# runs inside the progress bar.
with ProgressBar():
    percent_missing = missing_values.div(myBase.df.index.size).mul(100).compute()
percent_missing

[########################################] | 100% Completed |  1min 47.1s


bill_id                     0.000000
F1_kWh                      0.001581
F2_kWh                      0.000876
F3_kWh                      0.006564
date                        0.074982
light_start_date            0.074982
light_end_date              0.074982
tv                          0.000000
gas_amount                 35.121499
gas_average_cost           40.844314
light_average_cost         56.848449
emission_date               0.000000
supply_type                 0.000000
gas_start_date              0.074982
gas_end_date                0.074982
extra_fees                  0.000000
gas_consumption            35.121490
light_consumption          54.174903
gas_offer                   0.000000
light_offer_type            0.000000
light_offer                 0.000000
howmuch_pay                 0.000000
total_amount                0.000000
light_amount               54.174779
average_unit_light_cost    56.847982
average_light_bill_cost    56.845172
average_unit_gas_cost      40.844314
a

### 1.4 Locate outliers

In [None]:
 
# Disabled: the pairplot below takes too long to compute.
#import seaborn as sns
# with ProgressBar():
#  sns.pairplot(myBase.df[['gas_amount', 'total_amount']].compute())


In [None]:
# Locate outliers in gas_amount with the locate_outliers helper.
# NOTE(review): the quantile bounds are asymmetric (0.1 vs 0.99) — confirm
# that 0.1 is intended rather than 0.01.
with ProgressBar():
  outliers = myBase.locate_outliers(column='gas_amount', lower_quantile=0.1, upper_quantile=0.99)
  # Take the preview inside the context so any computation it triggers is
  # still reported by the progress bar.
  outliers_preview = outliers.head()

# A notebook only renders the last top-level expression of a cell; an
# expression nested inside the `with` block is evaluated but never shown.
outliers_preview

## 2) DATA VALIDATION


### 2.3 Find mismatched types

In [None]:
# Collect the mismatched dtypes reported by the helper and combine them with
# correctTypes (defined in another cell) via assign_custom_types.
mismatchedTypes = myBase.find_mismatched_dtypes()
mismatchedTypesCorrect = myBase.assign_custom_types(mismatchedTypes, correctTypes)
# NOTE(review): this bare expression is not the last one in the cell, so the
# notebook does not display it.
mismatchedTypesCorrect
# Replace the working dataframe with the result of the cast and show it.
myBase.df = myBase.cast_columns_types(mismatchedTypesCorrect)
myBase.df


  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)


Unnamed: 0_level_0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,light_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,light_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,light_amount,average_unit_light_cost,average_light_bill_cost,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
npartitions=79,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
,int64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,datetime64[ns],object,datetime64[ns],datetime64[ns],float64,float64,float64,float64,object,uint64,float64,float64,float64,float64,float64,float64,float64,string,bool,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
# Preview the first rows of the working dataframe.
myBase.df.head()

  out = getattr(getattr(obj, accessor, obj), attr)(*args, **kwargs)


Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,...,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
0,0,0.0,0.0,0.0,2018-04-16,2018-04-16,2019-11-25,0.0,-0.48,,...,,,,True,-0.06,,0.06,,-0.2,
1,1,81.0,62.0,76.0,2020-12-05,2020-12-05,2020-12-31,0.0,,,...,,,,True,,9.15,,5.68,,23.13
2,2,0.0,0.0,0.0,2020-12-05,2020-12-05,2020-12-31,0.0,-21.79,,...,,,,True,1.24,,-10.13,,-7.13,
3,3,0.0,0.0,0.0,2020-10-03,2020-10-03,2020-12-31,0.0,58.51,0.44,...,0.44,1.08,,True,-0.88,,23.68,,17.66,
4,4,0.0,0.0,0.0,2020-12-16,2020-12-16,2020-12-31,0.0,383.66,0.32,...,0.32,0.86,,True,14.71,,141.57,,63.59,


## 3) DATA STRUCTURING


### 3.3 Delete, split or merge columns

In [None]:
# Gas invoices sorted by bill_id.

# NOTE: .compute() materialises invoicesGas first; sort_values then runs on
# the computed result, so the whole table has to fit in memory.
with ProgressBar():
  invoicesGasSorted = invoicesGas.compute().sort_values('bill_id')
invoicesGasSorted

[########################################] | 100% Completed |  8min 15.8s


Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,light_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,light_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,light_amount,average_unit_light_cost,average_light_bill_cost,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
0,0,0.0,0.0,0.0,2018-04-16,2018-04-16,2019-11-25,0.0,-0.48,,,2020-12-31,gas,2019-11-25,2018-04-16,0.48,-2.0,,4255330384700204909,light,18446744073709551615,0.00,0.00,,,,,,,True,-0.06,,0.06,,-0.20,
88001,0,0.0,0.0,0.0,2021-02-03,2021-02-03,2021-03-30,0.0,11.49,,,2021-03-30,gas,2021-03-30,2021-02-03,0.55,-3.0,,4255330384700204909,light,18446744073709551615,0.00,12.04,,,,,,bimester,False,-2.26,,6.24,,5.53,
88002,1,0.0,0.0,0.0,2021-02-03,2021-02-03,2021-03-30,0.0,217.24,0.45,,2021-03-30,gas,2021-03-30,2021-02-03,0.55,243.0,,17683241029697148327,light,18446744073709551615,217.79,217.79,,,,0.45,0.89,bimester,False,14.13,,109.18,,30.28,
88003,2,0.0,0.0,0.0,2021-02-06,2021-02-06,2021-03-30,0.0,32.33,,,2021-03-30,gas,2021-03-30,2021-02-06,0.00,0.0,,16421179386027010504,light,18446744073709551615,32.33,32.33,,,,,,,False,-0.64,,4.73,,22.41,
84868,3,0.0,0.0,0.0,2021-04-14,2021-04-14,2021-05-27,0.0,168.03,0.23,,2021-05-27,gas,2021-05-27,2021-04-14,0.00,222.0,,3354194450249482832,light,18446744073709551615,168.03,168.03,,,,0.23,0.76,bimester,True,6.34,,50.30,,37.61,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86501,2529555,0.0,0.0,0.0,2021-02-01,2021-02-01,2021-03-26,0.0,72.90,0.46,,2021-03-26,gas,2021-03-26,2021-02-01,30.00,90.0,,4426041636325184338,light,18446744073709551615,102.90,102.90,,,,0.46,0.81,quarterly,False,-1.63,,41.17,,17.49,
86502,2529556,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,176.87,0.46,,2021-03-26,gas,2021-03-26,2020-12-04,0.63,204.0,,15614206351661483855,light,18446744073709551615,177.50,177.50,,,,0.46,0.87,quarterly,False,0.80,,92.98,,34.63,
86503,2529557,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,160.68,0.42,,2021-03-26,gas,2021-03-26,2020-12-04,0.80,198.0,,15614206351661483855,light,18446744073709551615,161.48,161.48,,,,0.42,0.81,quarterly,False,1.30,,82.70,,34.18,
86504,2529558,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,118.41,0.55,,2021-03-26,gas,2021-03-26,2020-12-04,0.10,120.0,,4426041636325184338,light,18446744073709551615,118.51,118.51,,,,0.55,0.99,quarterly,False,-1.92,,65.81,,25.82,


In [None]:
# Wrap the gas invoices in their own bench object.
myBaseGas = BaseDfBench(invoicesGas)

# Rows whose gas period ends after it starts.
# NOTE(review): in the previewed rows gas_end_date precedes gas_start_date —
# the two columns may be swapped upstream; confirm.
ends_after_start = myBaseGas.df['gas_end_date'] > myBaseGas.df['gas_start_date']
myBaseGas.df[ends_after_start]

Unnamed: 0_level_0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,light_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,light_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,light_amount,average_unit_light_cost,average_light_bill_cost,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
npartitions=80,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
,int64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,datetime64[ns],object,datetime64[ns],datetime64[ns],float64,float64,float64,string,object,uint64,float64,float64,float64,float64,float64,float64,float64,string,bool,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
# Names of the columns whose missing percentage is exactly 100.
# NOTE(review): presumably percent_missing must have been computed on
# myBaseGas.df for this to list the light_* columns — confirm the cell
# execution order.
percent_missing[percent_missing == 100].index

Index(['light_average_cost', 'light_consumption', 'light_amount',
       'average_unit_light_cost', 'average_light_bill_cost',
       'light_system_charges', 'light_transport_cost', 'light_material_cost'],
      dtype='object')

In [None]:
# Light-related columns to remove from the gas dataframe.
light_columns = [
    'light_average_cost', 'light_consumption', 'light_amount',
    'average_unit_light_cost', 'average_light_bill_cost',
    'light_system_charges', 'light_transport_cost', 'light_material_cost',
]
myBaseGas.df = myBaseGas.df.drop(labels=light_columns, axis=1)

In [None]:
# Display myBaseGas.df (no .compute() is called, so only its structure is shown).
myBaseGas.df

Unnamed: 0_level_0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,gas_material_cost,gas_transport_cost
npartitions=80,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
,int64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,datetime64[ns],object,datetime64[ns],datetime64[ns],float64,float64,string,object,uint64,float64,float64,float64,float64,string,bool,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
from dask.diagnostics import ProgressBar
# Lazy per-column count of missing values in the gas dataframe.
missing_values = myBaseGas.df.isna().sum()

# .compute() evaluates the lazy expression inside the ProgressBar context, so
# progress is actually reported and percent_missing holds concrete values that
# can be filtered (e.g. percent_missing[percent_missing == 100]).
with ProgressBar():
  percent_missing = ((missing_values / myBaseGas.df.index.size) * 100).compute()
percent_missing

bill_id                  0.000000
F1_kWh                   0.000000
F2_kWh                   0.000000
F3_kWh                   0.000000
date                     0.089127
light_start_date         0.089127
light_end_date           0.089127
tv                       0.000000
gas_amount               0.000000
gas_average_cost         8.591053
emission_date            0.000000
supply_type              0.000000
gas_start_date           0.089127
gas_end_date             0.089127
extra_fees               0.000000
gas_consumption          0.000000
gas_offer                0.000000
light_offer_type         0.000000
light_offer              0.000000
howmuch_pay              0.000000
total_amount             0.000000
average_unit_gas_cost    8.591053
average_gas_bill_cost    8.577107
billing_frequency        7.806147
bill_type                0.000000
gas_system_charges       0.664370
gas_material_cost        0.572447
gas_transport_cost       0.653168
dtype: float64

In [None]:
# Rows where gas_average_cost is missing.
# NOTE(review): NaN compares unequal to everything, so whenever
# gas_average_cost is NaN the `!=` clause is already True and the mask reduces
# to gas_average_cost.isna(). Confirm whether the intent was to find rows where
# the two cost columns genuinely differ.
myBaseGas.df[(myBaseGas.df['average_unit_gas_cost'] != myBaseGas.df['gas_average_cost']) & myBaseGas.df['gas_average_cost'].isna()]

Unnamed: 0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,gas_material_cost,gas_transport_cost
0,0,0.0,0.0,0.0,2018-04-16,2018-04-16,2019-11-25,0.0,-0.48,,2020-12-31,gas,2019-11-25,2018-04-16,0.48,-2.0,4255330384700204909,light,18446744073709551615,0.00,0.00,,,,True,-0.06,0.06,-0.20
88001,0,0.0,0.0,0.0,2021-02-03,2021-02-03,2021-03-30,0.0,11.49,,2021-03-30,gas,2021-03-30,2021-02-03,0.55,-3.0,4255330384700204909,light,18446744073709551615,0.00,12.04,,,bimester,False,-2.26,6.24,5.53
88003,2,0.0,0.0,0.0,2021-02-06,2021-02-06,2021-03-30,0.0,32.33,,2021-03-30,gas,2021-03-30,2021-02-06,0.00,0.0,16421179386027010504,light,18446744073709551615,32.33,32.33,,,,False,-0.64,4.73,22.41
88009,8,0.0,0.0,0.0,2020-07-02,2020-07-02,2020-07-02,0.0,0.00,,2021-03-30,gas,2020-07-02,2020-07-02,0.00,0.0,16421179386027010504,light,18446744073709551615,0.00,0.00,,,,False,,,
88010,9,0.0,0.0,0.0,2019-12-31,2019-12-31,2019-12-31,0.0,0.00,,2021-03-30,gas,2019-12-31,2019-12-31,0.00,0.0,6166237371541002459,light,18446744073709551615,0.00,0.00,,,,False,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
86475,2529529,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,32.54,,2021-03-26,gas,2021-03-26,2020-12-04,0.59,-7.0,13506450504303769484,light,18446744073709551615,33.13,33.13,,,quarterly,False,-6.70,19.11,14.39
86478,2529532,0.0,0.0,0.0,2021-02-10,2021-02-10,2021-03-26,0.0,18.39,,2021-03-26,gas,2021-03-26,2021-02-10,1.91,0.0,7893399263570046800,light,18446744073709551615,20.30,20.30,,,quarterly,False,-2.18,8.08,9.17
86481,2529535,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,75.22,,2021-03-26,gas,2021-03-26,2020-12-04,1.52,-4.0,14283211135448165385,light,18446744073709551615,76.74,76.74,,,quarterly,False,-6.68,41.03,27.39
86485,2529539,0.0,0.0,0.0,2020-12-04,2020-12-04,2021-03-26,0.0,31.57,,2021-03-26,gas,2021-03-26,2020-12-04,0.58,-10.0,4255330384700204909,light,18446744073709551615,32.15,32.15,,,quarterly,False,-6.77,18.52,14.25


In [None]:
# Take the first 10 rows of myBase.df as a small sample to experiment on.
# NOTE(review): the sample comes from myBase.df, not myBaseGas.df — confirm.
provaGas = myBase.df.head(10)


In [None]:
# Gas cost columns to inspect side by side.
gas_cost_columns = [
    'average_gas_bill_cost', 'gas_amount', 'gas_consumption',
    'average_unit_gas_cost', 'gas_system_charges', 'gas_transport_cost',
    'gas_material_cost', 'extra_fees',
]
# Keep only the sample rows that have a gas_average_cost value.
has_gas_average_cost = provaGas['gas_average_cost'].notna()
provaGasComputed = provaGas[has_gas_average_cost][gas_cost_columns]
#provaGas['check'] = provaGas[(myBaseGas.df['gas_system_charges'] + provaGas['gas_transport_cost']) == provaGas['gas_amount']]
provaGasComputed

Unnamed: 0,average_gas_bill_cost,gas_amount,gas_consumption,average_unit_gas_cost,gas_system_charges,gas_transport_cost,gas_material_cost,extra_fees
3,1.08,58.51,54.0,0.44,-0.88,17.66,23.68,0.02
4,0.86,383.66,447.0,0.32,14.71,63.59,141.57,0.0
5,0.86,386.63,448.0,0.37,16.97,44.45,167.54,-188.86
6,0.65,15.69,24.0,0.22,1.16,2.4,5.22,7.9
7,1.34,44.17,33.0,0.82,-3.36,11.32,27.19,0.0
9,3.6,79.14,22.0,2.33,-10.78,24.2,51.36,0.0


In [None]:
# Display the value returned by the cast; it is not assigned back to myBase.df here.
myBase.cast_columns_types(mismatchedTypesCorrect)

Unnamed: 0_level_0,bill_id,F1_kWh,F2_kWh,F3_kWh,date,light_start_date,light_end_date,tv,gas_amount,gas_average_cost,light_average_cost,emission_date,supply_type,gas_start_date,gas_end_date,extra_fees,gas_consumption,light_consumption,gas_offer,light_offer_type,light_offer,howmuch_pay,total_amount,light_amount,average_unit_light_cost,average_light_bill_cost,average_unit_gas_cost,average_gas_bill_cost,billing_frequency,bill_type,gas_system_charges,light_system_charges,gas_material_cost,light_transport_cost,gas_transport_cost,light_material_cost
npartitions=80,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1
,int64,float64,float64,float64,datetime64[ns],datetime64[ns],datetime64[ns],float64,float64,float64,float64,datetime64[ns],object,datetime64[ns],datetime64[ns],float64,float64,float64,float64,object,uint64,float64,float64,float64,float64,float64,float64,float64,string,bool,float64,float64,float64,float64,float64,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [None]:
# This column only takes these 3 values; it may be worth one-hot encoding it or mapping the values as 0: gas, 1: luce, 2: gas e luce (?)

myBase.df['supply_type'].drop_duplicates().compute()

0           gas
1          luce
2    gas e luce
Name: supply_type, dtype: object

In [None]:
# Select the total_amount values that equal light_amount + gas_amount + extra_fees.
# NOTE(review): exact float equality is used, and a NaN in any addend makes the
# sum NaN, so such rows never match — confirm this is intended.
# No .compute() is called here, so the selection stays lazy.
myBase.df.total_amount[myBase.df.total_amount == myBase.df.light_amount + myBase.df.gas_amount + myBase.df.extra_fees]

In [None]:
# Costs must be non-negative | ask at the meeting
# fillna(0) on the cost columns
# locate outliers either graphically or with min and max
# sorting? ask at the meeting
# conditions on the columns
# analyse bill_id and user_id for the merge
# what to decode with
# how to create a reference