In [1]:
import multiprocessing
import os

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from psutil import virtual_memory
from tabulate import tabulate

# Parent of the current working directory; the data folder is expected to sit
# directly under it (i.e. the notebook is launched from a sub-folder of the repo).
# The trailing '/' is kept so that `base_dir + 'some/path'` keeps working.
base_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + '/'

# NOTE(review): unpickling executes arbitrary code — only load a pickle produced
# by this project's own pipeline, never one obtained from an untrusted source.
dataset_path = os.path.join(base_dir, '0_datasets', 'REC_database_optimized_cleaned.pkl')
df = pd.read_pickle(dataset_path)

# Lift the row cap so long tables are rendered in full.
# Caution: `df` holds millions of rows — always slice (e.g. `.head()`) before displaying it.
pd.set_option('display.max_rows', None)

# Check the machine specs (CPU count, total RAM) ahead of any multiprocessing work.
print("CPU cores: ", multiprocessing.cpu_count())
print("RAM: ", round(virtual_memory().total / 1e9, 0), "GB")

CPU cores:  8
RAM:  34.0 GB


In [2]:
# Print the frame's structural summary (index range, column names and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7190252 entries, 0 to 7190251
Data columns (total 17 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   Owner                        category      
 1   Accreditation_Code           string        
 2   Fuel_Source_Active           bool          
 3   Fuel_Source_Name             category      
 4   Generation_Year              category      
 5   Status                       category      
 6   Start_Serial                 int32         
 7   End_Serial                   int32         
 8   State                        category      
 9   Owner_Name                   category      
 10  Range_ID                     int32         
 11  Public_Registered_Person_ID  int32         
 12  Created_By                   category      
 13  Creation_Date                datetime64[ns]
 14  Certificates_Quantity        int32         
 15  Dominent_Zone                category      
 16  

In [3]:
# Check the categories and their frequencies for every categorical variable.
# Dtypes are read from `df.dtypes`, so no column data is touched while scanning.
categorical_variables_list = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if isinstance(column_dtype, pd.CategoricalDtype)
]

for column_name in categorical_variables_list:
    # Absolute counts per category, flattened into a two-column frame for tabulate.
    category_counts = (
        df[column_name]
        .value_counts(normalize=False)
        .to_frame()
        .reset_index()
    )
    # Only the 50 most frequent categories are printed to keep the output readable.
    print(column_name, '\n', tabulate(category_counts.head(50), floatfmt='.0f'))

Owner 
 --  ----------------------------------------------------------------  ------
 0  Origin Energy Electricity Limited                                 961698
 1  The Clean Energy Regulator                                        544697
 2  EnergyAustralia Pty Ltd                                           442276
 3  AGL Sales Pty Limited                                             425786
 4  Stanwell Corporation Limited                                      268704
 5  Ergon Energy Queensland Pty Ltd                                   211357
 6  ERM Power Retail Pty Ltd                                          199882
 7  AGL HP1 Pty Limited, AGL HP2 Pty Limited and AGL HP3 Pty Limited  185165
 8  Aurora Energy Pty Ltd                                             179996
 9  Australia and New Zealand Banking Group Limited                   166227
10  SHELL ENERGY RETAIL PTY LTD                                       158821
11  Ausgrid                                                         

In [4]:
# Preview the first 50 rows to eyeball the actual values behind each column.
df.head(50)

Unnamed: 0,Owner,Accreditation_Code,Fuel_Source_Active,Fuel_Source_Name,Generation_Year,Status,Start_Serial,End_Serial,State,Owner_Name,Range_ID,Public_Registered_Person_ID,Created_By,Creation_Date,Certificates_Quantity,Dominent_Zone,System_Size
0,Ashley Noon T/A Green Wiring,PVD2393168,True,S.G.U. - solar (deemed),2015,Registered,1,105,WA,Ashley Noon T/A Green Wiring,5404590,10412,Ashley Noon T/A Green Wiring,2016-01-23,105,3,5.1
1,Solargain PV Pty Ltd,PVD2393190,True,S.G.U. - solar (deemed),2015,Invalid due to audit,1,64,ACT,Solargain PV Pty Ltd,5404628,10894,Solargain PV Pty Ltd,2016-01-24,64,3,3.1
2,Home Comfort and Sustainability Services Pty L...,PVD2393171,True,S.G.U. - solar (deemed),2015,Invalid due to audit,1,126,QLD,Home Comfort and Sustainability Services Pty L...,5404633,18516,Home Comfort and Sustainability Services Pty L...,2016-01-24,126,3,6.1
3,Home Comfort and Sustainability Services Pty L...,PVD2393172,True,S.G.U. - solar (deemed),2015,Invalid due to audit,1,417,QLD,Home Comfort and Sustainability Services Pty L...,5404634,18516,Home Comfort and Sustainability Services Pty L...,2016-01-24,417,3,20.1
4,Michael patterson,PVD2393173,True,S.G.U. - solar (deemed),2015,Invalid due to audit,1,53,NSW,Michael patterson,5404635,11991,Michael patterson,2016-01-24,53,3,2.6
5,Solargain PV Pty Ltd,PVD2393200,True,S.G.U. - solar (deemed),2015,Invalid due to audit,1,819,WA,Solargain PV Pty Ltd,5404644,10894,Solargain PV Pty Ltd,2016-01-24,819,3,39.5
6,Advance Finance Solutions Pty Ltd,PVD2393208,True,S.G.U. - solar (deemed),2016,Invalid due to audit,1,310,WA,Advance Finance Solutions Pty Ltd,5404732,19047,Advance Finance Solutions Pty Ltd,2016-01-24,310,3,15.0
7,Isquith Pty Ltd T/A Solar Link,PVD2393444,True,S.G.U. - solar (deemed),2015,Registered,1,88,VIC,Isquith Pty Ltd T/A Solar Link,5404748,13072,Isquith Pty Ltd T/A Solar Link,2016-01-25,88,4,5.0
8,Chromagen Australia Pty Ltd,SW2393250,True,S.W.H. - solar (deemed),2015,Invalid due to audit,1,41,QLD,Chromagen Australia Pty Ltd,5404790,20507,Chromagen Australia Pty Ltd,2016-01-24,41,3,0.0
9,C S Energy Limited,WCMGQL01,True,Waste coal mine gas,2013,Registered,124968,124989,QLD,C S Energy Limited,5404811,17924,EDL CSM (QLD) Pty Ltd,2013-07-29,22,3,0.0
