In [1]:
import pandas as pd
import numpy as np
import time
import datetime
import os
import requests
from bs4 import BeautifulSoup as bs
import re
from urllib.parse import unquote
# import PyPDF2 as p2
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

### read csv master file with Project Name and link to ESA folder(s) that contain the ESA files
### 85 rows, because some projects have multiple folder (which contain the ESA documents)  

In [2]:
path_to_master_file = 'F:/Environmental Baseline Data/Version 4 - Final/Support files/list_of_Section_52_projects.csv'
master = pd.read_csv(path_to_master_file, encoding='ISO-8859-1')
master['folder_link'] = master['link to folder with ESA'].str.replace('/View/', '/LoadResult/')
print('shape of csv:', master.shape)
print('numer of unique project:', master['Application title'].nunique())
# master.head()
# master['Application title'].unique()

shape of csv: (88, 7)
numer of unique project: 38


### get all RegDocs files

In [3]:
# %%time Wall time: 32.9 s

all_esa = []
list_applications = master['Application title'].tolist()
list_downloads = master['folder_link'] 

for a, b in zip(list_downloads, list_applications):
    cookies = {'RDI-NumberOfRecords': '200'}
    r = requests.get(a, cookies = cookies)
    soup = bs(r.text, 'lxml')
    tds = soup.find_all('td', {'class' : 'nopadding break-anywhere'})
    file_name = [x.text for x in soup.find_all('a', {'class' : 'row-icon'})]
    hrefs = [a['href'] for a in soup.find_all('a', {'class' : 'row-icon'})]
    df = pd.DataFrame({'file_name' : file_name, 'hrefs' : hrefs, 'application_name' : b, 'esa_folder_link' : a})
    all_esa.append(df)

In [4]:
# create download links by going to each folder and identify the ESA files (using regular expressions) 
df_esa = pd.concat(all_esa, axis = 0)
regex_esa = 'COVER|NV|EPP|EIS|ESA|nvironment|Horn|Vantage|7212_App|ocio|EIA|App 13|V11|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|A1X1|A1C3|A3S1|A3S2|A0X|Goldboro'
df_esa = df_esa[df_esa['file_name'].str.contains(regex_esa)]
df_esa['DocumentID'] = df_esa['file_name'].str.extract('([a-zA-Z]\d[a-zA-Z]\d[a-zA-Z]\d)')
df_esa['DataID'] = df_esa['hrefs'].str.replace('/REGDOCS/File/Download/', '')
df_esa['esa_download_link'] = df_esa['hrefs'].str.replace('/REGDOCS/', 'https://apps.cer-rec.gc.ca/REGDOCS/')
df_esa = df_esa.drop('hrefs', axis = 1)
df_esa['DataID_pdf'] = df_esa['DataID'] + '.pdf'

# remove receipts
df_esa = df_esa[~df_esa['file_name'].str.contains('eceipt')]
df_esa.head(2)
df_esa['application_name'].nunique()
df_esa = df_esa.drop_duplicates()
df_esa['DataID'].nunique()
df_esa.shape
# df_esa.to_csv('test.csv')

Unnamed: 0,file_name,application_name,esa_folder_link,DocumentID,DataID,esa_download_link,DataID_pdf
5,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,2003-03-17 Application to Construct and Operat...,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,268706,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,268706.pdf
6,A0H8C1 - 13.0 EIA - Section 13.7 Wildlife Part 1,2003-03-17 Application to Construct and Operat...,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C1,268709,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,268709.pdf


38

1994

(1994, 7)

In [5]:
#delete these files
# B-1S  - Section 12 - Environmental and Socio-Economic Matters  - A2F4K6
# B-1T - Horn_River_Komie_North_Overview_Map_2011-09-29  - A2F4K7 
# B-1-g -- A0U3G2 - Environmental and Socio-Economic Assessment - Volume 2 of 2 (Paper Only)
# B1-15 - 12.0 Environmental and Socio-Economic - A2A6Q1



In [6]:
# %%time
# # downloads all files
# # ****************************************************************** 1 hour execution 

# download_link = df_esa['esa_download_link'].tolist()
# DataID = df_esa['DataID'].tolist()
# save_folder = 'F:/Environmental Baseline Data/Version 4 - Final/PDF/'

# for x, y in zip(download_link, DataID):
#     try:
#         download_url = 'http://docs2.cer-rec.gc.ca/ll-eng/llisapi.dll?func=ll&objId=' + y + '&objaction=download&viewType=1'
#         r = requests.get(download_url)
#         full_name = os.path.join(save_folder, (y +'.pdf')) 
#         with open(full_name, 'wb') as file:
#             file.write(r.content) 
#     except:
#         print(x, y)


In [7]:
# download all files per hearing
# *******************************************************************
# per_project = df_esa[df_esa['application_name'] == '2010-02-19 - Application for the Horn River Project (GH-2-2010)']

# download_link = per_project['esa_download_link'].tolist()
# DataID = per_project['DataID'].tolist()
# save_folder = 'F:/Environmental Baseline Data/Version 4 - Final/PDF/'
# len(download_link)
# len(DataID)

# for x, y in zip(download_link, DataID):
#     download_url = 'http://docs2.cer-rec.gc.ca/ll-eng/llisapi.dll?func=ll&objId=' + y + '&objaction=download&viewType=1'
#     r = requests.get(download_url)
#     full_name = os.path.join(save_folder, (y +'.pdf')) 
#     with open(full_name, 'wb') as file:
#         file.write(r.content) 

## load GIS info and merge

In [8]:
gis_df = pd.read_excel('F:/Environmental Baseline Data/Version 4 - Final/Indices/Index 1 - List of Major Projects with ESAs.xlsx')
gis_df['Application title short'].nunique()
gis_df.shape
df_esa_with_gis = df_esa.merge(gis_df, on = 'application_name', how = 'left')
df_esa_with_gis = df_esa_with_gis.drop(columns = 'Location')
df_esa_with_gis.shape
df_esa_with_gis['Application title short'].nunique()

38

(38, 17)

(1994, 22)

38

## load component info and merge

In [9]:
esa_component_path = 'F:/Environmental Baseline Data/Version 4 - Final/Support files/Appendices mapped with files per Project.xlsx'
esa_component_names = pd.read_excel(esa_component_path, sheet_name = 'new')
esa_component_names['Application title short'].nunique()

38

In [10]:
#load component and merge 
df_esa_with_gis_components = df_esa_with_gis.merge(esa_component_names, on = 'file_name', how = 'left')
df_esa_with_gis_components = df_esa_with_gis_components.drop('Application title short_y', axis = 1)
df_esa_with_gis_components = df_esa_with_gis_components.rename(columns = {'Application title short_x' : 'Application title short'})
 
df_esa_with_gis_components.shape
df_esa_with_gis_components['application_name'].nunique()

(1994, 25)

38

## load short_names and merge with regdocs metadata

In [11]:
short_names = pd.read_csv('F:/Environmental Baseline Data/Version 4 - Final/Support files/short_names_update.csv')
short_names['application_name'].nunique()
df_esa_with_gis_components_short_names = df_esa_with_gis_components.merge(short_names, on = 'application_name', how = 'left')
df_esa_with_gis_components_short_names.shape
# df_esa_with_gis_components_short_names['Application title short'].unique()
df_esa_with_gis_components_short_names.to_csv('with_keystone.csv', encoding = 'utf_8_sig', index = False)
# df_esa_with_gis_components_short_names.head()
# df_esa_with_gis_components_short_names.tail()

38

(1994, 29)

## load PDF Section Topics and merge with regdocs metadata

In [13]:
df_topics = pd.read_csv("F:\Environmental Baseline Data\Version 4 - Final\Support files\Table_Title_Topics_for_Brooke.csv",\
                        usecols = ['file_name', 'topics'])
df_topics['topics'] = df_topics['topics'].str.replace("\[\]", 'Other, All')
df_topics['topics'] = df_topics['topics'].str.replace("'", '').str.replace('[', '').str.replace(']', ', All')
df_topics['topics'] = df_topics['topics'].str.title()
# df_topics['topics'].unique()
# df_topics['topics'].nunique()
# df_topics.columns
# df_topics.head()

df_esa_with_gis_components_short_names_topics = df_esa_with_gis_components_short_names.merge(df_topics, on = 'file_name', how = 'left') 
df_esa_with_gis_components_short_names_topics.shape
df_esa_with_gis_components_short_names_topics['application_name'].nunique()


(1994, 30)

38

## load PDF size, page count, outline and merge
=REGDOCS metadata

In [14]:
pdf_size_df = pd.read_csv('F:/Environmental Baseline Data/Version 4 - Final/PDF-size-numpages-outlineboolean/PDF-size-numpages-outlineboolean.csv')
#May 4: file above needs to be updated

pdf_size_df = pdf_size_df.drop('Unnamed: 0', axis = 1)
pdf_size_df['file_name'] = pdf_size_df['file_name'].str.replace('F:/Environmental Baseline Data/Version 4 - Final/PDF/', '')
pdf_size_df['file_name'] = pdf_size_df['file_name'].str.replace('.pdf', '')
pdf_size_df = pdf_size_df.rename(columns = {'file_name' : 'DataID'})

In [15]:
df_esa_with_gis_components_short_names_topics_page_count = df_esa_with_gis_components_short_names_topics.merge(pdf_size_df, on = 'DataID', how = 'left')
df_esa_with_gis_components_short_names_topics_page_count.shape
# df_esa_with_gis_components_short_names_topics_page_count.columns
# df_esa_with_gis_components_short_names_topics_page_count.head(2)
df_esa_with_gis_components_short_names_topics_page_count['Application title short'].nunique()

(1994, 33)

38

## Load Tables data and merge with Regdocs

In [16]:
df_table_titles = pd.read_csv('F:/Environmental Baseline Data/Version 4 - Final/Saved2/all_tables-final.csv')
# From Janna 4 May 2020
# F:\Environmental Baseline Data\Version 4 - Final\Saved2\all_tables-final.csv
# F:\Environmental Baseline Data\Version 4 - Final\Saved2\final_figs_pivoted_new.csv

df_table_titles.shape
df_table_titles = df_table_titles.dropna(subset = ['titleFinal']) #drop those with no title
df_table_titles.shape
df_table_titles = df_table_titles[['titleFinal', 'pdfId', 'page', 'tableNumber']]
df_table_titles = df_table_titles.rename(columns = {'titleFinal' : 'Title', 'pdfId' : 'DataID', \
                                                    'page' : 'Page Number', 'tableNumber' : 'Table Number'})
df_table_titles['Category'] = 'Table'
df_table_titles['DataID'] = df_table_titles['DataID'].astype(str)
df_table_titles.head()


(44629, 8)

(28891, 8)

Unnamed: 0,Title,DataID,Page Number,Table Number,Category
0,TABLE 3 SUMMARY OF AQUATICS FIELD WORK AND ABO...,1059614,14,1,Table
1,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,1059614,17,1,Table
2,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,1059614,18,1,Table
3,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,1059614,19,1,Table
4,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG...,1059614,20,1,Table


In [43]:
tables_df_regdocs = df_table_titles.merge(df_esa_with_gis_components_short_names_topics_page_count,
                                           on = ['DataID'], how = 'left')
tables_df_regdocs.shape
tables_df_regdocs['Title'].nunique()
tables_df_regdocs['application_name'].nunique()

(28891, 37)

14459

38

## Load Figures data and merge with Regdocs

In [44]:
figs = pd.read_csv('F:/Environmental Baseline Data/Version 4 - Final/Support files/figures_for_tableau.csv')
figs['location_Page'] = pd.to_numeric(figs['location_Page'].fillna(0), errors='ignore', downcast = 'integer')
figs['location_DataID'] = pd.to_numeric(figs['location_DataID'].fillna(0), errors='ignore', downcast = 'integer')
figs.columns = ['Title', 'short_name', 'DataID', 'Page Number']
figs = figs.drop('short_name', axis = 1)
figs['Category'] = 'Figure'
figs['DataID'] = figs['DataID'].astype(str)
figs.shape
figs.isnull().sum()

figures_df_regdocs = figs.merge(df_esa_with_gis_components_short_names_topics_page_count,
                                           on = ['DataID'], how = 'left')

figures_df_regdocs= figures_df_regdocs[~figures_df_regdocs['file_name'].isnull()] # May 4th - why ??????

figures_df_regdocs.shape
# figures_df_regdocs.isnull().sum()
figures_df_regdocs['Title'].nunique()

(9136, 4)

Title          0
DataID         0
Page Number    0
Category       0
dtype: int64

(9134, 36)

3797

## Inspection file

In [45]:
inspection = figures_df_regdocs
inspection = inspection.rename(columns = {'Application title short' : 'Application Name', 'short_name' : 'Application Short Name', \
                              'file_name' : 'File Name', 'esa_folder_link' : 'ESA Folder URL', \
                               'esa_download_link' : 'PDF Download URL', 'Section' : 'Application Type (NEB Act)', \
                                'Column1' : 'Pipeline Status', 'Regulatory Instruments Issued' : 'Regulatory Instrument(s)', \
                              'Link to Application' : 'Application URL', 'Link to Decision' : 'Decision URL', \
                              'Component PDF Name' : 'ESA Section(s)', 'download_links' : 'Component download URL', \
                              'Page Number' : 'PDF Page#', 'Category' : 'Content Type', 'Application filing date' : 'Application Filing Date', \
                                'DocumentID' : 'Document Number', 'DataID' : 'Data ID', 'Location' : 'Pipeline Location', \
                            'Component Index' : 'ESA Section(s) Index', 'topics' : 'ESA Section(s) Topics', \
                            'download_csv' : 'CSV Download URL', 'download_jpg' : 'JPG Download URL', 'Page Number' : 'PDF Page Number', \
                                'page_count' : 'PDF Page Count', 'size (MB)' : 'PDF Size', 'Contains outline?' : 'PDF Outline',\
                                             'zip_folder_url' : 'Zipped Project Link'})

inspection['Data ID'] = inspection['Data ID'].astype(str)
inspection['PDF Page Number'] = inspection['PDF Page Number'].astype(str)

pdf_folder = 'F:/Environmental Baseline Data/Version 4 - Final/PDF/' + inspection['Data ID'] + '.pdf'
jpg_folder = 'F:/Environmental Baseline Data/Web/pdf_images/' + inspection['Data ID'] + '/' + inspection['PDF Page Number'] + '.jpg'
inspection.shape
inspection['pdf_path'] = pdf_folder
inspection['jpg_path'] = jpg_folder
inspection.shape

inspection.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/Inspection.csv', \
                     encoding = 'utf_8_sig', index = False)

(9134, 36)

(9134, 38)

## Combined Figures Regdocs and Tables Regdocs data

In [46]:
tables_figures_regdocs = pd.concat([figures_df_regdocs, tables_df_regdocs], sort = True)          
tables_figures_regdocs['DataID'] = tables_figures_regdocs['DataID'].astype(str)
tables_figures_regdocs['Page Number'] = tables_figures_regdocs['Page Number'].astype(str)
tables_figures_regdocs['Table Number'] = tables_figures_regdocs['Table Number'].astype(str)
tables_figures_regdocs.shape #(31048, 37)

(38025, 37)

In [47]:
#create download strings
csv = 'http://www.cer-rec.gc.ca/esa-ees/' +  tables_figures_regdocs['Download folder name'] + '/' + tables_figures_regdocs['DataID'] + '_' + tables_figures_regdocs['Page Number'] + '_' + tables_figures_regdocs['Table Number'] + '.csv'

zip_folder = 'http://www.cer-rec.gc.ca/esa-ees/' +  tables_figures_regdocs['Download folder name'] + '.zip' 

tables_figures_regdocs['download_csv'] = csv
tables_figures_regdocs['zip_folder_url'] = zip_folder

tables_figures_regdocs.loc[tables_figures_regdocs['download_csv'].str.contains('nan', na=False), 'download_csv'] = np.nan 
tables_figures_regdocs['download_csv'] = tables_figures_regdocs['download_csv'].str.replace('.0.csv', '.csv')

tables_figures_regdocs['download_csv'].to_csv('test.csv')
tables_figures_regdocs.shape
# (31048, 39)


(38025, 39)

## ESA Dataset FINAL for Tableau

In [48]:
esa_final_df = tables_figures_regdocs[['Title', 'Category', 'Application title short', 'short_name', 'Application filing date', 'Company Name',
                                 'Commodity', 'file_name', 'esa_folder_link', 'DocumentID', 'DataID', 
                                  'esa_download_link','Section', 'Location', 'Hearing order', 'Consultant Name', 'Column1' , 
                                  'Regulatory Instruments Issued', 'Link to Application', 'Link to Decision',
                                 'Component PDF Name', 'Component Index', 'topics','download_csv', 'Page Number',
                                'page_count', 'size (MB)', 'Contains outline?', 'Download folder name', 'zip_folder_url']]

esa_final_df = esa_final_df.rename(columns = {'Application title short' : 'Application Name', 'short_name' : 'Application Short Name', \
                              'file_name' : 'File Name', 'esa_folder_link' : 'ESA Folder URL', \
                               'esa_download_link' : 'PDF Download URL', 'Section' : 'Application Type (NEB Act)', \
                                'Column1' : 'Pipeline Status', 'Regulatory Instruments Issued' : 'Regulatory Instrument(s)', \
                              'Link to Application' : 'Application URL', 'Link to Decision' : 'Decision URL', \
                              'Component PDF Name' : 'ESA Section(s)', 'download_links' : 'Component download URL', \
                              'Page Number' : 'PDF Page#', 'Category' : 'Content Type', 'Application filing date' : 'Application Filing Date', \
                                'DocumentID' : 'Document Number', 'DataID' : 'Data ID', 'Location' : 'Pipeline Location', \
                            'Component Index' : 'ESA Section(s) Index', 'topics' : 'ESA Section(s) Topics', \
                            'download_csv' : 'CSV Download URL', 'Page Number' : 'PDF Page Number', \
                                'page_count' : 'PDF Page Count', 'size (MB)' : 'PDF Size', 'Contains outline?' : 'PDF Outline',\
                                             'zip_folder_url' : 'Zipped Project Link'})
                                                                                       
                                            
# esa_final_df.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Tableau files/Prototype/jagoda edits/ESA_data.csv', \
#                     encoding = 'utf_8_sig')

# esa_final_df.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_data.csv', \
#                     encoding = 'utf_8_sig')
esa_final_df.columns
len(esa_final_df.columns)
esa_final_df.shape

Index(['Title', 'Content Type', 'Application Name', 'Application Short Name',
       'Application Filing Date', 'Company Name', 'Commodity', 'File Name',
       'ESA Folder URL', 'Document Number', 'Data ID', 'PDF Download URL',
       'Application Type (NEB Act)', 'Pipeline Location', 'Hearing order',
       'Consultant Name', 'Pipeline Status', 'Regulatory Instrument(s)',
       'Application URL', 'Decision URL', 'ESA Section(s)',
       'ESA Section(s) Index', 'ESA Section(s) Topics', 'CSV Download URL',
       'PDF Page Number', 'PDF Page Count', 'PDF Size', 'PDF Outline',
       'Download folder name', 'Zipped Project Link'],
      dtype='object')

30

(38025, 30)

## ESA FLAT FILE ENG

In [49]:
esa_ENG = esa_final_df
esa_ENG['Application Name'].unique()
esa_ENG['Decision URL'].isnull().sum()

esa_ENG['Pipeline Location'] = esa_ENG['Pipeline Location'].str.replace(', All', '')
esa_ENG['ESA Section(s) Topics'] = esa_ENG['ESA Section(s) Topics'].str.replace(', All', '')
esa_ENG.shape
esa_ENG.head(2)
# esa_ENG.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_website_ENG.csv', encoding = 'utf_8_sig')

array(['Application to Construct and Operate Ekwan Pipeline',
       'Application for the Keystone Pipeline',
       'Application for Line 13 Transfer, Line 13 Reversal and Capacity Replacement for the Southern Lights Project',
       'Application for the Alberta Clipper Expansion Project',
       'Application for the Line 4 Extension Project',
       'Application for the Cushing Expansion',
       'Application for Redwillow Pipeline Project',
       'Application to construct and operate the South Peace Pipeline Project',
       'Application for the Keystone XL Pipeline',
       'Application for the Groundbirch Pipeline Project',
       'Application for the Horn River Project',
       'Application for the Enbridge Northern Gateway Pipeline Project',
       'Application for Bakken Pipeline Project Canada',
       'Application for the Vantage Pipeline Project',
       'Application for the Northwest Mainline Expansion',
       'Application for the Leismer to Kettle River Crossover',
     

8729

(38025, 30)

Unnamed: 0,Title,Content Type,Application Name,Application Short Name,Application Filing Date,Company Name,Commodity,File Name,ESA Folder URL,Document Number,...,ESA Section(s),ESA Section(s) Index,ESA Section(s) Topics,CSV Download URL,PDF Page Number,PDF Page Count,PDF Size,PDF Outline,Download folder name,Zipped Project Link
0,Figure 13.1-1 EnCana Ekwan Pipeline,Figure,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,...,"Section 13.1: Introduction, Section 13.1: Proj...",1.0,"Land, Air, Vegetation",,26,107.0,1.41,Yes,kwn,http://www.cer-rec.gc.ca/esa-ees/kwn.zip
1,Figure 13.3-1 CEA Framework,Figure,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,...,"Section 13.1: Introduction, Section 13.1: Proj...",1.0,"Land, Air, Vegetation",,41,107.0,1.41,Yes,kwn,http://www.cer-rec.gc.ca/esa-ees/kwn.zip


## ESA FLAT FILE FRA

In [50]:
esa_FRA = esa_ENG
esa_FRA = esa_FRA.rename(columns = {'Title' : 'Titre', 'Content Type' : 'Type de contenu', 'Application Name' : 'Nom de la demande', \
                                   'Application Short Name' : 'Nom abrégé de la demande', 'Application Filing Date' : 'Dépôt de la demande', \
                                   'Company Name' : 'Nom de la société', 'Commodity' : 'Produit de base', 'File Name' : 'Nom de fichier', \
                                   'ESA Folder URL' : 'URL du dossier de l’ÉES', 'Document Number': 'Numéro de document', \
                                    'Data ID' : 'Identificateur de données', 'PDF Download URL' : 'URL de téléchargement PDF', \
                                    'Application Type (NEB Act)' : 'Type de demande (Loi sur l’Office national de l’énergie)', \
                                    'Pipeline Location' : 'Emplacement du pipeline', 'Hearing order' : 'Ordonnance d’audience', \
                                    'Consultant Name' : 'Nom du consultant', 'Pipeline Status' : "État d'avancement", \
                                    'Regulatory Instrument(s)' : 'Instruments réglementaires', 'Application URL' : 'URL de la demande', \
                                    'Decision URL' : 'URL de la décision', 'ESA Section(s)' : 'Sections de l’EES', \
                                    'ESA Section(s) Index' : 'Index des sections de l’ÉES', 'ESA Section(s) Topics' : 'Sujets des sections de l’ÉES' , \
                                    'CSV Download URL' : 'URL de téléchargement CSV', 'PDF Page Number' : 'Numéro de page PDF', \
                                    'PDF Page Count' : 'Nombre de pages PDF', 'PDF Size' : 'Taille PDF' , 'PDF Outline' : 'Aperçu PDF', \
                                    'Download folder name' : 'Télécharger le nom du dossier',  'Zipped Project Link' : 'Lien vers le projet compressé'})

esa_FRA['Emplacement du pipeline'] = esa_FRA['Emplacement du pipeline'].str.replace('British Columbia', 'Colombie britannique')\
            .str.replace('Northwest Territories', 'Territoires du nord-ouest').str.replace('Quebec', 'Québec')\
            .str.replace('New Brunswick', 'Nouveau-Brunswick').str.replace('Nova Scotia', 'Nouvelle-Écosse')

esa_FRA['Type de demande (Loi sur l’Office national de l’énergie)'] = esa_FRA['Type de demande (Loi sur l’Office national de l’énergie)']\
        .str.replace('Pipeline Abandonment', 'Projets de cessation d’exploitation de pipeline')\
        .str.replace('Large Projects \(over 40 km\)', 'Projets de grande envergure (plus de 40 km)')\
        .str.replace('Small Projects \(under 40 km\)', 'Petits projets pipeliniers (moins de 40 km)')

esa_FRA['Type de contenu'] = esa_FRA['Type de contenu'].str.replace('Table', 'Tableau')

esa_FRA['Produit de base'] = esa_FRA['Produit de base'].str.replace('Gas', 'Gaz').str.replace('Oil', 'Pétrole')

esa_FRA["État d'avancement"] = esa_FRA["État d'avancement"].str.replace('Operating', 'En exploitation' )\
        .str.replace('Abandonment Pending', 'Cessation d’exploitation en instance')\
        .str.replace('Approved', 'Demande approuvée')\
        .str.replace('Revoked Certificate', 'Certificats révoqués')\
        .str.replace('Withdrawn', 'Demande retirée')\
        .str.replace('Rescinded Certificates', 'Certificat annulé')\
        .str.replace('Applied', 'Demande présentée')

esa_FRA['Sujets des sections de l’ÉES'] = esa_FRA['Sujets des sections de l’ÉES'].str.replace('Water', 'Eau')\
    .str.replace('Land', 'Terres').str.replace('Wildlife', 'Faune').str.replace('Vegetation', 'Végétation')\
    .str.replace('Human', 'Aspect humain').str.replace('Technology', 'Aspect technique')\
    .str.replace('Alignment Sheet','Carte-tracé')\
    .str.replace('Environment Protection Plan', 'Plan de protection de l’environnement')\
    .str.replace('Traditional Knowledge', 'Utilisation des terres à des fins traditionnelles')

esa_FRA['Nom de la demande'] = esa_FRA['Nom de la demande'].str.replace('Application for the Construction of North Corridor Expansion Project', 'Demande visant la construction du projet d’agrandissement du couloir nord')\
    .str.replace('Application for the Construction of Edson Mainline Expansion Project','Demande visant la construction du projet d’agrandissement du réseau principal à Edson')\
    .str.replace('Application for 2021 NGTL System Expansion Project', 'Demande visant le projet d’agrandissement du réseau de NGTL en 2021')\
    .str.replace('Application for Leave to Abandon Deep Panuke Pipeline', 'Demande visant la cessation d’exploitation du pipeline Deep Panuke')\
    .str.replace('Application for the Goldboro Gas Plant and 26" Gathering Pipeline Abandonment', 'Demande visant la cessation d’exploitation de l’usine à gaz Goldboro et du pipeline de collecte de 26 po')\
    .str.replace('Application for the construction of the West Path Delivery Project','Demande visant la construction du projet de livraison parcours ouest')\
    .str.replace('Application for the Spruce Ridge Program', 'Demande visant le programme Spruce Ridge')\
    .str.replace('Application for the Wyndwood Pipeline Expansion Project', 'Demande visant le projet d’agrandissement du pipeline Wyndwood')\
    .str.replace('Application for the Albersun Pipeline Asset Purchase G', 'Demande concernant l’acquisition du pipeline Albersun')\
    .str.replace('Application for the Towerbirch Expansion Project', 'Demande concernant le projet d’expansion Towerbirch')\
    .str.replace('Application for the 2017 NGTL System Expansion', 'Demande visant l’agrandissement du réseau de NGTL en 2017')\
    .str.replace('Application for the Line 3 Replacement Program', 'Demande visant le programme de remplacement de la canalisation 3')\
    .str.replace('Applications for Energy East, Asset Transfer and Eastern Mainline \(Eastern Mainline ESA\)', 'Demandes visant Énergie Est, la cession d’actifs et le Réseau principal Est (évaluation environnementale et socioéconomique du Réseau principal Est)')\
    .str.replace('Applications for Energy East, Asset Transfer and Eastern Mainline \(Energy East ESA\)', 'Demandes visant Énergie Est, la cession d’actifs et le Réseau principal Est (évaluation environnementale et socioéconomique d’Énergie Est)')\
    .str.replace('Application for the Wolverine River Lateral Loop Carmon Creek Section', 'Demande visant le tronçon Carmon Creek du doublement de la canalisation latérale Wolverine River')\
    .str.replace('Application for Trans Mountain Expansion Project', 'Demande visant le projet d’agrandissement du réseau de Trans Mountain')\
    .str.replace('Application for North Montney Project', 'Demande visant le projet North Montney')\
    .str.replace('Application for the Edmonton to Hardisty Pipeline Project', 'Demande visant le projet pipelinier d’Edmonton à Hardisty')\
    .str.replace('Application for Northwest Mainline Komie North Extension', 'Demande visant le prolongement Komie Nord du réseau principal Nord-Ouest')\
    .str.replace('Application for Line 9 Reversal Phase I Project', 'Demande relative à la première étape du projet d’inversion de la canalisation 9')\
    .str.replace('Application for the Leismer to Kettle River Crossover', 'Demande visant le pipeline de croisement de Leismer à Kettle River')\
    .str.replace('Application for the Northwest Mainline Expansion', 'Demande visant l’agrandissement du réseau principal Nord-Ouest')\
    .str.replace('Application for the Vantage Pipeline Project', 'Demande relative au projet pipelinier Vantage')\
    .str.replace('Application for Bakken Pipeline Project Canada', 'Demande visant le projet de pipeline Bakken Canada')\
    .str.replace('Application for the Enbridge Northern Gateway Pipeline Project', 'Demande visant le projet Northern Gateway d’Enbridge')\
    .str.replace('Application for the Horn River Project', 'Demande relative au projet de Horn River')\
    .str.replace('Application for the Groundbirch Pipeline Project', 'Demande relative au projet pipelinier Groundbirch')\
    .str.replace('Application for the Keystone XL Pipeline', 'Demande relative au pipeline Keystone XL')\
    .str.replace('Application to construct and operate the South Peace Pipeline Project', 'Demande visant la construction et l’exploitation du projet de pipeline South Peace')\
    .str.replace('Application for Redwillow Pipeline Project', 'Demande relative au projet pipelinier Redwillow')\
    .str.replace('Application for the Cushing Expansion', 'Demande visant l’agrandissement Cushing')\
    .str.replace('Application for the Line 4 Extension Project', 'Demande visant le projet de prolongement de la canalisation 4')\
    .str.replace('Application for the Alberta Clipper Expansion Project', 'Demande visant le projet d’agrandissement Alberta Clipper')\
    .str.replace('Application for Line 13 Transfer, Line 13 Reversal and Capacity Replacement for the Southern Lights Project', 'Demande visant le transfert de propriété et l’inversion de la canalisation 13 et le remplacement de capacité pour le projet Southern Lights')\
    .str.replace('Application for the Brunswick Pipeline Project', 'Demande relative au projet de Gazoduc Brunswick')\
    .str.replace('Application for the construction and operation of the Mackenzie Gas Pipeline', 'Demande visant la construction et l’exploitation du gazoduc Mackenzie')\
    .str.replace('Application to Construct and Operate Ekwan Pipeline', 'Demande visant la construction et l’exploitation du pipeline Ekwan')

esa_FRA['Aperçu PDF'] = esa_FRA['Aperçu PDF'].str.replace('Yes', 'Oui').str.replace('No', 'Non')
esa_FRA['URL de la décision'] = esa_FRA['URL de la décision'].str.replace('.Item.View.', '/%C3%89l%C3%A9ment/Afficher/')
# esa_FRA.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_website_FRA.csv', encoding = 'utf_8_sig')

esa_FRA.head(2)
esa_FRA.shape

Unnamed: 0,Titre,Type de contenu,Nom de la demande,Nom abrégé de la demande,Dépôt de la demande,Nom de la société,Produit de base,Nom de fichier,URL du dossier de l’ÉES,Numéro de document,...,Sections de l’EES,Index des sections de l’ÉES,Sujets des sections de l’ÉES,URL de téléchargement CSV,Numéro de page PDF,Nombre de pages PDF,Taille PDF,Aperçu PDF,Télécharger le nom du dossier,Lien vers le projet compressé
0,Figure 13.1-1 EnCana Ekwan Pipeline,Figure,Demande visant la construction et l’exploitati...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gaz,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,...,"Section 13.1: Introduction, Section 13.1: Proj...",1.0,"Terres, Air, Végétation",,26,107.0,1.41,Oui,kwn,http://www.cer-rec.gc.ca/esa-ees/kwn.zip
1,Figure 13.3-1 CEA Framework,Figure,Demande visant la construction et l’exploitati...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gaz,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,...,"Section 13.1: Introduction, Section 13.1: Proj...",1.0,"Terres, Air, Végétation",,41,107.0,1.41,Oui,kwn,http://www.cer-rec.gc.ca/esa-ees/kwn.zip


(38025, 30)

## ESA Short File for LSU

In [51]:
lsu = esa_final_df
lsu = lsu.sort_values(by = ['Application Filing Date', 'Content Type'])
lsu['ESA Section(s) Topics'] = lsu['ESA Section(s) Topics'].str.replace(', All', '')
lsu = lsu.head(300)
lsu['Application Name'].unique()
lsu.to_excel('F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_data_lsu.xlsx', \
                    encoding = 'utf_8_sig')

array(['Application to Construct and Operate Ekwan Pipeline',
       'Application for the construction and operation of the Mackenzie Gas Pipeline'],
      dtype=object)

## ESA GIT HUB FILE

In [52]:
github_df = df_esa_with_gis_components_short_names_topics_page_count[['Application title short', 'short_name', \
    'Application filing date', 'Company Name','Commodity', 'file_name', 'esa_folder_link', 'DocumentID', 'DataID', \
     'esa_download_link','Section', 'Location', 'Hearing order', 'Consultant Name', 'Column1' , \
    'Regulatory Instruments Issued', 'Link to Application', 'Link to Decision', 'Component PDF Name', 
        'Component Index']]

github_df = github_df.rename(columns = {'Application title short' : 'Application Name', 'short_name' : 'Application Short Name', \
                            'file_name' : 'File Name', 'esa_folder_link' : 'ESA Folder URL', \
                            'esa_download_link' : 'PDF Download URL', 'Section' : 'Application Type (NEB Act)', \
                            'Column1' : 'Pipeline Status', 'Regulatory Instruments Issued' : 'Regulatory Instrument(s)', \
                            'Link to Application' : 'Application URL', 'Link to Decision' : 'Decision URL', \
                            'Component PDF Name' : 'ESA Section(s)', 'download_links' : 'Component download URL', \
                            'Page Number' : 'PDF Page#', 'Application filing date' : 'Application Filing Date', \
                            'DocumentID' : 'Document Number', 'DataID' : 'Data ID', 'Location' : 'Pipeline Location', \
                            'Component Index' : 'ESA Section(s) Index'})

github_df['Pipeline Location'] = github_df['Pipeline Location'].str.replace(', All', '')

github_df.columns
len(github_df.columns)
github_df.shape
github_df.head()
github_df.shape
# github_df.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/Github_ESA_Final_CERdb.csv', \
#                      encoding = 'utf_8_sig', index = False)

github_df.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/Github_ESA_Final1.csv', \
                     encoding = 'utf_8_sig', index = False)

Index(['Application Name', 'Application Short Name', 'Application Filing Date',
       'Company Name', 'Commodity', 'File Name', 'ESA Folder URL',
       'Document Number', 'Data ID', 'PDF Download URL',
       'Application Type (NEB Act)', 'Pipeline Location', 'Hearing order',
       'Consultant Name', 'Pipeline Status', 'Regulatory Instrument(s)',
       'Application URL', 'Decision URL', 'ESA Section(s)',
       'ESA Section(s) Index'],
      dtype='object')

20

(1994, 20)

Unnamed: 0,Application Name,Application Short Name,Application Filing Date,Company Name,Commodity,File Name,ESA Folder URL,Document Number,Data ID,PDF Download URL,Application Type (NEB Act),Pipeline Location,Hearing order,Consultant Name,Pipeline Status,Regulatory Instrument(s),Application URL,Decision URL,ESA Section(s),ESA Section(s) Index
0,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C0 - 13.0 EIA - Section 13.1 to 13.6,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C0,268706,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,Large Projects (over 40 km),"Alberta, British Columbia",GH-1-2003,AXYS Environmental Consulting Ltd.,Operating,GC-108,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,"Section 13.1: Introduction, Section 13.1: Proj...",1.0
1,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C1 - 13.0 EIA - Section 13.7 Wildlife Part 1,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C1,268709,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,Large Projects (over 40 km),"Alberta, British Columbia",GH-1-2003,AXYS Environmental Consulting Ltd.,Operating,GC-108,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,Section 13.7: Wildlife and Wildlife Habitat,2.0
2,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C2 - 13.0 EIA - Section 13.7 Wildlife Part 2,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C2,268712,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,Large Projects (over 40 km),"Alberta, British Columbia",GH-1-2003,AXYS Environmental Consulting Ltd.,Operating,GC-108,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,Section 13.7: Wildlife and Wildlife Habitat,3.0
3,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C3 - 13.0 EIA - Section 13.8 to 13.13,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C3,269018,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,Large Projects (over 40 km),"Alberta, British Columbia",GH-1-2003,AXYS Environmental Consulting Ltd.,Operating,GC-108,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,"Section 13.8: Fisheries and Aquatic Resources,...",4.0
4,Application to Construct and Operate Ekwan Pip...,Ekwan,2003-03-17,EnCana Ekwan Pipeline Inc.,Gas,A0H8C4 - 13.1 App 13A - Alignment Sheets,https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadRe...,A0H8C4,269021,https://apps.cer-rec.gc.ca/REGDOCS/File/Downlo...,Large Projects (over 40 km),"Alberta, British Columbia",GH-1-2003,AXYS Environmental Consulting Ltd.,Operating,GC-108,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/2...,Appendix 13A: Environmental Alignment Sheets,5.0


(1994, 20)

### CID File

In [87]:
df_table_titles['Page Number'] = df_table_titles['Page Number'].astype(str)
df_table_titles['Table Number'] = df_table_titles['Table Number'].astype(str)
df_table_titles['csv_file'] = df_table_titles['DataID'] + '_' + df_table_titles['Page Number'] + '_lattice-v_' + df_table_titles['Table Number'] + '.csv'

tables_only = df_table_titles['csv_file'].tolist()

csv_list_path = ['F:/Environmental Baseline Data/Version 4 - Final/all_csvs/' + x for x in tables_only] #May 4 edit
csv_list_path[0:4]
len(csv_list_path)
# csvs_tables = [ x for x in csvs if "nan" not in x ]
# csvs_tables =  [x.split('/', 5)[-1] for x in csvs_tables]
# csv_list_path = ['F:/Environmental Baseline Data/Version 4 - Final/CSV_final/' + x for x in csvs_tables]
# csv_list_path = [x.replace('.0', '') for x in csv_list_path]

['F:/Environmental Baseline Data/Version 4 - Final/all_csvs/1059614_14_lattice-v_1.csv',
 'F:/Environmental Baseline Data/Version 4 - Final/all_csvs/1059614_17_lattice-v_1.csv',
 'F:/Environmental Baseline Data/Version 4 - Final/all_csvs/1059614_18_lattice-v_1.csv',
 'F:/Environmental Baseline Data/Version 4 - Final/all_csvs/1059614_19_lattice-v_1.csv']

28891

In [88]:
%%time 
#Wall time:  3min 41s
#go through each CSV file and extract shape, and text; save to dict_pandas

exceptions_files = []
dict_pandas = []

for x in csv_list_path:
    try:
#         print(x)
        df = pd.read_csv(x)
        shape = df.shape
        first_row = str(list(df.iloc[:,0]))
        columns = list(df.columns)       
        onestring = str(df.values) #df = df.astype(str)
        dictionary = {'csv_path' : x, 'shape' : (shape), 'columns_index' : columns, 'row_index' : first_row, 'text_df_all' : onestring }
        dict_pandas.append(dictionary) 
    except:
        exceptions_files.append(x)
len(dict_pandas)

Wall time: 3min 41s


28891

In [89]:
# %%time 
#Wall time: 1.39 s
#read dictionary
df_csv_all = pd.DataFrame(dict_pandas)
df_csv_all.columns
df_csv_all['csv_name'] = df_csv_all['csv_path'].str.split('/').str[4]
df_csv_all['contains_cids'] = np.where(df_csv_all['text_df_all'].str.contains("cid:", case=False, na=False), 'contains cids', 'no cids')          
df_csv_all['contains_cids'].value_counts()
df_csv_all_cids_only = df_csv_all[df_csv_all['contains_cids'] == 'contains cids']

df_csv_all_cids_only['DataID'] = df_csv_all_cids_only['csv_name'].str.extract('(^\d+)')

df_csv_all_cids_only['Table_number'] = df_csv_all_cids_only['csv_name'].str.extract('(_\d+_)')
df_csv_all_cids_only['Table_number'] = df_csv_all_cids_only['Table_number'].str.replace('_', '')

df_csv_all_cids_only['Page_number'] = df_csv_all_cids_only['csv_name'].str.extract('(\d+.csv)')
df_csv_all_cids_only['Page_number'] = df_csv_all_cids_only['Page_number'].str.replace('.csv', '')

# df_csv_all_cids_only.loc[:, 'DataID'] = df_csv_all_cids_only['csv_name'].str.extract('(^\d+)')

Index(['csv_path', 'shape', 'columns_index', 'row_index', 'text_df_all'], dtype='object')

no cids          28146
contains cids      745
Name: contains_cids, dtype: int64

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the 

In [90]:
for_cids = tables_figures_regdocs[['DataID', 'Hearing order', 'short_name', 'zip_folder_url', 'Download folder name']]
for_cids = for_cids.drop_duplicates()

df_csv_all_cids_only_all = df_csv_all_cids_only.merge(for_cids, on = 'DataID')

df_csv_all_cids_only_all.to_excel('F:/Environmental Baseline Data/Version 4 - Final/Indices/Index3_full_cids_only_May4.xlsx', encoding = 'utf_8_sig')
df_csv_all_cids_only_all.to_csv('F:/Environmental Baseline Data/Version 4 - Final/Indices/Index3_full_cids_only_May4.csv', encoding = 'utf_8_sig')

df_csv_all_cids_only_all.shape
df_csv_all_cids_only_all.head()

df_csv_all_cids_only_all.to_excel('test.xlsx')

(745, 14)

Unnamed: 0,csv_path,shape,columns_index,row_index,text_df_all,csv_name,contains_cids,DataID,Table_number,Page_number,Hearing order,short_name,zip_folder_url,Download folder name
0,F:/Environmental Baseline Data/Version 4 - Fin...,"(10, 2)","[PROPONENT INFORMATION, Unnamed: 1]",['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERRI...,[['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERR...,1059875_118_lattice-v_1.csv,contains cids,1059875,118,1,GH-001-2014,North Montney,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn
1,F:/Environmental Baseline Data/Version 4 - Fin...,"(10, 2)","[PROPONENT INFORMATION, Unnamed: 1]",['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERRI...,[['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERR...,1059875_122_lattice-v_1.csv,contains cids,1059875,122,1,GH-001-2014,North Montney,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn
2,F:/Environmental Baseline Data/Version 4 - Fin...,"(10, 2)","[PROPONENT INFORMATION, Unnamed: 1]",['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERRI...,[['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERR...,1059875_127_lattice-v_1.csv,contains cids,1059875,127,1,GH-001-2014,North Montney,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn
3,F:/Environmental Baseline Data/Version 4 - Fin...,"(10, 2)","[PROPONENT INFORMATION, Unnamed: 1]",['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERRI...,[['NAME:STREET ADDRESS:CITY/TOWN:PROVINCE/TERR...,1059875_131_lattice-v_1.csv,contains cids,1059875,131,1,GH-001-2014,North Montney,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn
4,F:/Environmental Baseline Data/Version 4 - Fin...,"(12, 10)","[Common Name, Scientific Name, Conservation St...","['Common Name', 'SONGBIRDS, CORVIDS, GROUSE AN...",[['Common Name' 'Scientific Name' 'Alberta Pro...,2392725_68_lattice-v_1.csv,contains cids,2392725,68,1,OH-001-2014,Trans Mountain Expansion,http://www.cer-rec.gc.ca/esa-ees/tmx.zip,tmx


In [91]:
df_csv_all_cids_only_all.shape
# df_csv_all_cids_only_all.dtypes
# df_csv_all_cids_only_all.head()
# df_csv_all_cids_only_all.drop_duplicates()
# df_csv_all_cids_only_all.shape

df_csv_all_cids_only_all['short_name'].value_counts()

(745, 14)

Eastern Mainline            414
Brunswick                   141
Mackenzie Gas               103
Alberta Clipper              31
Trans Mountain Expansion     29
Northwest Mainline            6
Keystone                      5
Line 4                        4
North Montney                 4
Edmonton to Hardisty          3
Komie North                   2
Keystone XL                   1
South Peace                   1
Horn River                    1
Name: short_name, dtype: int64