In [258]:
import pandas as pd
import os
import glob
import shutil
from time import gmtime, strftime

In [261]:
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 255)

In [262]:
# filepath to the English and French index files
ENG_index_filepath = 'F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_website_ENG.csv'
FRA_index_filepath = 'F:/Environmental Baseline Data/Version 4 - Final/Indices/ESA_website_FRA.csv'

In [263]:
# Loading index file of all tables
df = pd.read_csv(ENG_index_filepath)
df_FRA = pd.read_csv(FRA_index_filepath)

In [264]:
# Remove all rows for figures so that we are only moving tables
df = df[df['Content Type'] == 'Table']
df_FRA = df_FRA[df_FRA['Type de contenu'] == 'Tableau']

In [265]:
# Dropping old index column to create new one
df.drop(columns=['Unnamed: 0'], inplace=True)
df = df.reset_index()
df.rename(columns = {"index": "Index"}, inplace = True) 

df_FRA.drop(columns=['Unnamed: 0'], inplace=True)
df_FRA = df_FRA.reset_index()
df_FRA.rename(columns = {"Index": "Indice"}, inplace = True)

In [269]:
# Creating the names of each csv file
df['filename'] = df['Download folder name'] + '-' + df['Title'].str.lower().str.replace('(', '').str.replace(')', '').str.replace(' ', '-').str.replace('.', '-').str.replace('[^\w+-]', '').str.slice(0,80)

df_FRA['nom_du_fichier'] = df_FRA['Télécharger le nom du dossier'] + '-' + df_FRA['Titre'].str.lower().str.replace('(', '').str.replace(')', '').str.replace(' ', '-').str.replace('.', '-').str.replace('[^\w+-]', '').str.slice(0,80)

In [270]:
# Creating a column with the old filename so that we can rename the files
old_filename_df = df['CSV Download URL'].str.split('/').str[-1].str.split('_')
df['old_filename'] = old_filename_df.str[0] + '_' + old_filename_df.str[1] + '_lattice-v_' + old_filename_df.str[2]

vieux_nom_du_fichier_df = df_FRA['URL de téléchargement CSV'].str.split('/').str[-1].str.split('_')
df_FRA['vieux_nom_de_fichier'] = vieux_nom_du_fichier_df.str[0] + '_' + vieux_nom_du_fichier_df.str[1] + '_lattice-v_' + vieux_nom_du_fichier_df.str[2]

In [271]:
%%time
# We add a counter for all CSVs connected to the same table
# For the English index file
prev_title = ''
for index, row in df.iterrows():
    current_title = row['filename']
    if current_title == prev_title:
        current_title = current_title + '-' + 'pt' + str(i)
        i += 1
    else:
        i = 1
        current_title = current_title + '-' + 'pt' + str(i)
    
    df.loc[index, 'filename'] = current_title
    df.loc[index, 'CSV Download URL'] = os.path.join('http://www.cer-rec.gc.ca/esa-ees/', row['Download folder name'] + '/' + current_title + '.csv')
    prev_title = row['filename']

Wall time: 2min 15s


In [272]:
df.head(5)

Unnamed: 0,Title,Content Type,Application Name,Application Short Name,Application Filing Date,Company Name,Commodity,File Name,ESA Folder URL,Document Number,Data ID,PDF Download URL,Application Type (NEB Act),Pipeline Location,Hearing order,Consultant Name,Pipeline Status,Regulatory Instrument(s),Application URL,Decision URL,ESA Section(s),ESA Section(s) Index,ESA Section(s) Topics,CSV Download URL,PDF Page Number,PDF Page Count,PDF Size,PDF Outline,Download folder name,Zipped Project Link,filename,old_filename
0,TABLE 3 SUMMARY OF AQUATICS FIELD WORK AND ABORIGINAL FIELD STUDY PARTICIPATION FOR THE PROJECT,Table,Application for North Montney Project,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Large Projects (over 40 km),British Columbia,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",Operating,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Water,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-3-summary-of-aquatics-field-work-and-aboriginal-field-study-participation-for-the-project-pt-1-pg-14-doc-num-A3Q6H2.csv,14,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-3-summary-of-aquatics-field-work-and-aboriginal-field-study-participation-for-the-project-pt-1-pg-14-doc-num-A3Q6H2,1059614_14_lattice-v_1.csv
1,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Table,Application for North Montney Project,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Large Projects (over 40 km),British Columbia,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",Operating,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Water,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-1-pg-17-doc-num-A3Q6H2.csv,17,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-1-pg-17-doc-num-A3Q6H2,1059614_17_lattice-v_1.csv
2,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Table,Application for North Montney Project,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Large Projects (over 40 km),British Columbia,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",Operating,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Water,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-2-pg-18-doc-num-A3Q6H2.csv,18,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-2-pg-18-doc-num-A3Q6H2,1059614_18_lattice-v_1.csv
3,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Table,Application for North Montney Project,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Large Projects (over 40 km),British Columbia,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",Operating,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Water,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-3-pg-19-doc-num-A3Q6H2.csv,19,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-3-pg-19-doc-num-A3Q6H2,1059614_19_lattice-v_1.csv
4,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Table,Application for North Montney Project,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gas,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Large Projects (over 40 km),British Columbia,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",Operating,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Water,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-4-pg-20-doc-num-A3Q6H2.csv,20,48.0,5.87,No,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-crossings-along-the-north-montney-mainline-aitken-creek-section-pt-4-pg-20-doc-num-A3Q6H2,1059614_20_lattice-v_1.csv


In [106]:
%%time
# For French index file
prev_title = ''
for index, row in df_FRA.iterrows():
    current_title = row['nom_du_fichier']
    if current_title == prev_title:
        current_title = current_title + '-' + 'pt' + str(i)
        i += 1
    else:
        i = 1
        current_title = current_title + '-' + 'pt' + str(i)
    
    df_FRA.loc[index, 'nom_du_fichier'] = current_title
    df_FRA.loc[index, 'URL de téléchargement CSV'] = os.path.join('http://www.cer-rec.gc.ca/esa-ees/', row['Télécharger le nom du dossier'] + '/' + current_title + '.csv')
    prev_title = row['nom_du_fichier']

Wall time: 2min 1s


In [107]:
df_FRA.head(5)

Unnamed: 0,index,Titre,Type de contenu,Nom de la demande,Nom abrégé de la demande,Dépôt de la demande,Nom de la société,Produit de base,Nom de fichier,URL du dossier de l’ÉES,Numéro de document,Identificateur de données,URL de téléchargement PDF,Type de demande (Loi sur l’Office national de l’énergie),Emplacement du pipeline,Ordonnance d’audience,Nom du consultant,État d'avancement,Instruments réglementaires,URL de la demande,URL de la décision,Sections de l’EES,Index des sections de l’ÉES,Sujets des sections de l’ÉES,URL de téléchargement CSV,Numéro de page PDF,Nombre de pages PDF,Taille PDF,Aperçu PDF,Télécharger le nom du dossier,Lien vers le projet compressé,nom_du_fichier,vieux_nom_de_fichier
0,9134,TABLE 3 SUMMARY OF AQUATICS FIELD WORK AND ABORIGINAL FIELD STUDY PARTICIPATION FOR THE PROJECT,Tableau,Demande visant le projet North Montney,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gaz,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Projets de grande envergure (plus de 40 km),Colombie britannique,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",En exploitation,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/%C3%89l%C3%A9ment/Afficher/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Eau,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-3-summary-of-aquatics-fi-pg14-pt1.csv,14,48.0,5.87,Non,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-3-summary-of-aquatics-fi-pg14-pt1,1059614_14_lattice-v_1.csv
1,9135,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Tableau,Demande visant le projet North Montney,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gaz,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Projets de grande envergure (plus de 40 km),Colombie britannique,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",En exploitation,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/%C3%89l%C3%A9ment/Afficher/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Eau,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-pg17-pt1.csv,17,48.0,5.87,Non,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-pg17-pt1,1059614_17_lattice-v_1.csv
2,9136,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Tableau,Demande visant le projet North Montney,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gaz,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Projets de grande envergure (plus de 40 km),Colombie britannique,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",En exploitation,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/%C3%89l%C3%A9ment/Afficher/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Eau,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-pg18-pt2.csv,18,48.0,5.87,Non,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-pg18-pt2,1059614_18_lattice-v_1.csv
3,9137,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Tableau,Demande visant le projet North Montney,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gaz,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Projets de grande envergure (plus de 40 km),Colombie britannique,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",En exploitation,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/%C3%89l%C3%A9ment/Afficher/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Eau,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-pg19-pt3.csv,19,48.0,5.87,Non,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-pg19-pt3,1059614_19_lattice-v_1.csv
4,9138,TABLE 4 SUMMARY OF WATERCOURSE CROSSINGS ALONG THE NORTH MONTNEY MAINLINE (AITKEN CREEK SECTION),Tableau,Demande visant le projet North Montney,North Montney,2013-11-08,NOVA Gas Transmission Ltd.,Gaz,B2-16 ESA_Appendix_G_Part1of4 (A3Q6H2),https://apps.cer-rec.gc.ca/REGDOCS/Item/LoadResult/1060040,A3Q6H2,1059614,https://apps.cer-rec.gc.ca/REGDOCS/File/Download/1059614,Projets de grande envergure (plus de 40 km),Colombie britannique,GH-001-2014,"Stantec Consulting Ltd., TERA Environmental Consultants",En exploitation,GC-125,https://apps.cer-rec.gc.ca/REGDOCS/Item/View/1060220,https://apps.cer-rec.gc.ca/REGDOCS/%C3%89l%C3%A9ment/Afficher/3890551,Appendix G: TERA Aquatics Summary Report,15.0,Eau,http://www.cer-rec.gc.ca/esa-ees/nrthmntn/nrthmntn-table-4-summary-of-watercourse-pg20-pt4.csv,20,48.0,5.87,Non,nrthmntn,http://www.cer-rec.gc.ca/esa-ees/nrthmntn.zip,nrthmntn-table-4-summary-of-watercourse-pg20-pt4,1059614_20_lattice-v_1.csv


In [6]:
# Making sure there are no duplicates in English filenames
assert len(df) - len(df['filename'].unique()) == 0, "Should be 0."

In [None]:
# Making sure there are no duplicates in French filenames
assert len(df_FRA) - len(df_FRA['nom_du_fichier'].unique()) == 0, "Should be 0."

In [71]:
# Adding an index ID to each file to avoid duplicates
df['filename'] = df['filename'] + '-' + 'no' + df['Index'].astype(str) + '.csv'
df_FRA['nom_du_fichier'] = df_FRA['nom_du_fichier'] + '-' + 'no' + df_FRA['Indice'].astype(str) + '.csv'

In [294]:
# Where the CSVs are located
csv_folder_path_ENG = 'F:/Environmental Baseline Data/Version 4 - Final/all_csvs_cleaned_latest_ENG/'
csv_folder_path_FRA = 'F:/Environmental Baseline Data/Version 4 - Final/all_csvs_cleaned_latest_FRA/'

In [299]:
%%time
# English CSVs
# If the final file has been renamed, it will skip the renaming loop
os.chdir(csv_folder_path_ENG)
if os.path.isfile(df['old_filename'].iloc[-1]):
  #loop through the name and rename
    for index, row in df.iterrows():
        if os.path.isfile(row['old_filename']):
            shutil.move(row['old_filename'], row['filename'])  

yes
Wall time: 15.6 ms


In [None]:
%%time
# French CSVs
# If the final file has been renamed, it will skip the renaming loop
os.chdir(csv_folder_path_FRA)
if os.path.isfile(df['old_filename'].iloc[-1]):
  #loop through the name and rename
    for index, row in df.iterrows():
        if os.path.isfile(row['old_filename']):
            shutil.move(row['old_filename'], row['filename']) 

In [283]:
# Updating base path to Indices folder to save index files
os.chdir('F:/Environmental Baseline Data/Version 4 - Final/Indices/')

In [284]:
# Saving index files
df.to_csv('ESA_website_ENG_' + strftime("%Y_%m_%d", gmtime()) + '.csv')
df_FRA.to_csv('ESA_website_FRA_' + strftime("%Y_%m_%d", gmtime()) + '.csv', encoding='ISO-8859-1')