# Digital Transformation Advisory

## 01 - Document Collection

Purpose: to download a defined set of documents

In [None]:
#'''
# **************************************************************************************************************** #
#*****************************************  IDB - AUG Data Analytics  ******************************************** #
# **************************************************************************************************************** #
#
#-- Notebook Number: 01 - Document Collection
#-- Title: Digital Transformation Advisory
#-- Audit Segment: 
#-- Continuous Auditing: Yes
#-- System(s): Documents stored at IDBDocs, IDB SharePoint & ezShare
#-- Description: Download to a local folder all the selected documents:
#                - TCs 'Approval Registry' (excluded on 06/18)
#                - TCs 'Approval Documents' and 'NULL' URL ones
#                - Loans
#                - Grants
#                                
#
#-- @author:  Emiliano Colina <emilianoco@iadb.org>
#-- Version:  0.7
#-- Last Update: 07/15/2020
#-- Last Revision Date: 07/15/2020 - Emiliano Colina <emilianoco@iadb.org> 
#                                    

# **************************************************************************************************************** #
#'''

In [None]:
# Widen the notebook container to 90% of the browser window.
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

#### Environment Setup

In [None]:
import os
import pandas as pd

In [None]:
import time
import datetime

In [None]:
import requests
import re

In [None]:
import numpy as np

In [None]:
# Working directory: base folder plus the project sub-folder
main_dir = "C:\\Users\\emilianoco\\Desktop\\2020"
data_dir = "/Digital_Transformation"


# switch to the project folder, then report the folder actually in use
os.chdir("".join((main_dir, data_dir)))
print('Working folder set to: {}'.format(os.getcwd()))

# **************************************************************************************************************** #

#### Functions

In [None]:
#######       
# Matches a trailing file extension made of three or four ASCII letters (e.g. '.pdf', '.docx').
# Raw string: '\.' inside a plain string literal is an invalid escape sequence.
file_extension = re.compile(r'\.[a-zA-Z]{3}[a-zA-Z]?$')  # regular expression corrected

In [None]:
def url_check(url):
    '''
    Description: Checks how to connect to a sharepoint/ezshare or idbdocs repository to download a file,
    and returns the url adjusted for that connection.
    (based on the pcr_validate() function)

    Parameters:
        url (str): document url; leading/trailing white space is ignored.

    Returns:
        tuple (method, url) where method is one of:
        - 'connect_twice': idbdocs/www.iadb.org url that references an EZSHARE document
        - 'idbdocs_directly': any other idbdocs/ezws url
        - 'sharepoint_directly': sharepoint url, already rewritten by sharepoint_adjust()
        - 'unknown': none of the above; the stripped url is returned unchanged

    @author: emilianoco
    Version:
        - v0.2 - scheme/host rewrites limited to the url prefix; a tuple is always returned
        - v0.1 - initial version (07/07/2020)
    '''

    url = url.strip() # remove leading/trailing white spaces

    if ('idbdocs' in url) or ('ezws' in url):

        # protocol adjustment: only the scheme is rewritten, so an 'http' that appears
        # later in the url (e.g. inside the query string) is left alone
        if url.startswith('http:'):
            url = 'https:' + url[len('http:'):]

        # host adjustment: checked after the scheme rewrite so that 'http://ezws...'
        # urls are also moved to the idbdocs host
        if url.startswith('https://ezws'):
            url = 'https://idbdocs' + url[len('https://ezws'):]

        if 'EZSHARE' in url:
            # Connect once using the cookie_2 (from idbdocs) to get the last url (in sharepoint), and then
            # connect using the sharepoint cookie
            return ('connect_twice', url)

        # connect using the cookie_2 (from idbdocs)
        return ('idbdocs_directly', url)

    if 'www.iadb.org' in url:
        if 'EZSHARE' in url:
            url = url.replace('https://www.iadb.org/Document.cfm?id=', 'https://idbdocs.iadb.org/wsdocs/getdocument.aspx?docnum=')
            return ('connect_twice', url)

    elif 'sharepoint' in url:
        return ('sharepoint_directly', sharepoint_adjust(url))

    # no known repository pattern matched: still return a tuple so that callers
    # unpacking the result do not fail on None
    return ('unknown', url)

In [None]:
def sharepoint_adjust(original):
    '''
    Description: Rewrites a sharepoint viewer url into the url that downloads the file it points to.

    Steps:
        - curly braces are percent-encoded ('{' -> '%7B', '}' -> '%7D')
        - everything from the first '%7D' onwards is discarded
        - 'Doc.aspx?sourcedoc=%7B' / 'WopiFrame.aspx?sourcedoc=%7B' become 'download.aspx?UniqueId='

    @author: camilode; emilianoco

    Version:
        - v0.3 - added control for trailing parameters in URL (07/04/2020)
        - v0.2 - Added control for url with path 'WopiFrame.aspx'
        - v0.1 - (01/09/2020)
    '''
    encoded = original.replace('{', '%7B').replace('}', '%7D')

    # keep only the text before the closing brace; partition() hands back the
    # whole string when there is no closing brace
    adjusted = encoded.partition('%7D')[0]

    for viewer_page in ('Doc.aspx', 'WopiFrame.aspx'):
        adjusted = adjusted.replace(viewer_page + '?sourcedoc=%7B', 'download.aspx?UniqueId=')

    return adjusted


In [None]:
def check_content(req):
    '''
    Description: Classifies a response by looking for known messages/errors in its content.

    Returns 'not found', 'access denied' or 'content undefined'.

    @ author: emilianoco
    Version:
        - v0.1 - initial version - (07/07/2020)
    '''
    body = str(req.content)

    if 'could not be found in Sharepoint EzShare' in body:
        return 'not found'

    denied_markers = ('AccessDenied.aspx', 'does not have permissions to access this resource')
    if any(marker in body for marker in denied_markers):
        return 'access denied'

    return 'content undefined'

In [None]:
def download_file(file_dir, req, name_prefix=''):
    '''
    Description: Saves to 'file_dir' the file under 'req' as 'file_name', obtained either from the URL or
    the HTTP response. Optional parameter added to insert a prefix to the file_name.

    The function also controls 'file_dir' + 'file_name' lengths to avoid OS constraints.

    To control duplicates and not overwrite already downloaded files, the function adds a counter
    before the extension ('name_1.ext', 'name_2.ext', ...) if the 'file_name' is already present.

    Parameters:
        file_dir (str): existing destination folder.
        req: HTTP response; its headers, request.url, url and content are read.
        name_prefix (str): optional text prepended to the file name, followed by '_'.

    Returns:
        str: the file name (without folder) the content was saved under.

    Note: when the name comes from a '&file=' url parameter the document is requested again
    through sharepoint_adjust() using the module-level h_sharepoint headers.

    @ author: emilianoco

    Version:
        - v0.5 - duplicate counter keeps dots inside the name and supports '%' in names;
                 paths built with os.path.join; undecodable header names kept as received
        - v0.4 - optional parameter (07/07/2020)
        - v0.3 - (06/17/2020)
        - v0.2 - (06/16/2020)
        - v0.1 - (01/02/2020)
    '''
    # urllib.parse.unquote is the function requests.utils re-exports as 'unquote'
    from urllib.parse import unquote

    disposition = req.headers.get('Content-Disposition')

    if disposition is None:
        # file_name not in 'Content-Disposition' but in url - usually sharepoint
        last_segment = unquote(req.request.url.split('/')[-1], encoding='utf-8', errors='replace')

        if '&file=' in last_segment:
            # case where the file_name is defined in parameter &file, usually a 'docx' file
            file_name_orig = last_segment.split('&file=')[-1].split('&')[0]

            # the request url needs to be re-written and a new connection is required:
            req = requests.get(sharepoint_adjust(req.url), headers = h_sharepoint, allow_redirects = True)

        else:
            file_name_orig = last_segment
    else:
        # file_name extracted from the header - usually idbdocs
        raw_name = disposition.split('filename=')[-1]
        try:
            # header text is handed over decoded as latin-1; recover names sent as UTF-8
            raw_name = raw_name.encode('latin-1').decode('utf-8')
        except UnicodeError:
            # not UTF-8 underneath: keep the name exactly as received
            pass
        file_name_orig = unquote(raw_name).replace('"', '')

    # Set name_prefix (v0.4)
    if name_prefix != '':
        name_prefix = name_prefix + '_'

    # Check file_name length (v0.4): keep the first 180 and the last 20 characters
    if len(file_dir + name_prefix + file_name_orig) > 240:
        file_name = name_prefix + file_name_orig[0:180] + '~' + file_name_orig[-20:]
    else:
        file_name = name_prefix + file_name_orig

    # Check if file_name already present in destination folder; if so, look for the
    # first free 'name_<n>.ext'. splitext() only detaches the last extension, so dots
    # inside the name survive, and no '%' formatting is applied to the name itself.
    target = os.path.join(file_dir, file_name)
    if os.path.exists(target):
        stem, extension = os.path.splitext(file_name)
        counter = 1
        while True:
            file_name = '{}_{}{}'.format(stem, counter, extension)
            target = os.path.join(file_dir, file_name)
            if not os.path.exists(target):
                break
            counter += 1

    # Save the file
    with open(target, 'wb') as f:
        f.write(req.content)
    print('Downloaded: ' + file_name)
    return file_name

# **************************************************************************************************************** #

#### Headers configuration 

In [None]:
### The following variables must be set! ###
# Placeholders for the cookie strings that are sent as the 'cookie' header by
# h_idbdocs and h_sharepoint; replace them with valid values before running the downloads.
# NOTE(review): these look like session credentials - avoid saving real values with the notebook.
cookie_idbdocs = 'XXXXXX'    ## <----
cookie_sharepoint = 'YYYYYY' ## <----

In [None]:
# idbdocs
# Headers for the requests that authenticate with the idbdocs cookie.
h_idbdocs = {
    # NOTE(review): 'method' and 'scheme' look like HTTP/2 pseudo-headers copied from the
    # browser dev tools; here they are sent as ordinary headers - confirm they are needed.
    'method': 'GET',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'cookie': cookie_idbdocs,
    # 'br' is not advertised: brotli bodies are only decoded when an optional brotli
    # package is installed, otherwise the raw compressed bytes would be saved to disk.
    'accept-encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', #Chrome
    }

In [None]:
# sharepoint
# Headers for the requests that authenticate with the sharepoint cookie.
h_sharepoint = {
    # NOTE(review): 'method' and 'scheme' look like HTTP/2 pseudo-headers copied from the
    # browser dev tools; here they are sent as ordinary headers - confirm they are needed.
    'method': 'GET',
    'scheme': 'https',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'cookie': cookie_sharepoint,
    # 'br' is not advertised: brotli bodies are only decoded when an optional brotli
    # package is installed, otherwise the raw compressed bytes would be saved to disk.
    'accept-encoding': 'gzip, deflate',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36', #Chrome
    }

# **************************************************************************************************************** #

In [None]:
## cookies & headers clean-up:
#del cookie_idbdocs
#del cookie_sharepoint
#del h_idbdocs
#del h_sharepoint

In [None]:
# **************************************************************************************************************** #

# TCs

#### Data Preparation & Exploration

In [None]:
# Load source xlsx file (sheet 'data_filtered').
# 'encoding' is not a read_excel parameter (an .xlsx workbook carries its own encoding)
# and current pandas releases reject it with a TypeError, so it is not passed.
data_ = pd.read_excel(r"./input/Data-01 July 2020.xlsx", sheet_name='data_filtered')

In [None]:
data_.head()

In [None]:
data_.shape

In [None]:
data_.columns

In [None]:
# Keep only the columns used by the collection process (independent copy of the source frame)
data = data_.loc[:, [
    'FK_OPERATION_ID',
    'OPERATION_NUMBER',
    'DOCUMENT_ID',
    'DOCUMENT_REFERENCE',
    'DESCRIPTION',
    'CREATED',
    'CREATED_BY',
    'MODIFIED',
    'MODIFIED_BY',
    'PROJECT_NUMBER',
    'APPROVAL_NUMBER',
    'FUND',
    'FK_COUNTRY_ID',
    'FK_SUB_SECTOR_ID',
    'DOCUMENT_NAME',
    'URL',
]].copy()

In [None]:
# **************************************************************************************************************** #

In [None]:
# **************************************************************************************************************** #

In [None]:
# **************************************************************************************************************** #

### Approval Documents Collection

In [None]:
# We will work with the filtered dataframe, and we'll add additional columns for storing the results:
df = data.copy()

In [None]:
# for storing the document's name and its status, i.e.: 'downloaded', exception message 
df['Document_Name'] = '' #
df['Document_Status'] = '' #

In [None]:
df['Document_URL'] = ''

In [None]:
df.head()

In [None]:
# Destination folder setup: 'file_dir' is the 'Approvals' folder on the desktop,
# where all files will be downloaded

desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir = "\\".join([desktop_dir, "Approvals"])

print(file_dir)

### Approval Documents collection:

In [None]:
file_dir

In [None]:
%%time

# Download the document behind every row of df and record the outcome in the
# 'Document_Name', 'Document_Status' and 'Document_URL' columns.
# Files go to 'file_dir', the destination folder configured above.
for t, index in enumerate(df.index, start=1):   # t counts the rows processed so far
    print('## processing index', str(index))

    checked_message, checked_url = url_check(df['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    # .get(): a response without a Content-Type header is handled as "not a file"
    if 'application/' in r.headers.get('Content-Type', ''):
        # the first response already carries the file
        print('download document')
        file_name = download_file(file_dir, r, df['OPERATION_NUMBER'][index])
        df.at[index, 'Document_Name'] = file_name
        df.at[index, 'Document_Status'] = 'OK - direct download'
        df.at[index, 'Document_URL'] = r.url

    else:
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df.at[index, 'Document_Name'] = 'not downloaded'
            df.at[index, 'Document_Status'] = status

        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for hop in r.history:      # cross-site authentication control
                hop_url = hop.request.url

                # candidates are hops whose url ends in a file extension (optionally
                # followed by ?d=...) and that are not the authentication page
                ends_in_file = re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$', hop_url.lower())
                if not ends_in_file or 'Authenticate.aspx' in str(hop_url):
                    continue

                location = hop.url
                print(location)

                if 'sharepoint' not in location:
                    print('\'sharepoint\' not found in url', location)
                    continue

                # connect using sharepoint headers and cookie
                s = requests.get(location, headers = h_sharepoint, allow_redirects = True)

                status = check_content(s)
                if status in ['access denied','not found']:
                    print('***')
                    print(status)
                    print('save result and break')
                    df.at[index, 'Document_Name'] = 'not downloaded'
                    df.at[index, 'Document_Status'] = status
                    print('***')
                    # NOTE(review): the message says "break" but the remaining hops are
                    # still examined, as before - confirm which one is intended
                    continue

                print('try downloading from', location)
                try:
                    file_name = download_file(file_dir, s, df['OPERATION_NUMBER'][index])             # download the document and get the filename
                    df.at[index, 'Document_Name'] = file_name
                    df.at[index, 'Document_Status'] = 'OK - download from redirect'
                    df.at[index, 'Document_URL'] = s.url
                    print('Downloaded!')
                    break

                except Exception as e:
                    # keep the error text as the row status and try the next hop
                    df.at[index, 'Document_Name'] = 'Not downloaded'
                    df.at[index, 'Document_Status'] = str(e)
                    print('Not downloaded: '+ str(e))

    print('##')
    print('')
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
# index labels of the rows whose status is still empty
pending_1 = df.loc[df['Document_Status'] == ''].index

In [None]:
# **************************************************************************************************************** #

In [None]:
%%time

# continuation of previous run, using pending_1 as index list
# Outcomes are recorded in df ('Document_Name', 'Document_Status', 'Document_URL');
# files go to 'file_dir', the destination folder configured above.
for t, index in enumerate(pending_1, start=1):   # t counts the rows processed so far
    print('## processing index', str(index))

    checked_message, checked_url = url_check(df['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    # .get(): a response without a Content-Type header is handled as "not a file"
    if 'application/' in r.headers.get('Content-Type', ''):
        # the first response already carries the file
        print('download document')
        file_name = download_file(file_dir, r, df['OPERATION_NUMBER'][index])
        df.at[index, 'Document_Name'] = file_name
        df.at[index, 'Document_Status'] = 'OK - direct download'
        df.at[index, 'Document_URL'] = r.url

    else:
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df.at[index, 'Document_Name'] = 'not downloaded'
            df.at[index, 'Document_Status'] = status

        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for hop in r.history:      # cross-site authentication control
                hop_url = hop.request.url

                # candidates are hops whose url ends in a file extension (optionally
                # followed by ?d=...) and that are not the authentication page
                ends_in_file = re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$', hop_url.lower())
                if not ends_in_file or 'Authenticate.aspx' in str(hop_url):
                    continue

                location = hop.url
                print(location)

                if 'sharepoint' not in location:
                    print('\'sharepoint\' not found in url', location)
                    continue

                # connect using sharepoint headers and cookie
                s = requests.get(location, headers = h_sharepoint, allow_redirects = True)

                status = check_content(s)
                if status in ['access denied','not found']:
                    print('***')
                    print(status)
                    print('save result and break')
                    df.at[index, 'Document_Name'] = 'not downloaded'
                    df.at[index, 'Document_Status'] = status
                    print('***')
                    # NOTE(review): the message says "break" but the remaining hops are
                    # still examined, as before - confirm which one is intended
                    continue

                print('try downloading from', location)
                try:
                    file_name = download_file(file_dir, s, df['OPERATION_NUMBER'][index])             # download the document and get the filename
                    df.at[index, 'Document_Name'] = file_name
                    df.at[index, 'Document_Status'] = 'OK - download from redirect'
                    df.at[index, 'Document_URL'] = s.url
                    print('Downloaded!')
                    break

                except Exception as e:
                    # keep the error text as the row status and try the next hop
                    df.at[index, 'Document_Name'] = 'Not downloaded'
                    df.at[index, 'Document_Status'] = str(e)
                    print('Not downloaded: '+ str(e))

    print('##')
    print('')
    if (t % 5) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 5 docs
        print('')
    print('')

In [None]:
# **************************************************************************************************************** #

In [None]:
df[480:485]

In [None]:
# index labels of the rows whose status is still empty
pending_2 = df.loc[df['Document_Status'] == ''].index

In [None]:
%%time

# continuation of previous run, using pending_2 as index list
# Outcomes are recorded in df ('Document_Name', 'Document_Status', 'Document_URL');
# files go to 'file_dir', the destination folder configured above.
for t, index in enumerate(pending_2, start=1):   # t counts the rows processed so far
    print('## processing index', str(index))

    checked_message, checked_url = url_check(df['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    # .get(): a response without a Content-Type header is handled as "not a file"
    if 'application/' in r.headers.get('Content-Type', ''):
        # the first response already carries the file
        print('download document')
        file_name = download_file(file_dir, r, df['OPERATION_NUMBER'][index])
        df.at[index, 'Document_Name'] = file_name
        df.at[index, 'Document_Status'] = 'OK - direct download'
        df.at[index, 'Document_URL'] = r.url

    else:
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df.at[index, 'Document_Name'] = 'not downloaded'
            df.at[index, 'Document_Status'] = status

        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for hop in r.history:      # cross-site authentication control
                hop_url = hop.request.url

                # candidates are hops whose url ends in a file extension (optionally
                # followed by ?d=...) and that are not the authentication page
                ends_in_file = re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$', hop_url.lower())
                if not ends_in_file or 'Authenticate.aspx' in str(hop_url):
                    continue

                location = hop.url
                print(location)

                if 'sharepoint' not in location:
                    print('\'sharepoint\' not found in url', location)
                    continue

                # connect using sharepoint headers and cookie
                s = requests.get(location, headers = h_sharepoint, allow_redirects = True)

                status = check_content(s)
                if status in ['access denied','not found']:
                    print('***')
                    print(status)
                    print('save result and break')
                    df.at[index, 'Document_Name'] = 'not downloaded'
                    df.at[index, 'Document_Status'] = status
                    print('***')
                    # NOTE(review): the message says "break" but the remaining hops are
                    # still examined, as before - confirm which one is intended
                    continue

                print('try downloading from', location)
                try:
                    file_name = download_file(file_dir, s, df['OPERATION_NUMBER'][index])             # download the document and get the filename
                    df.at[index, 'Document_Name'] = file_name
                    df.at[index, 'Document_Status'] = 'OK - download from redirect'
                    df.at[index, 'Document_URL'] = s.url
                    print('Downloaded!')
                    break

                except Exception as e:
                    # keep the error text as the row status and try the next hop
                    df.at[index, 'Document_Name'] = 'Not downloaded'
                    df.at[index, 'Document_Status'] = str(e)
                    print('Not downloaded: '+ str(e))

    print('##')
    print('')
    if (t % 5) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 5 docs
        print('')
    print('')

In [None]:
# **************************************************************************************************************** #

In [None]:
df[1010:1020]

In [None]:
# index labels of the rows whose status is still empty
pending_3 = df.loc[df['Document_Status'] == ''].index

In [None]:
%%time

# continuation of previous run, using pending_3 as index list
# Outcomes are recorded in df ('Document_Name', 'Document_Status', 'Document_URL');
# files go to 'file_dir', the destination folder configured above.
for t, index in enumerate(pending_3, start=1):   # t counts the rows processed so far
    print('## processing index', str(index))

    checked_message, checked_url = url_check(df['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    # .get(): a response without a Content-Type header is handled as "not a file"
    if 'application/' in r.headers.get('Content-Type', ''):
        # the first response already carries the file
        print('download document')
        file_name = download_file(file_dir, r, df['OPERATION_NUMBER'][index])
        df.at[index, 'Document_Name'] = file_name
        df.at[index, 'Document_Status'] = 'OK - direct download'
        df.at[index, 'Document_URL'] = r.url

    else:
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df.at[index, 'Document_Name'] = 'not downloaded'
            df.at[index, 'Document_Status'] = status

        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for hop in r.history:      # cross-site authentication control
                hop_url = hop.request.url

                # candidates are hops whose url ends in a file extension (optionally
                # followed by ?d=...) and that are not the authentication page
                ends_in_file = re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$', hop_url.lower())
                if not ends_in_file or 'Authenticate.aspx' in str(hop_url):
                    continue

                location = hop.url
                print(location)

                if 'sharepoint' not in location:
                    print('\'sharepoint\' not found in url', location)
                    continue

                # connect using sharepoint headers and cookie
                s = requests.get(location, headers = h_sharepoint, allow_redirects = True)

                status = check_content(s)
                if status in ['access denied','not found']:
                    print('***')
                    print(status)
                    print('save result and break')
                    df.at[index, 'Document_Name'] = 'not downloaded'
                    df.at[index, 'Document_Status'] = status
                    print('***')
                    # NOTE(review): the message says "break" but the remaining hops are
                    # still examined, as before - confirm which one is intended
                    continue

                print('try downloading from', location)
                try:
                    file_name = download_file(file_dir, s, df['OPERATION_NUMBER'][index])             # download the document and get the filename
                    df.at[index, 'Document_Name'] = file_name
                    df.at[index, 'Document_Status'] = 'OK - download from redirect'
                    df.at[index, 'Document_URL'] = s.url
                    print('Downloaded!')
                    break

                except Exception as e:
                    # keep the error text as the row status and try the next hop
                    df.at[index, 'Document_Name'] = 'Not downloaded'
                    df.at[index, 'Document_Status'] = str(e)
                    print('Not downloaded: '+ str(e))

    print('##')
    print('')
    if (t % 5) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 5 docs
        print('')
    print('')

In [None]:
df.Document_Status.value_counts()

# ************** #

# Store results and reporting

In [None]:
import joblib

In [None]:
# Store results (bz2-compressed joblib file, compression level 3):
# v0.7 - 07/15: (loans)
joblib.dump(
    df_4,
    './output/Loans-Doc_Collection_2020-07-15_v07_.joblib.bz2',
    compress=('bz2', 3),
)

In [None]:
df_4.to_excel('Loans-Doc_Collection_2020-07-15_v07.xlsx')

#### ~ ~ ~ ####
<br>

In [None]:
# Store results (bz2-compressed joblib file, compression level 3):
# v0.6 - 07/14: (grants)
joblib.dump(
    df_3,
    './output/Grants_Approvals-Doc_Collection_2020-07-14_v06_.joblib.bz2',
    compress=('bz2', 3),
)

In [None]:
df_3.to_excel('Grants_Approvals-Doc_Collection_2020-07-14_v06.xlsx')

#### ~ ~ ~ ####
<br>

In [None]:
# Store results (bz2-compressed joblib file, compression level 3):
# v0.5 - 07/14: (TCs with null URL)
joblib.dump(
    df_2,
    './output/TCs_Approval-NULL_URL-Doc_Collection_2020-07-14_v05_.joblib.bz2',
    compress=('bz2', 3),
)

In [None]:
df_2.to_excel('TCs_Approval-NULL_URL-Doc_Collection_2020-07-14_v05.xlsx')

#### ~ ~ ~ ####
<br>

In [None]:
# Store results (bz2-compressed joblib file, compression level 3):
# v0.4 - 07/08
joblib.dump(
    df,
    './output/Approval_Documents_Collection_2020-07-08_v04_.joblib.bz2',
    compress=('bz2', 3),
)


In [None]:
df.to_excel('Approval_Documents_Collection_2020-07-08_v04.xlsx')

#### ~ ~ ~ ####
<br>

# ******* ~ *** ~ * ~ *** ~ ******* #

<br>
<br>

###  Approval Documents w/NULL URL

TC Approval Documents that have a NULL URL but do have a valid EzShare document reference. 
After filtering, <b>58 documents</b> were identified (07/14).

#### Data Preparation & Exploration

In [None]:
# Load source xlsx file (sheet 'data_filtered_2').
# 'encoding' is not a read_excel parameter (an .xlsx workbook carries its own encoding)
# and current pandas releases reject it with a TypeError, so it is not passed.
data_ = pd.read_excel(r"./input/Data-01 July 2020.xlsx", sheet_name='data_filtered_2')

In [None]:
data_.head()

In [None]:
data_.shape

In [None]:
data_.columns

In [None]:
# Keep only the columns used by the collection process (independent copy of the source frame)
data_2 = data_.loc[:, [
    'FK_OPERATION_ID',
    'OPERATION_NUMBER',
    'DOCUMENT_ID',
    'DOCUMENT_REFERENCE',
    'DESCRIPTION',
    'CREATED',
    'CREATED_BY',
    'MODIFIED',
    'MODIFIED_BY',
    'PROJECT_NUMBER',
    'APPROVAL_NUMBER',
    'FUND',
    'FK_COUNTRY_ID',
    'FK_SUB_SECTOR_ID',
    'DOCUMENT_NAME',
    'URL',
]].copy()

In [None]:
data_2.head()

In [None]:
# **************************************************************************************************************** #

<br>
Using the EzShare code, fill in the URL field by appending the code to the common idbdocs base URL:
URL base: `"https://idbdocs.iadb.org/wsdocs/getDocument.aspx?DOCNUM="` + `"Codigo_EZSHARE"`

In [None]:
url_base = 'https://idbdocs.iadb.org/wsdocs/getDocument.aspx?DOCNUM='

# column is float due to NaNs
data_2['URL'] = data_2['URL'].astype(str)

# build each row's URL as base url + document reference
for index in data_2.index:
    print('Processing index:', str(index))
    data_2.at[index, 'URL'] = url_base + data_2.at[index, 'DOCUMENT_REFERENCE']

In [None]:
data_2.URL[2]

In [None]:
# We will work with the filtered dataframe, and we'll add additional columns for storing the results:
df_2 = data_2.copy()

In [None]:
# for storing the document's name and its status, i.e.: 'downloaded', exception message and final URL
df_2['Document_Name'] = '' #
df_2['Document_Status'] = '' #
df_2['Document_URL'] = '' #

In [None]:
df_2.head()

In [None]:
# Destination folder setup: 'file_dir_2' is the 'Approvals_NULLs' folder on the desktop,
# where all files will be downloaded

desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir_2 = "\\".join([desktop_dir, "Approvals_NULLs"])

print(file_dir_2)

#### Approval Documents w/NULL URL collection:

In [None]:
%%time

# Download the document behind every row of df_2 into 'file_dir_2' and record the
# outcome in the 'Document_Name', 'Document_Status' and 'Document_URL' columns.
for t, index in enumerate(df_2.index, start=1):   # t counts the rows processed so far
    print('## processing index', str(index))

    checked_message, checked_url = url_check(df_2['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    # .get(): a response without a Content-Type header is handled as "not a file"
    if 'application/' in r.headers.get('Content-Type', ''):
        # the first response already carries the file
        print('download document')
        file_name = download_file(file_dir_2, r, df_2['OPERATION_NUMBER'][index])
        df_2.at[index, 'Document_Name'] = file_name
        df_2.at[index, 'Document_Status'] = 'OK - direct download'
        df_2.at[index, 'Document_URL'] = r.url

    else:
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_2.at[index, 'Document_Name'] = 'not downloaded'
            df_2.at[index, 'Document_Status'] = status

        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for hop in r.history:      # cross-site authentication control
                hop_url = hop.request.url

                # candidates are hops whose url ends in a file extension (optionally
                # followed by ?d=...) and that are not the authentication page
                ends_in_file = re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$', hop_url.lower())
                if not ends_in_file or 'Authenticate.aspx' in str(hop_url):
                    continue

                location = hop.url
                print(location)

                if 'sharepoint' not in location:
                    print('\'sharepoint\' not found in url', location)
                    continue

                # connect using sharepoint headers and cookie
                s = requests.get(location, headers = h_sharepoint, allow_redirects = True)

                status = check_content(s)
                if status in ['access denied','not found']:
                    print('***')
                    print(status)
                    print('save result and break')
                    df_2.at[index, 'Document_Name'] = 'not downloaded'
                    df_2.at[index, 'Document_Status'] = status
                    print('***')
                    # NOTE(review): the message says "break" but the remaining hops are
                    # still examined, as before - confirm which one is intended
                    continue

                print('try downloading from', location)
                try:
                    file_name = download_file(file_dir_2, s, df_2['OPERATION_NUMBER'][index])             # download the document and get the filename
                    df_2.at[index, 'Document_Name'] = file_name
                    df_2.at[index, 'Document_Status'] = 'OK - download from redirect'
                    df_2.at[index, 'Document_URL'] = s.url
                    print('Downloaded!')
                    break

                except Exception as e:
                    # keep the error text as the row status and try the next hop
                    df_2.at[index, 'Document_Name'] = 'Not downloaded'
                    df_2.at[index, 'Document_Status'] = str(e)
                    print('Not downloaded: '+ str(e))

    print('##')
    print('')
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
# Rows that still need attention: every status that is not one of the 'OK - ...' outcomes.
# Both 'OK - direct download' and 'OK - download from redirect' count as done.
df_2[~df_2['Document_Status'].str.startswith('OK', na=False)]

In [None]:
# Spot check: display the source URL stored for index 12
# (presumably the row used to test the 'not found' handling - confirm):
df_2['URL'][12]

In [None]:
#(saved results under v0.5)

<br>
<br>
<br>

# Grants

#### Data Preparation & Exploration

In [None]:
# Load source xlsx file:
data_ = pd.read_excel(r"./input/Data-01 July 2020.xlsx", sheet_name='data_3_grants', encoding='latin1')

In [None]:
data_.head()

In [None]:
data_.shape

In [None]:
#### pre filtering: remove those with NaN value in the 'DOCUMENT_NAME' field:
data_ = data_[~data_['DOCUMENT_NAME'].isna()]

In [None]:
data_.columns

In [None]:
data_3 = data_[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 
       'CREATED', 'CREATED_BY', 'MODIFIED', 'MODIFIED_BY', 'PROJECT_NUMBER', 
     'APPROVAL_NUMBER', 'FUND', 'FK_COUNTRY_ID', 'FK_SUB_SECTOR_ID',
    'DOCUMENT_NAME', 'URL']].copy()

In [None]:
data_3.head()

In [None]:
# **************************************************************************************************************** #

In [None]:
# We will work with the filtered dataframe, and we'll add additional columns for storing the results:
df_3 = data_3.copy()

In [None]:
# for storing the document's name and its status, i.e.: 'downloaded', exception message and final URL
df_3['Document_Name'] = '' #
df_3['Document_Status'] = '' #
df_3['Document_URL'] = '' #

In [None]:
df_3.head()

In [None]:
# Destination folder setup: all files will be downloaded to 'file_dir_3'

desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir_3 = desktop_dir + "\\Grants_Approvals"

# Create the folder up front so the first download cannot fail on a missing directory
os.makedirs(file_dir_3, exist_ok=True)

print(file_dir_3)

#### Grants collection:

In [None]:
%%time

t = 1     # counter set

for index, row in df_3.iterrows():
    print('## processing index', str(index))
    
    checked_message, checked_url = url_check(df_3['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    if 'application/' in r.headers['Content-Type']:
        # download
        print('download document')
        file_name = download_file(file_dir_3, r, df_3['OPERATION_NUMBER'][index]) 
        df_3.at[index, 'Document_Name'] = file_name
        df_3.at[index, 'Document_Status'] = 'OK - direct download'
        df_3.at[index, 'Document_URL'] = r.url

    else: 
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_3.at[index, 'Document_Name'] = 'not downloaded'
            df_3.at[index, 'Document_Status'] = status
        
        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for i in range(len(r.history)):      # cross-site authentication control
                if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                    location = r.history[i].url  # effective URL after the redirects
                    print(location)
                    
                    if 'sharepoint' in location:  # connect using sharepoint headers and cookie
                        s = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                        status = check_content(s)
                        if status in ['access denied','not found']:
                            print('***')
                            print(status)
                            print('save result and break')
                            df_3.at[index, 'Document_Name'] = 'not downloaded'
                            df_3.at[index, 'Document_Status'] = status
                            print('***')
                            
                        else: 
                            print('try downloading from', location)
                            try:
                                file_name = download_file(file_dir_3, s, df_3['OPERATION_NUMBER'][index])             # download the document and get the filename
                                df_3.at[index, 'Document_Name'] = file_name
                                df_3.at[index, 'Document_Status'] = 'OK - download from redirect'
                                df_3.at[index, 'Document_URL'] = s.url
                                print('Downloaded!')
                                #count =+ 1 
                                break
                                
                            except Exception as e: 
                                df_3.at[index, 'Document_Name'] = 'Not downloaded'
                                df_3.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                            
                    
                    else: 
                        print('\'sharepoint\' not found in url', location)
     
                    del location
    print('##')
    print('')
    t = t + 1
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
# Rows that still need attention: every status that is not one of the 'OK - ...' outcomes.
# Both 'OK - direct download' and 'OK - download from redirect' count as done.
df_3[~df_3['Document_Status'].str.startswith('OK', na=False)]

In [None]:
%%time

# 2nd run for indexes: [54, 57, 58]

t = 1     # counter set

for index in [54, 57, 58]:
    print('## processing index', str(index))
    
    checked_message, checked_url = url_check(df_3['URL'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    if 'application/' in r.headers['Content-Type']:
        # download
        print('download document')
        file_name = download_file(file_dir_3, r, df_3['OPERATION_NUMBER'][index]) 
        df_3.at[index, 'Document_Name'] = file_name
        df_3.at[index, 'Document_Status'] = 'OK - direct download'
        df_3.at[index, 'Document_URL'] = r.url

    else: 
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_3.at[index, 'Document_Name'] = 'not downloaded'
            df_3.at[index, 'Document_Status'] = status
        
        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for i in range(len(r.history)):      # cross-site authentication control
                if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                    location = r.history[i].url  # effective URL after the redirects
                    print(location)
                    
                    if 'sharepoint' in location:  # connect using sharepoint headers and cookie
                        s = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                        status = check_content(s)
                        if status in ['access denied','not found']:
                            print('***')
                            print(status)
                            print('save result and break')
                            df_3.at[index, 'Document_Name'] = 'not downloaded'
                            df_3.at[index, 'Document_Status'] = status
                            print('***')
                            
                        else: 
                            print('try downloading from', location)
                            try:
                                file_name = download_file(file_dir_3, s, df_3['OPERATION_NUMBER'][index])             # download the document and get the filename
                                df_3.at[index, 'Document_Name'] = file_name
                                df_3.at[index, 'Document_Status'] = 'OK - download from redirect'
                                df_3.at[index, 'Document_URL'] = s.url
                                print('Downloaded!')
                                #count =+ 1 
                                break
                                
                            except Exception as e: 
                                df_3.at[index, 'Document_Name'] = 'Not downloaded'
                                df_3.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                            
                    
                    else: 
                        print('\'sharepoint\' not found in url', location)
     
                    del location
    print('##')
    print('')
    t = t + 1
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
#(saved results under v0.6)

<br>
<br>
<br>

# Loans

#### Data Preparation & Exploration

In [None]:
# Load source xlsx file:
data_ = pd.read_excel(r"./input/Data-01 July 2020.xlsx", sheet_name='data_4_loans', encoding='latin1')

In [None]:
data_.shape

In [None]:
data_.columns

In [None]:
data_loans = data_[['FK_OPERATION_ID', 'OPERATION_NUMBER', 'DOCUMENT_ID', 'DOCUMENT_REFERENCE', 'DESCRIPTION', 
       'CREATED', 'CREATED_BY', 'MODIFIED', 'MODIFIED_BY', 'PROJECT_NUMBER', 
     'APPROVAL_NUMBER', 'FUND', 'FK_COUNTRY_ID', 'FK_SUB_SECTOR_ID',
    'DOCUMENT_NAME', 'URL']].copy()

In [None]:
data_loans.head()

In [None]:
# **************************************************************************************************************** #

In [None]:
# We will work with the filtered dataframe, and we'll add additional columns for storing the results:
df_4 = data_loans.copy()

In [None]:
# for storing the document's name and its status, i.e.: 'downloaded', exception message and final URL
df_4['Document_Name'] = '' #
df_4['Document_Status'] = '' #
df_4['Document_URL'] = '' #

In [None]:
df_4.head()

In [None]:
# Destination folder setup: all files will be downloaded to 'file_dir_4'

desktop_dir = "C:\\Users\\emilianoco\\Desktop"
file_dir_4 = desktop_dir + "\\Loans_Approvals"

# Create the folder up front so the first download cannot fail on a missing directory
os.makedirs(file_dir_4, exist_ok=True)

print(file_dir_4)

#### Loans collection:

In [None]:
# We will use the EzShare code approach: each loan's 'DOCUMENT_REFERENCE' is appended
# to url_base to build the URL to request. url_base must already be defined at this
# point; it is displayed here as a check.
url_base

In [None]:
%%time

t = 1     # counter set

# Using idbdocs + EzShare 'DOCUMENT_REFERENCE': 

for index, row in df_4.iterrows():
    print('## processing index', str(index))
    
    checked_message, checked_url = url_check(url_base + df_4['DOCUMENT_REFERENCE'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    if 'application/' in r.headers['Content-Type']:
        # download
        print('download document')
        file_name = download_file(file_dir_4, r, df_4['OPERATION_NUMBER'][index]) 
        df_4.at[index, 'Document_Name'] = file_name
        df_4.at[index, 'Document_Status'] = 'OK - direct download'
        df_4.at[index, 'Document_URL'] = r.url

    else: 
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_4.at[index, 'Document_Name'] = 'not downloaded'
            df_4.at[index, 'Document_Status'] = status
        
        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for i in range(len(r.history)):      # cross-site authentication control
                if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                    location = r.history[i].url  # effective URL after the redirects
                    print(location)
                    
                    if 'sharepoint' in location:  # connect using sharepoint headers and cookie
                        s = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                        status = check_content(s)
                        if status in ['access denied','not found']:
                            print('***')
                            print(status)
                            print('save result and break')
                            df_4.at[index, 'Document_Name'] = 'not downloaded'
                            df_4.at[index, 'Document_Status'] = status
                            print('***')
                            
                        else: 
                            print('try downloading from', location)
                            try:
                                file_name = download_file(file_dir_4, s, df_4['OPERATION_NUMBER'][index])             # download the document and get the filename
                                df_4.at[index, 'Document_Name'] = file_name
                                df_4.at[index, 'Document_Status'] = 'OK - download from redirect'
                                df_4.at[index, 'Document_URL'] = s.url
                                print('Downloaded!')
                                #count =+ 1 
                                break
                                
                            except Exception as e: 
                                df_4.at[index, 'Document_Name'] = 'Not downloaded'
                                df_4.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                            
                    
                    else: 
                        print('\'sharepoint\' not found in url', location)
     
                    del location
    print('##')
    print('')
    t = t + 1
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
# Rows that still need attention: every status that is not one of the 'OK - ...' outcomes.
# Both 'OK - direct download' and 'OK - download from redirect' count as done.
df_4[~df_4['Document_Status'].str.startswith('OK', na=False)]

In [None]:
# Index labels of the loans whose 'Document_Status' is still empty, i.e. not processed yet
not_processed = df_4['Document_Status'] == ''
pending_1 = df_4.index[not_processed]

In [None]:
%%time

# 2nd run for indexes in pending_1

t = 1     # counter set

# Using idbdocs + EzShare 'DOCUMENT_REFERENCE': 

for index in pending_1:
    print('## processing index', str(index))
    
    checked_message, checked_url = url_check(url_base + df_4['DOCUMENT_REFERENCE'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    if 'application/' in r.headers['Content-Type']:
        # download
        print('download document')
        file_name = download_file(file_dir_4, r, df_4['OPERATION_NUMBER'][index]) 
        df_4.at[index, 'Document_Name'] = file_name
        df_4.at[index, 'Document_Status'] = 'OK - direct download'
        df_4.at[index, 'Document_URL'] = r.url

    else: 
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_4.at[index, 'Document_Name'] = 'not downloaded'
            df_4.at[index, 'Document_Status'] = status
        
        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for i in range(len(r.history)):      # cross-site authentication control
                if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                    location = r.history[i].url  # effective URL after the redirects
                    print(location)
                    
                    if 'sharepoint' in location:  # connect using sharepoint headers and cookie
                        s = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                        status = check_content(s)
                        if status in ['access denied','not found']:
                            print('***')
                            print(status)
                            print('save result and break')
                            df_4.at[index, 'Document_Name'] = 'not downloaded'
                            df_4.at[index, 'Document_Status'] = status
                            print('***')
                            
                        else: 
                            print('try downloading from', location)
                            try:
                                file_name = download_file(file_dir_4, s, df_4['OPERATION_NUMBER'][index])             # download the document and get the filename
                                df_4.at[index, 'Document_Name'] = file_name
                                df_4.at[index, 'Document_Status'] = 'OK - download from redirect'
                                df_4.at[index, 'Document_URL'] = s.url
                                print('Downloaded!')
                                #count =+ 1 
                                break
                                
                            except Exception as e: 
                                df_4.at[index, 'Document_Name'] = 'Not downloaded'
                                df_4.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                            
                    
                    else: 
                        print('\'sharepoint\' not found in url', location)
     
                    del location
    print('##')
    print('')
    t = t + 1
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

In [None]:
pending_2 = df_4[(df_4['Document_Status'] == '')].index

In [None]:
pending_2

In [None]:
%%time

# 3rd run for indexes in pending_2

t = 1     # counter set

# Using idbdocs + EzShare 'DOCUMENT_REFERENCE': 

for index in pending_2:
    print('## processing index', str(index))
    
    checked_message, checked_url = url_check(url_base + df_4['DOCUMENT_REFERENCE'][index])
    print(checked_message, checked_url)
    r = requests.get(checked_url, headers = h_idbdocs, allow_redirects = True)

    if 'application/' in r.headers['Content-Type']:
        # download
        print('download document')
        file_name = download_file(file_dir_4, r, df_4['OPERATION_NUMBER'][index]) 
        df_4.at[index, 'Document_Name'] = file_name
        df_4.at[index, 'Document_Status'] = 'OK - direct download'
        df_4.at[index, 'Document_URL'] = r.url

    else: 
        status = check_content(r)
        if status in ['access denied','not found']:
            print(status)
            print('save result and break')
            df_4.at[index, 'Document_Name'] = 'not downloaded'
            df_4.at[index, 'Document_Status'] = status
        
        else:
            print('continue')
            print('... checking request.history[i].url ...')
            for i in range(len(r.history)):      # cross-site authentication control
                if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                    location = r.history[i].url  # effective URL after the redirects
                    print(location)
                    
                    if 'sharepoint' in location:  # connect using sharepoint headers and cookie
                        s = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                        status = check_content(s)
                        if status in ['access denied','not found']:
                            print('***')
                            print(status)
                            print('save result and break')
                            df_4.at[index, 'Document_Name'] = 'not downloaded'
                            df_4.at[index, 'Document_Status'] = status
                            print('***')
                            
                        else: 
                            print('try downloading from', location)
                            try:
                                file_name = download_file(file_dir_4, s, df_4['OPERATION_NUMBER'][index])             # download the document and get the filename
                                df_4.at[index, 'Document_Name'] = file_name
                                df_4.at[index, 'Document_Status'] = 'OK - download from redirect'
                                df_4.at[index, 'Document_URL'] = s.url
                                print('Downloaded!')
                                #count =+ 1 
                                break
                                
                            except Exception as e: 
                                df_4.at[index, 'Document_Name'] = 'Not downloaded'
                                df_4.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                            
                    
                    else: 
                        print('\'sharepoint\' not found in url', location)
     
                    del location
    print('##')
    print('')
    t = t + 1
    if (t % 7) == 0:
        print('')
        print("* * 2 seconds pause")
        time.sleep(2) # 2 sec pause inserted every 7 docs
        print('')
    print('')

# **************************************************************************************************************** #

In [None]:
### PREVIOUS scraper - NOT TO BE USED
# Kept for reference only. It classifies each df_2 URL with pcr_validate() and then
# downloads through one of three paths ('sharepoint_directly', 'idbdocs_directly',
# anything else = 'connect_twice'), writing the outcome to 'Document_Name' / 'Document_Status'.
# NOTE(review): a cell magic is normally only recognised on the first line of a cell, so with
# the comment lines above it the %%time below presumably stops this cell from running as-is
# (which fits "NOT TO BE USED") - confirm before reviving it.
# NOTE(review): download_file() is called here with two arguments, while other cells of this
# notebook pass a third one (the operation number) - verify against the current definition.

%%time

### Run 07/03-v_updated to correct idbdocs/sharepoint response on not found documents [latest version (06/18/2020) and variables dataframe df_2 and file_dir_2 - added .strip() to the URL parsing]

t = 1     # counter set

for index in [1187, 1223]:
#for index in to_redownload:
    print("Processing index: ", str(index))
    if df_2.URL[index] != '':   # rows with an empty URL are skipped entirely (not counted, no pause)
    
        print(pcr_validate(df_2['URL'][index].strip()))
    
        # pcr_validate() returns a pair: [0] is the route label tested below, [1] the URL to request
        if pcr_validate(df_2['URL'][index].strip())[0] == 'sharepoint_directly':

            test = pcr_validate(df_2['URL'][index].strip())[1].strip()

            r = requests.get(test, headers = h_sharepoint, allow_redirects = True)
            #print('request passed')

            if file_extension.search(test): #check if extension is present
                file_name = download_file(file_dir_2, r)             # download the document and get the filename
            else:
                file_name = download_file_2(file_dir_2, r)             # download the document and get the filename

            print('Downloading file: ' + file_name) 
            df_2.at[index, 'Document_Name'] = file_name
            df_2.at[index, 'Document_Status'] = 'OK'
    
        elif pcr_validate(df_2['URL'][index].strip())[0] == 'idbdocs_directly':

            test = pcr_validate(df_2['URL'][index].strip())[1].strip()

            r = requests.get(test, headers = h_idbdocs, allow_redirects = True)
            
            if 'could not be found in Sharepoint EzShare' in str(r.content):  # document not found
                df_2.at[index, 'Document_Name'] = 'Not found'
                df_2.at[index, 'Document_Status'] = 'Not found'
                print('Not Downloaded! - document not found')
                
            
            elif 'Content-Disposition' in r.headers: 
                file_name = download_file_2(file_dir_2, r)             # download the document and get the filename
                print('Downloading file: ' + file_name) 
                df_2.at[index, 'Document_Name'] = file_name
                df_2.at[index, 'Document_Status'] = 'OK'

            else:
                # Look through the redirect chain for a hop whose URL ends in a 3-4 letter extension
                # (this branch's pattern does not accept a trailing '?d=...' query)
                for i in range(len(r.history)):      # cross-site authentication control
                    if bool(re.search(r'\.[a-z]{3}[a-z]?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)):
                        location = r.history[i].url  # effective URL after the redirects
                        print(location)


                        # r is rebound to the new response; the loop is left right after the download
                        r = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        file_name = download_file(file_dir_2, r)             # download the document and get the filename
                        print('Downloading file: ' + file_name) 
                        df_2.at[index, 'Document_Name'] = file_name
                        df_2.at[index, 'Document_Status'] = 'OK - redirected'
                        print('Downloaded idbdocs redirected!')
                        break

        else: # 'connect_twice'

            test = pcr_validate(df_2['URL'][index].strip())[1].strip()

            r = requests.get(test, headers = h_idbdocs, allow_redirects = True)

            if 'could not be found in Sharepoint EzShare' in str(r.content):  # document not found
                df_2.at[index, 'Document_Name'] = 'Not found'
                df_2.at[index, 'Document_Status'] = 'Not found'
                print('Not Downloaded! - document not found')
                
            
            elif 'Content-Disposition' in r.headers: 
                file_name = download_file_2(file_dir_2, r)             # download the document and get the filename
                print('Downloading file: ' + file_name) 
                df_2.at[index, 'Document_Name'] = file_name
                df_2.at[index, 'Document_Status'] = 'OK'

            else:
                for i in range(len(r.history)):      # cross-site authentication control
                    if bool(re.search(r'\.[a-z]{3}[a-z]?(\?d\=[a-z0-9]+)?$',r.history[i].request.url.lower())) and not ('Authenticate.aspx' in str(r.history[i].request.url)): # added control when url ends in ?d=...
                        location = r.history[i].url  # effective URL after the redirects
                        print(location)


                        # NOTE(review): r is rebound here while the loop still indexes r.history[i];
                        # if the download below raises, the next iteration reads the history of this
                        # new response (the range was sized from the old one) - confirm this is safe.
                        r = requests.get(location, headers = h_sharepoint, allow_redirects = True)
                        
                                                
                        if ('AccessDenied.aspx' in str(r.content)) or ('does not have permissions to access this resource' in str(r.content)): # access denied!!! 
                            df_2.at[index, 'Document_Name'] = 'Not downloaded'
                            df_2.at[index, 'Document_Status'] = 'Not downloaded'
                            print('Not Downloaded! - access restricted')
                            break
                        
                        else: 
                            try:
                                file_name = download_file(file_dir_2, r)             # download the document and get the filename
                                print('Downloading file: ' + file_name) 
                                df_2.at[index, 'Document_Name'] = file_name
                                df_2.at[index, 'Document_Status'] = 'OK - redirected'
                                print('Downloaded!')
                                i = i + 1   # has no effect: the loop is left on the next line
                                break
                            
                            except Exception as e: 
                                df_2.at[index, 'Document_Name'] = 'Not downloaded'
                                df_2.at[index, 'Document_Status'] = str(e)
                                print('Not downloaded: '+ str(e))
                    
        print('')
        print('Processed document: ', str(t))                
        t = t + 1
        print('')
        print('-----------------------------')
        
        # t starts at 1 and is incremented before this check, so the first pause
        # comes after the 5th processed document and then after every 6th
        if (t % 6) == 0:
            print('')
            print("*** 3 seconds pause")
            time.sleep(3) # 3 sec pause inserted every 6 docs
            print('')
            print('')

# **************************************************************************************************************** #

In [None]:
## END

In [None]:
#'''
# ******************************************************************************************************************** #
# *************************************************  Version Control  ************************************************ #
# ******************************************************************************************************************** #
  
#   Version:            Date:                User:                    Change:                                          #

#   - 0.7           07/15/2020        Emiliano Colina       - Loans document type added
#

#   - 0.6           07/14/2020        Emiliano Colina       - Grants document type added to the data collection process
#

#   - 0.5           07/14/2020        Emiliano Colina       - Added TCs that had 'NULL' value in the URL field, but 
#                                                           an EzShare code was present

#   - 0.4           07/07/2020        Emiliano Colina       - Updated functions for data collection process
#                                                           

#   - 0.3           07/03/2020        Emiliano Colina       - Re-run data collection process on document list having 
#                                                           07/01 as cut-off date

#   - 0.2           06/18/2020        Emiliano Colina       - Included TCs 'Approval Documents' type
#                                                            

#   - 0.1           06/16/2020        Emiliano Colina       - Initial version, started with 'Approval Registry' type
#                                                            

#
# ******************************************************************************************************************** #
#'''
