In [82]:
import json
import requests
from bs4 import BeautifulSoup
from time import sleep
import urllib.request
from urllib.request import Request
from contextlib import closing
import shutil
import os
from mimetypes import guess_extension
import random
import re
from requests import exceptions
import textract
import string
import pandas as pd
import numpy as np

# Import a Nightly FBO File

In [25]:
# The nightly file is double-encoded: its top-level JSON value is a string whose
# content is itself JSON, so it has to be decoded twice.
# The encoding is given explicitly so the read doesn't depend on the platform default.
with open('nightly_files/fbo_nightly_20180506.json', encoding='utf-8') as f:
    json_str = json.load(f)
json_data = json.loads(json_str)

# Get Solicitation Attachments
Each notice includes the url of its fbo page. Some of those fbo pages have links to solicitation documents. In this section, we'll scrape each notice's page to find and download those solicitation docs.

### TO-DO
 - Ensure that all attachments on fbo pages are found in `<div class="notice_attachment_ro">` elements

In [26]:
def _guess_attachment_extension(content_type):
    '''
    Map a Content-Type header value to a file extension (including the dot).

    Arguments:
        content_type (str): the raw header value, e.g. 'application/pdf; charset=utf-8'.
                            May be empty when the server sent no Content-Type.

    Returns:
        str: '.rtf' for 'application/msword', otherwise whatever `guess_extension`
             knows for the MIME type, falling back to '.docx' when nothing is found.
    '''
    # drop any parameters (e.g. "; charset=utf-8") so only the bare MIME type is left
    mime_type = content_type.split(';')[0].strip().lower()
    if mime_type == 'application/msword':
        return '.rtf'
    # no extensions found for 'application/vnd.openxmlformats-o' content-type
    return guess_extension(mime_type) or '.docx'


def write_file(attachment_url, file_name, out_path):
    '''
    Void function that, given a url to an attachment, downloads and writes it.

    Urls containing '/utils/view?id' get a file extension appended to `file_name`,
    derived from the response's Content-Type header. 'ftp://' urls are fetched with
    urllib; everything else is fetched with requests and saved under `file_name` as is.

    Arguments:
        attachment_url (str): the url of the document
        file_name (str): what you'd like to save that document as
        out_path (str): where you'd like to save that document

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if the HTTP(S) download fails, times out
            (10 seconds) or comes back with an error status.
        OSError: if the FTP download fails (urllib reports these as URLError, an
            OSError subclass) or the file can't be written.
    '''
    is_fbo_view = '/utils/view?id' in attachment_url

    if 'ftp://' in attachment_url and not is_fbo_view:
        # requests ships without an FTP adapter, so these urls must be handled
        # before any requests.get call and go through urllib instead
        target = os.path.join(out_path, file_name)
        with closing(urllib.request.urlopen(attachment_url, timeout=10)) as response:
            with open(target, 'wb') as f:
                shutil.copyfileobj(response, f)
        return

    r = requests.get(attachment_url, timeout=10)
    # don't save an HTML error page as if it were the attachment
    r.raise_for_status()
    if is_fbo_view:
        # these urls carry no file extension, so derive one from the response headers
        file_name = file_name + _guess_attachment_extension(r.headers.get('Content-Type', ''))
    target = os.path.join(out_path, file_name)
    with open(target, mode='wb') as f:
        f.write(r.content)

In [27]:
def get_fbo_attachments(json_data):
    '''
    Void function that, given json for a single nightly FBO file, visits each notice's
    fbo url, finds solicitation documents, and writes them to disk.

    Side effects:
        - every notice that has a 'url' gains an 'attachments' key of the form
          {'url': [...], 'text': []}, where 'url' lists the attachment links found.
        - each attachment is downloaded into an 'attachments' directory under the
          current working directory (created on demand).
        - request/download failures are printed and skipped rather than raised.

    Arguments:
        json_data (dict): a dict representing json data, ideally as the result of:
        ```
        with open('fbo_nightly_20180506.json') as f:
            json_str = json.load(f)
            json_data = json.loads(json_str)
        ```

    Returns:
        None
    '''

    out_path = os.path.join(os.getcwd(), "attachments")

    for notice_type in json_data:
        for notice in json_data[notice_type]:
            try:
                fbo_url = notice['url']
            except (KeyError, TypeError):
                # no fbo url on this notice, so there's no page to scrape
                continue
            notice['attachments'] = {'url': [], 'text': []}
            try:
                r = requests.get(fbo_url, timeout=10)
            except exceptions.RequestException as e:
                # one unreachable notice page shouldn't abort the whole nightly file
                print(f"Failed:  {e}.\n\t Perhaps inspect:  {fbo_url}")
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            attachment_divs = soup.find_all('div', {"class": "notice_attachment_ro"})
            for d in attachment_divs:
                link = d.find('a')
                attachment_href = link.get('href') if link is not None else None
                if not attachment_href:
                    # div without a usable link: nothing to download
                    continue
                if '/utils/view?id' in attachment_href:
                    # presumably a site-relative link, so prefix the fbo host
                    attachment_url = 'https://fbo.gov' + attachment_href
                else:
                    attachment_url = attachment_href
                notice['attachments']['url'].append(attachment_url)
                file_name = os.path.basename(attachment_url)
                os.makedirs(out_path, exist_ok=True)
                try:
                    write_file(attachment_url, file_name, out_path)
                except exceptions.Timeout:
                    print(f"Connection timed out after 10 seconds.\n\t Perhaps inspect:  {attachment_url}")
                except Exception as e:
                    print(f"Failed:  {e}.\n\t Perhaps inspect:  {attachment_url}")

In [28]:
# Scrape every notice in `json_data` and download its attachments into ./attachments.
# This makes a network request for each notice page and each attachment found.
get_fbo_attachments(json_data)

# Extract Text from Attachments

In [80]:
def get_attachment_text(attachments_path, out_path='attachment_texts'):
    '''
    Void function that extracts and writes to a new dir the text from all the files in `attachments_path`.

    Each non-hidden file is run through textract and its text is saved (UTF-8) in
    `out_path` as the original file name with its last extension replaced by '.txt'.
    Files that fail at any step are reported on stdout and skipped.

    Arguments:
        attachments_path(str): the directory name where the FBO attachments are located.
        out_path (str): the directory to write the text files to; created if missing.
                        Defaults to 'attachment_texts'.

    Returns:
        None
    '''

    os.makedirs(out_path, exist_ok=True)

    for file_name in os.listdir(attachments_path):
        if file_name.startswith('.'):
            # skip hidden files
            continue
        file_path = os.path.join(attachments_path, file_name)
        try:
            b_text = textract.process(file_path, encoding='utf-8')
            # textract was asked for utf-8 output above, so decode with that codec
            # directly instead of guessing the encoding
            text = b_text.decode('utf-8')
            base = os.path.splitext(file_name)[0]
            out = os.path.join(out_path, base + '.txt')
            # explicit encoding so non-ascii text is written the same way on every platform
            with open(out, 'w', encoding='utf-8') as f:
                f.write(text)
        except Exception as e:
            # best effort: report the file that failed and carry on with the rest
            print("-"*80)
            print(e)
            print(file_name)

In [81]:
# Extract the text of every file in ./attachments into ./attachment_texts.
get_attachment_text('attachments')

# Import Unlabeled Texts

In [83]:
def create_unlabeled_df(unlabeled_data_path):
    '''
    Build a pandas DataFrame from the unlabeled attachment texts.

    Hidden files (names starting with '.') are ignored.

    Arguments:
        unlabeled_data_path (str): the directory holding the attachment text files.

    Returns:
        unlabeled_df (pandas DataFrame): one row per file with the columns 'file'
                                         (the file name), 'text' (the file contents)
                                         and 'label' (always np.nan).
    '''

    visible_names = [
        name for name in os.listdir(unlabeled_data_path) if not name.startswith('.')
    ]

    contents = []
    for name in visible_names:
        full_path = os.path.join(unlabeled_data_path, name)
        # latin1 is used to be generous with decoding
        with open(full_path, 'r', encoding='latin1') as handle:
            contents.append(handle.read())

    missing_labels = [np.nan] * len(visible_names)

    # rows of (files, texts, labels) flipped into one column each
    unlabeled_df = pd.DataFrame(data=[visible_names, contents, missing_labels]).transpose()
    unlabeled_df.columns = ['file', 'text', 'label']
    return unlabeled_df

In [84]:
# Load the extracted attachment texts; the label column is NaN for every row.
unlabeled_df = create_unlabeled_df('attachment_texts')

In [88]:
# Rows with an empty `text` column are likely image-only (scanned) PDFs with no
# extractable text, which means those documents are not Section 508 compliant.
unlabeled_df.head()

Unnamed: 0,file,text,label
0,DocumentServer.aspx?DocumentId=4281869&FileNam...,FedBizOpps\n\nPresolicitation Notice\n\n*\n\n*...,
1,N40192-18-P-0022_SSJ.txt,,
2,N40192-18-P-0023_SSJBUS.txt,,
3,N4019218P0016_SSJBUS.txt,,
4,view?id=0ab4be24e7d05feb8d7d1d34d50e8bff.txt,Vendor 1\n\n\n\nIs there a requirement for thi...,


# Import Labeled Texts

In [77]:
def create_labeled_df(labeled_data_path):
    '''
    Build a pandas DataFrame from the labeled attachment texts.

    Hidden files (names starting with '.') are ignored. A file's label is the part
    of its name before the first underscore.

    Arguments:
        labeled_data_path (str): the directory holding the labeled attachment text files.

    Returns:
        labeled_df (pandas DataFrame): one row per file with the columns 'file'
                                       (the file name), 'text' (the file contents)
                                       and 'label' (green, yellow or red).
    '''

    visible_names = [
        name for name in os.listdir(labeled_data_path) if not name.startswith('.')
    ]
    name_prefixes = [name.split('_')[0] for name in visible_names]

    contents = []
    for name in visible_names:
        full_path = os.path.join(labeled_data_path, name)
        # latin1 is used to be generous with decoding
        with open(full_path, 'r', encoding='latin1') as handle:
            contents.append(handle.read())

    # rows of (files, texts, labels) flipped into one column each
    labeled_df = pd.DataFrame(data=[visible_names, contents, name_prefixes]).transpose()
    labeled_df.columns = ['file', 'text', 'label']
    return labeled_df

In [78]:
# Load the labeled documents; each label is the file-name prefix before the first '_'.
labeled_df = create_labeled_df('labeled_fbo_docs')

In [89]:
# Preview the first rows of the labeled documents.
labeled_df.head()

Unnamed: 0,file,text,label
0,GREEN_10-223-SOL-00051.txt,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",GREEN
1,GREEN_1055518.txt,\nStatement of Work -\n ...,GREEN
2,GREEN_1055521.txt,\n\nStatement of Work:\n\n1.0 BACKGROUND\nFD...,GREEN
3,GREEN_1057498.txt,\nAttachment A- High availability equipment (T...,GREEN
4,GREEN_105787.txt,"<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 T...",GREEN
