In [53]:
# IMPORTS

import pandas as pd
import numpy as np
import spacy

In [54]:
# CONSTANTS

TRAIN_DATA = './data/train_data.csv'
TRAIN_LABEL = './data/train_labels.csv'

TEST_DATA = './data/test_data.csv'
TEST_LABEL = './data/test_labels.csv'

In [55]:
# CONFIG

class CONFIG:

    load_pickle = True
    
    is_toy = True
    toy_size = 20
    epochs = 20
    validation_split = 0.2

In [56]:
def load_spacy():
    print('INFO - Loading Spacy, may take a while')
    nlp_spacy = spacy.load("en_core_web_md")
    print('INFO - Loading Spacy complete')
    print(spacy.info())
    return nlp_spacy

nlp_spacy = None

if nlp_spacy is None:
    nlp_spacy = load_spacy()
else:
    print('INFO - Spacy already in place.')
    print(spacy.info())

INFO - Loading Spacy, may take a while
INFO - Loading Spacy complete
[1m

spaCy version    2.1.8                         
Location         /opt/conda/lib/python3.7/site-packages/spacy
Platform         Linux-4.9.184-linuxkit-x86_64-with-debian-buster-sid
Python version   3.7.3                         
Models                                         

{'spaCy version': '2.1.8', 'Location': '/opt/conda/lib/python3.7/site-packages/spacy', 'Platform': 'Linux-4.9.184-linuxkit-x86_64-with-debian-buster-sid', 'Python version': '3.7.3', 'Models': ''}


In [86]:
# DATA LOAD

def data_load_files():
    
    print('INFO -- Loading data files')
    train_label_df = pd.read_csv(TRAIN_LABEL, usecols=['document_name','is_fitara'])
    train_data_df = pd.read_csv(TRAIN_DATA)
    
    test_label_df = pd.read_csv(TEST_LABEL, usecols=['document_name','is_fitara'])
    test_data_df = pd.read_csv(TEST_DATA)
    
    if CONFIG.is_toy:
        train_label_df = train_label_df.sample(n=CONFIG.toy_size)
        train_data_df = train_data_df[train_data_df.document_name.isin(train_label_df.document_name.tolist())]
    
    return train_data_df, train_label_df, test_data_df, test_label_df

def data_merge(data_df ,label_df):
    
    print('INFO -- merge data and label dfs')
    df  = pd.merge (
        label_df, 
        data_df, 
        on = 'document_name', 
        how = 'inner'
    )
    df.head()
    
    return df

def data_pre_processing(text):

    print("INFO - processing - " + text[:4])
    texts = []
    doc = nlp_spacy(text, disable=['parser', 'ner'])
    tokens = [token for token in doc if not token.is_stop]
    tokens = [token for token in tokens if not token.is_punct]
    tokens = [token.lemma_.lower().strip() for token in tokens if token.lemma_ != '-PRON-']
    tokens = ' '.join(tokens)
#     print(tokens)
    texts.append(tokens)
    return pd.Series([texts, len(tokens)])



def data_print_stats(df):
    
    print('INFO -- shape', df.shape)
    print('INFO -- describe')
    print(df.describe())
    
def data_get_train_df(df):
    result = df['text'].apply(lambda x: data_pre_processing(x))
    df['processed_text'], df['wc'] = result[0], result[1]
    df['processed_text'] = df.apply(lambda row: row.processed_text[0], axis = 1) 
    return df


In [87]:
train_data_df, train_label_df, test_data_df, test_label_df = data_load_files()

train_df = data_merge(train_data_df, train_label_df)
test_df = test_data_df

# data_print_stats(train_df)
# data_print_stats(test_df)

INFO -- Loading data files
INFO -- merge data and label dfs


In [88]:
train_df = data_get_train_df(train_df)

INFO - processing - AMEN
INFO - processing - Limi
INFO - processing - WD 1
INFO - processing - PART
INFO - processing - STAT
INFO - processing - 52.2
INFO - processing - - 1 
INFO - processing - The 
INFO - processing - STAT
INFO - processing - - 1 
INFO - processing - 52.2
INFO - processing - Depa
INFO - processing - 52.2
INFO - processing - Ofﬁc
INFO - processing - PART
INFO - processing - 5127
INFO - processing - 52.2
INFO - processing - Stat
INFO - processing - PART
INFO - processing - 52.2


In [89]:
train_df.head()

Unnamed: 0,document_name,is_fitara,text,processed_text,wc
0,DHHS-NIHAO2016038-amd3.pdf,Yes,AMENDMENT OF SOLICITATION / MODIFICATION OF \n...,amendment solicitation modification contract ...,759107
1,1200041I_Limited_Source_Justification.pdf,Yes,"Limited Source Justiﬁcation\n\n \n\n \n\n""Sour...",limited source justiﬁcation source selection ...,5652
2,Wage_Determination_2015-2103.pdf,No,WD 15-2103 (Rev.-2) was first posted on www.wd...,wd 15 2103 rev.-2 post www.wdol.gov 01/05/2016...,22719
3,FAR_52-212-5.pdf,No,PART 52 Solicitation Provisinns and Comract Cl...,52 solicitation provisinns comract clause page...,13965
4,SOW_ServiceContract_AirPollutionSystem_NOIRMLC...,No,STATEMENT OF WORK Envitech Air Pollution Contr...,statement work envitech air pollution control ...,2656


In [90]:
data_print_stats(train_df)

INFO -- shape (20, 5)
INFO -- describe
                  wc
count      20.000000
mean    77066.450000
std    175351.578623
min       542.000000
25%      6036.000000
50%     13725.000000
75%     40504.250000
max    759107.000000
