In [35]:
import pandas as pd
import re

In [36]:
df = pd.read_csv('../../data/dataset_from_json_v2.csv')
df.head()

Unnamed: 0,words,sentence #,tag
0,BRIDGESTONE,0,O
1,BRAND,0,O
2,TIRE(S),0,GoodsDescription
3,AND,0,GoodsDescription
4,O-RING(S),0,GoodsDescription


In [37]:
df.iloc[115,:]

words         GREY
sentence #       5
tag              O
Name: 115, dtype: object

In [38]:
combine = lambda x: ' '.join(x['words'])

In [61]:
words = df.groupby(df['sentence #']).apply(combine)
words[0:10]

sentence #
0    BRIDGESTONE BRAND TIRE(S) AND O-RING(S) AS PER...
1    BRIDGESTONE BRAND TIRE(S) AND O-RING(S) AS PER...
2    BRIDGESTONE BRAND TIRE(S) AND O-RING(S). AS PE...
3    LEATHER MEN S AND/OR LEATHER LADIES GARMENTS U...
4    LEATHER MEN S AND/OR LEATHER LADIES GARMENTS U...
5    QUANTITY <QUANTITY> M(+/-10 O/O) DESCRIPTION P...
6    QUANTITY <QUANTITY> M(+/-10 O/O) DESCRIPTION P...
7    TOYS AS PER PROFORMA INVOICE NO TSO1801008 ACC...
8    TOYS AS PER PROFORMA INVOICE NO TSO1801008 ACC...
9    <QUANTITY> Boxes of Tennis Sneakers marked Djo...
dtype: object

In [40]:
# example
words[5]

'QUANTITY 90000 M(+/-10 O/O) DESCRIPTION POLIESTER SATIN FABRIC CODE 42250 GREY A7 GRADE PO N07929/10 CIF INCOTERMS 2010'

In [41]:
find_special = lambda x: '58/60' in x #'CFR KARACHI PORT/ PORT QASIM TOTAL QUANTITY 21 MT PARAFFIN WAX 58/60 FULLY REFINED AT USD 950.00 PER M/TON AS PER INDENT NO. SBR/DEPT.C/581/2019 DATED 19.11.19 OD M/S S.B.R. AND CO. KARACHI PAKISTAN'
find_special = lambda x: '950.00' in x 

specials = words[words.apply(find_special)]
specials

Series([], dtype: object)

In [42]:
# we need \S instead of \w because we have dates: 31.10.2019 which need to be extracted as a single entity

# EXAMPLE
# extract_containing_numbers = lambda x: re.findall(r'\d+\S+', x)
# extract_containing_numbers
# words.apply(extract_containing_numbers).tolist()

In [62]:
#extract the NUMBERS, INCOTERMS and DATES for better tokenisation
extract_numbers = lambda x: re.findall(r'\d+\S+', x)
list_words = words.apply(extract_numbers).tolist()
flat_list = [el for l in list_words for el in l]
flat_list[0:5]

['8ACA021X', '8ACA021X', '201', '8ACA021X', '201']

In [44]:
words
words.apply(extract_numbers).tolist()
len(flat_list)

235

In [45]:
numbers = set(flat_list)
numbers

# we now need to remove ONLY the . at the end
numbers = [n[:-1] if n[-1] == '.' else n for n in list(numbers)]
numbers[0:5]

['997', '10/MT', '794', '56', '77000/MT']

Now we need to classify them in 3 categories: 
1. \<number\> for Quantity and UnitPriceAmount example 2000
2. \<ID\> for INCOTERMS example 2019H29Q2
3. \<DATE\> for improved tokenisation

Why? We have a new date everyday, and if we include every day as another date in the vocabulary we might think eroneously that we have a large vocabulary, when in fact all we have is a lot of numbers, ids and dates.

In [46]:
nrs = pd.Series(numbers) #convert to Series for ease of use
nrs.head(), nrs.shape

(0         997
 1       10/MT
 2         794
 3          56
 4    77000/MT
 dtype: object,
 (156,))

I am speculating that

1. Quantity contains only Integer values: 1000, 200, 7000
2. UnitPriceAmount contains Float values: 250.00, 293.11
3. Dates have a specific type: xx/xx/xxxx or xx-xx-xxxx
4. Everything else is Incoterms

In [47]:
# Quantity
def convert_to_int(x):    
    try:
        int(x)
        return True
    except ValueError:
        return False

quantities = nrs.apply(convert_to_int)

In [48]:
# UnitPriceAmount
def convert_to_float(x):    
    try:
        if not convert_to_int(x):
            float(x)
        else:
            return False

        return True
    except ValueError:
        return False

units = nrs.apply(convert_to_float)

In [49]:
# Dates
def convert_to_ids(x):    
    try:
        is_date = re.findall('^\d{2}(\.|\/)\d{2}(\.|\/)\d{4}$', x)
        if len(is_date) == 0:
            return False
        else:
            return True
    except ValueError:
        return False

dates = nrs.apply(convert_to_ids)

In [50]:
nrs[dates]

14    04/12/2020
dtype: object

In [51]:
# the rest are incoterms
incoterms = ~quantities & ~units & ~dates
incoterms[0:5]

0    False
1     True
2    False
3    False
4     True
dtype: bool

In [52]:
quantities.sum() + units.sum() + dates.sum() + incoterms.sum(), nrs.shape[0]

(156, 156)

In [53]:
nrs[quantities].tolist()[0:5]

['997', '794', '56', '796', '1461']

In [54]:
dictionary = {}

def append_to_dict(key, value):
    dictionary[key] = value


nrs[quantities].apply(append_to_dict, args=('<QUANTITY>',))
nrs[units].apply(append_to_dict, args=('<UNITPRICEAMOUNT>',))
nrs[dates].apply(append_to_dict, args=('<DATE>',))
nrs[incoterms].apply(append_to_dict, args=('<INCOTERMS>',));

In [55]:
dictionary

{'997': '<QUANTITY>',
 '794': '<QUANTITY>',
 '56': '<QUANTITY>',
 '796': '<QUANTITY>',
 '1461': '<QUANTITY>',
 '23': '<QUANTITY>',
 '518': '<QUANTITY>',
 '00': '<QUANTITY>',
 '2000': '<QUANTITY>',
 '06': '<QUANTITY>',
 '010131': '<QUANTITY>',
 '16': '<QUANTITY>',
 '374340000': '<QUANTITY>',
 '45': '<QUANTITY>',
 '3358': '<QUANTITY>',
 '392860': '<QUANTITY>',
 '26': '<QUANTITY>',
 '25': '<QUANTITY>',
 '2014': '<QUANTITY>',
 '60': '<QUANTITY>',
 '85': '<QUANTITY>',
 '2050': '<QUANTITY>',
 '506': '<QUANTITY>',
 '112': '<QUANTITY>',
 '115168': '<QUANTITY>',
 '00122': '<QUANTITY>',
 '788': '<QUANTITY>',
 '173': '<QUANTITY>',
 '555': '<QUANTITY>',
 '515': '<QUANTITY>',
 '6300': '<QUANTITY>',
 '787': '<QUANTITY>',
 '446': '<QUANTITY>',
 '2010': '<QUANTITY>',
 '1123': '<QUANTITY>',
 '396': '<QUANTITY>',
 '3000': '<QUANTITY>',
 '5800': '<QUANTITY>',
 '100': '<QUANTITY>',
 '1801008': '<QUANTITY>',
 '2017': '<QUANTITY>',
 '985': '<QUANTITY>',
 '000': '<QUANTITY>',
 '13032014': '<QUANTITY>',
 '508

In [56]:
# now we need to map through the dataset and change the values.

def mapping(word):
    if word in dictionary.keys():
        return dictionary[word]
    return word

df['new_mappings'] = df['words'].apply(mapping)

In [57]:
df[(df['words'] != df['new_mappings'])]

Unnamed: 0,words,sentence #,tag,new_mappings
15,22,0,O,<QUANTITY>
17,2017,0,O,<QUANTITY>
34,22,1,O,<QUANTITY>
36,2017,1,O,<QUANTITY>
52,22,2,O,<QUANTITY>
...,...,...,...,...
1478,788,39,O,<QUANTITY>
1479,790,39,O,<QUANTITY>
1480,792,39,O,<QUANTITY>
1482,17/07/17,39,O,<INCOTERMS>


In [58]:
del df['words']

In [59]:
df = df.rename(columns={'new_mappings':'words'})
df

Unnamed: 0,sentence #,tag,words
0,0,O,BRIDGESTONE
1,0,O,BRAND
2,0,GoodsDescription,TIRE(S)
3,0,GoodsDescription,AND
4,0,GoodsDescription,O-RING(S)
...,...,...,...
1483,39,Incoterms,HONG
1484,39,Incoterms,KONG
1485,39,O,ORDER
1486,39,O,NF


In [60]:
cols = ['words', 'sentence #', 'tag']
df[cols].to_csv('../../data/dataset_from_json_smaller_vocab_v2.csv')