In [113]:
import pandas as pd
import re

In [114]:
df = pd.read_csv('../../data/dataset_from_json_v2.csv')
df.head()

Unnamed: 0,words,sentence #,tag
0,BRIDGESTONE,0,O
1,BRAND,0,O
2,TIRE(S),0,GoodsDescription
3,AND,0,GoodsDescription
4,O-RING(S),0,GoodsDescription


In [115]:
df.iloc[115,:]

words         GREY
sentence #       5
tag              O
Name: 115, dtype: object

In [116]:
combine = lambda x: ' '.join(x['words'])

In [117]:
words = df.groupby(df['sentence #']).apply(combine)
words

sentence #
0     BRIDGESTONE BRAND TIRE(S) AND O-RING(S) AS PER...
1     BRIDGESTONE BRAND TIRE(S) AND O-RING(S) AS PER...
2     BRIDGESTONE BRAND TIRE(S) AND O-RING(S). AS PE...
3     LEATHER MEN S AND/OR LEATHER LADIES GARMENTS U...
4     LEATHER MEN S AND/OR LEATHER LADIES GARMENTS U...
5     QUANTITY 90000 M(+/-10 O/O) DESCRIPTION POLIES...
6     QUANTITY 90000 M(+/-10 O/O) DESCRIPTION POLIES...
7     TOYS AS PER PROFORMA INVOICE NO TSO1801008 ACC...
8     TOYS AS PER PROFORMA INVOICE NO TSO1801008 ACC...
9         1123 Boxes of Tennis Sneakers marked Djoko 23
10    COPPER CLAD LAMINATE AS PER PROFORMA INVOICE N...
11    MULTIPURPOSE CLOTH 38X38CM 392860 PINK 141 00P...
12    PRODUCT HYDROUS ETHANOL QUANTITY 2 500000 METR...
13    COMMODITY CUMENE QUANTITY 1500MT PRICE TERM US...
14    FOB DJIBOUTI / ETHIOPIA PORT QTY 363 MT (15 X ...
15    GARMENTS AND/OR ACCESSORIES (VESTUARIO E/OU AC...
16    TELECOMMUNICATION EQUIPMENT AS PER PROFORMA IN...
17    (PLUS OR MINUS 5 PCT) 500 MT OF

In [118]:
# example
words[5]

'QUANTITY 90000 M(+/-10 O/O) DESCRIPTION POLIESTER SATIN FABRIC CODE 42250 GREY A7 GRADE PO N07929/10 CIF INCOTERMS 2010'

In [119]:
find_special = lambda x: '58/60' in x #'CFR KARACHI PORT/ PORT QASIM TOTAL QUANTITY 21 MT PARAFFIN WAX 58/60 FULLY REFINED AT USD 950.00 PER M/TON AS PER INDENT NO. SBR/DEPT.C/581/2019 DATED 19.11.19 OD M/S S.B.R. AND CO. KARACHI PAKISTAN'
find_special = lambda x: '950.00' in x 

specials = words[words.apply(find_special)]
specials

Series([], dtype: object)

In [120]:
# we need \S instead of \w because we have dates: 31.10.2019 which need to be extracted as a single entity

# EXAMPLE
# extract_containing_numbers = lambda x: re.findall(r'\d+\S+', x)
# extract_containing_numbers
# words.apply(extract_containing_numbers).tolist()

In [121]:
#extract the NUMBERS, INCOTERMS and DATES for better tokenisation
extract_numbers = lambda x: re.findall(r'\d+\S+', x)
list_words = words.apply(extract_numbers).tolist()
flat_list = [el for l in list_words for el in l]

In [122]:
words
words.apply(extract_numbers).tolist()
len(flat_list)

235

In [123]:
numbers = set(flat_list)
numbers

# we now need to remove ONLY the . at the end
numbers = [n[:-1] if n[-1] == '.' else n for n in list(numbers)]

Now we need to classify them in 3 categories: 
1. \<number\> for Quantity and UnitPriceAmount example 2000
2. \<ID\> for INCOTERMS example 2019H29Q2
3. \<DATE\> for improved tokenisation

Why? We have a new date everyday, and if we include every day as another date in the vocabulary we might think eroneously that we have a large vocabulary, when in fact all we have is a lot of numbers, ids and dates.

In [124]:
nrs = pd.Series(numbers) #convert to Series for ease of use
nrs.head(), nrs.shape

(0       56
 1    00M/T
 2     3000
 3     1461
 4    1N046
 dtype: object,
 (156,))

I am speculating that

1. Quantity contains only Integer values: 1000, 200, 7000
2. UnitPriceAmount contains Float values: 250.00, 293.11
3. Dates have a specific type: xx/xx/xxxx or xx-xx-xxxx
4. Everything else is Incoterms

In [125]:
# Quantity
def convert_to_int(x):    
    try:
        int(x)
        return True
    except ValueError:
        return False

quantities = nrs.apply(convert_to_int)

In [126]:
# UnitPriceAmount
def convert_to_float(x):    
    try:
        if not convert_to_int(x):
            float(x)
        else:
            return False

        return True
    except ValueError:
        return False

units = nrs.apply(convert_to_float)

In [127]:
# Dates
def convert_to_ids(x):    
    try:
        is_date = re.findall('^\d{2}(\.|\/)\d{2}(\.|\/)\d{4}$', x)
        if len(is_date) == 0:
            return False
        else:
            return True
    except ValueError:
        return False

dates = nrs.apply(convert_to_ids)

In [128]:
nrs[dates]

76    04/12/2020
dtype: object

In [129]:
# the rest are incoterms
incoterms = ~quantities & ~units & ~dates
incoterms

0      False
1       True
2      False
3      False
4       True
       ...  
151    False
152     True
153    False
154    False
155    False
Length: 156, dtype: bool

In [130]:
quantities.sum() + units.sum() + dates.sum() + incoterms.sum(), nrs.shape[0]

(156, 156)

In [131]:
nrs[quantities].tolist()

['56',
 '3000',
 '1461',
 '281819',
 '80',
 '3450',
 '26',
 '141613456',
 '517',
 '392860',
 '15',
 '20',
 '4850',
 '18',
 '446',
 '115168',
 '60',
 '514',
 '67',
 '505',
 '500',
 '00',
 '2014',
 '110164',
 '85',
 '787',
 '141',
 '4571',
 '363',
 '200000',
 '04',
 '13032014',
 '3358',
 '65537',
 '90000',
 '201',
 '790',
 '96',
 '796',
 '6300',
 '42250',
 '508',
 '2807',
 '11012014',
 '201801012',
 '000',
 '997',
 '510',
 '9850',
 '5800',
 '16',
 '2050',
 '985',
 '173',
 '788',
 '500000',
 '100',
 '45',
 '1345',
 '392880',
 '515',
 '1123',
 '374340000',
 '2000',
 '202',
 '99589',
 '792',
 '396',
 '2021',
 '794',
 '10000103',
 '392890',
 '112',
 '06',
 '506',
 '22',
 '25',
 '10',
 '200',
 '010131',
 '141613462',
 '50',
 '288',
 '1801008',
 '23',
 '555',
 '00122',
 '518',
 '2017',
 '2010']

In [132]:
dictionary = {}

def append_to_dict(key, value):
    dictionary[key] = value


nrs[quantities].apply(append_to_dict, args=('<QUANTITY>',))
nrs[units].apply(append_to_dict, args=('<UNITPRICEAMOUNT>',))
nrs[dates].apply(append_to_dict, args=('<DATE>',))
nrs[incoterms].apply(append_to_dict, args=('<INCOTERMS>',))

1      None
4      None
7      None
8      None
10     None
       ... 
139    None
144    None
146    None
149    None
152    None
Length: 65, dtype: object

In [133]:
# now we need to map through the dataset and change the values.

def mapping(word):
    if word in dictionary.keys():
        return dictionary[word]
    return word

df['new_mappings'] = df['words'].apply(mapping)

In [134]:
df[(df['words'] != df['new_mappings'])]

Unnamed: 0,words,sentence #,tag,new_mappings
15,22,0,O,<QUANTITY>
17,2017,0,O,<QUANTITY>
34,22,1,O,<QUANTITY>
36,2017,1,O,<QUANTITY>
52,22,2,O,<QUANTITY>
...,...,...,...,...
1478,788,39,O,<QUANTITY>
1479,790,39,O,<QUANTITY>
1480,792,39,O,<QUANTITY>
1482,17/07/17,39,O,<INCOTERMS>


In [135]:
del df['words']

In [136]:
df = df.rename(columns={'new_mappings':'words'})
df

Unnamed: 0,sentence #,tag,words
0,0,O,BRIDGESTONE
1,0,O,BRAND
2,0,GoodsDescription,TIRE(S)
3,0,GoodsDescription,AND
4,0,GoodsDescription,O-RING(S)
...,...,...,...
1483,39,Incoterms,HONG
1484,39,Incoterms,KONG
1485,39,O,ORDER
1486,39,O,NF


In [137]:
cols = ['words', 'sentence #', 'tag']
df[cols].to_csv('../../data/dataset_from_json_smaller_vocab_v2.csv')