In [1]:
import pandas
import numpy
from pymongo import MongoClient
from nltk.tokenize import TweetTokenizer
from itertools import permutations
import time

In [2]:
client = MongoClient("localhost", 27017)
job_table = client["job-search-database"]["jobs"]

In [3]:
import re
link_pattern = re.compile('<a href[^<]+</a>')
noise_pattern = re.compile('^\W$')

In [4]:
end_of_sentence = '<EOS>'

In [5]:
def tokenize(job):
    job = job.lower()
    job = link_pattern.sub('', job)
    job = (
        job.replace('<p>', '')
        .replace('&#x27;', "'")
        .replace('&quot;', '"')
        .replace('|', '')
    )
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(job)
    tokens = [ t if t != '.' else end_of_sentence for t in tokens ] # For synthetic data shuffling
    tokens = [ t for t in tokens if should_keep_token(t) ]
    return tokens

In [6]:
def should_keep_token(token):
    return (
        re.match(noise_pattern, token) == None and
        not token.startswith('@') and
        not token.startswith('#')
    )

In [7]:
def process(data):
    data.drop(['_id', 'by', 'id', 'parent', 'date'], axis=1, inplace=True)
    data = data[data['text'].isnull() == False]
    data = data[data['preferred'].isnull() == False]
    data['preferred'] = data['preferred'].map(lambda x: 1 if x else 0)
    data['tokens'] = data['text'].map(tokenize)
    data = data[data.tokens != 'NC']
    data = data[data.tokens.apply(lambda x: len(x) > 20)]
    data.reset_index(inplace=True)
    data.drop('index', axis=1, inplace=True)
    return data

In [8]:
preferred = pandas.DataFrame(job_table.find({ '$and': [{ 'preferred': { '$exists': True } }, { 'text': { '$exists': True } }] }))

In [9]:
processed = process(preferred)

In [10]:
samples = processed[processed['preferred'] == 1]['tokens']

In [11]:
samples.reset_index(drop=True, inplace=True)

In [12]:
def find_split(i, available_splits, total_splits):
    return ((i + 1) * max((available_splits // total_splits), 1)) - 1

In [25]:
def group_sentences(tokens, max_splits = 2):
    av_splits = []
    for i in range(len(tokens)):
        if tokens[i] == end_of_sentence and i != 0 and i != (len(tokens) - 1):
            av_splits.append(i)

    collection = []
    total_splits = min(max_splits, len(av_splits))
    calc_split = lambda x: find_split(x, len(av_splits), total_splits)

    if total_splits == 0:
        collection.append(tokens)
    else:
        for i in range(total_splits + 1): # total groups is 1 more than total splits
            if i == 0:
                collection.append(tokens[:av_splits[calc_split(i)]])
            elif i == total_splits:
                collection.append(tokens[av_splits[calc_split(i - 1)]:])
            else:
                collection.append(tokens[av_splits[calc_split(i - 1)]: av_splits[calc_split(i)]])

    return collection
        

In [14]:
def flatten(arr):
    return [ x for sub in arr for x in sub ]

In [15]:
def time_perm(n):
    start = time.time()
    perms = list(permutations(grouped[n]))
    print(f"{n} took {time.time() - start} sec")
    return perms

In [26]:
group_sentences(samples[0])

found [27, 54, 115]
taking from 27 for i of 0
taking from 54 for i of 1
taking from 115 for i of 2


[['md.ai',
  'software',
  'engineer',
  'full-time',
  'new',
  'york',
  'ny',
  'seattle',
  'wa',
  'onsite',
  'or',
  'remote',
  'usa',
  'only',
  'we',
  'are',
  'a',
  'medical',
  'ai',
  'development',
  'platform',
  'currently',
  'focused',
  'on',
  'radiology',
  'pathology',
  'dermatology'],
 ['<EOS>',
  'we',
  'help',
  'build',
  'high-quality',
  'labeled',
  'datasets',
  'for',
  'both',
  'training',
  'and',
  'clinical',
  'validation',
  'as',
  'well',
  'as',
  'provide',
  'tools',
  'and',
  'infrastructure',
  'for',
  'deploying',
  'and',
  'running',
  'models',
  'at',
  'scale'],
 ['<EOS>',
  'some',
  'of',
  'our',
  'unique',
  'challenges',
  'include',
  'operating',
  'in',
  'hipaa-compliant',
  'environments',
  'working',
  'with',
  'large',
  'medical',
  'imaging',
  'text',
  'genomic',
  'datasets',
  'managing',
  'machine',
  'learning',
  'model',
  'lifecycles',
  'and',
  'building',
  'complex',
  'web',
  'applications',
  'w

In [50]:
grouped = [ group_sentences(tokens) for tokens in samples ]

In [58]:
p = list(permutations(grouped[0]))
len(p)

6

In [66]:
expanded = []
for group in grouped:
    for x in permutations(group):
        my_groups.append(flatten(x))

In [67]:
len(my_groups)

370

In [59]:
o = flatten(p)

In [60]:
len(o)

18

In [18]:
expanded = []
for i in range(len(grouped)):
    expanded.append()

In [25]:
max_len = 0
j = -1
for i in range(34):
    if len(grouped[i]) > max:
        max_len = len(grouped[i])
        j = i
max, j

(10, 24)

In [28]:
time_perm(0)

0 took 5.4836273193359375e-06 sec


[(['md.ai',
   'software',
   'engineer',
   'full-time',
   'new',
   'york',
   'ny',
   'seattle',
   'wa',
   'onsite',
   'or',
   'remote',
   'usa',
   'only',
   'we',
   'are',
   'a',
   'medical',
   'ai',
   'development',
   'platform',
   'currently',
   'focused',
   'on',
   'radiology',
   'pathology',
   'dermatology'],
  ['<EOS>',
   'some',
   'of',
   'our',
   'unique',
   'challenges',
   'include',
   'operating',
   'in',
   'hipaa-compliant',
   'environments',
   'working',
   'with',
   'large',
   'medical',
   'imaging',
   'text',
   'genomic',
   'datasets',
   'managing',
   'machine',
   'learning',
   'model',
   'lifecycles',
   'and',
   'building',
   'complex',
   'web',
   'applications',
   'with',
   'ui',
   'ux',
   'appealing',
   'to',
   'both',
   'doctors',
   'and',
   'engineers',
   'alike.we',
   'are',
   'currently',
   'looking',
   'for',
   'front-end',
   'developers',
   'react',
   'graphql',
   'and',
   'software',
   'engi

In [38]:
synthesized = flatten([ list(permutations(group)) for group in grouped ])

In [21]:
l = [ list(permutations(g)) for g in grouped ]

In [2]:
flatten

NameError: name 'flatten' is not defined

In [106]:
samples

0     [md.ai, software, engineer, full-time, new, yo...
1     [software, engineer, remote, us, canada, full,...
2     [monadical.com, senior, full-stack, engineer, ...
3     [revolut, software, engineers, and, many, more...
4     [proteinqure, computational, drug, design, sen...
                            ...                        
72    [genesis, therapeutics, south, san, francisco,...
73    [rally, health, multiple, openings, back-end, ...
74    [archerdx, boulder, colorado, software, engine...
75    [national, robotics, engineering, center, soft...
76    [10x, genomics, www.10xgenomics.com, pleasanto...
Name: tokens, Length: 77, dtype: object

In [130]:
len([ x for x in samples if sentence_key in x ])

75

In [96]:
removed = [ x[125:] for x in samples if len(x) >= 125]

In [98]:
len(removed)

50

In [5]:
removed[1]

NameError: name 'removed' is not defined

In [30]:
p = list(permutations([["0", "1"],["0", "2"],["2","2"]]))

In [34]:
list(p[0])

[['0', '1'], ['0', '2'], ['2', '2']]

In [35]:
[ x for sub in p[0] for x in sub ]

['0', '1', '0', '2', '2', '2']