# Post Output Culling

## Imports and Functions

In [2]:
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [3]:
def get_csv(csv_path):
  '''Pull in CSV of generated descriptions and perform minor cleanup.

  Only rows whose description starts with '<|startoftext|>' and ends with
  '<|endoftext|>' followed by a newline are kept. The markers are stripped
  with cleanup(), and rows that end up empty are dropped.

  Args:
    csv_path: path or URL handed to pandas.read_csv. The file must have a
      'descriptions' column.

  Returns:
    DataFrame of cleaned, non-empty descriptions with a fresh 0..n-1 index
    and without the 'Unnamed: 0' column (if it was present).
  '''
  df = pd.read_csv(csv_path)

  descriptions = df['descriptions']
  # na=False: a missing description counts as "incomplete" instead of
  # producing NaN, which cannot be used as a boolean mask.
  complete = (descriptions.str.startswith('<|startoftext|>', na=False) &
              descriptions.str.endswith('<|endoftext|>\n', na=False))

  # .copy() so the assignment below writes to an independent frame rather
  # than to a filtered slice of the raw data.
  df = df[complete].copy()
  df['descriptions'] = df['descriptions'].apply(cleanup)

  # Presumably a saved index column; tolerate CSVs that do not have it.
  df = df.drop('Unnamed: 0', axis=1, errors='ignore')

  # Filter empties before re-indexing so the returned index has no gaps.
  df = df[df['descriptions'] != '']
  return df.reset_index(drop=True)

In [4]:
def cleanup(desc):
  '''Strip the gpt-2 start/end of description markers and outer whitespace'''
  # The end marker is only removed together with its trailing newline.
  for marker in ('<|startoftext|>', '<|endoftext|>\n'):
    desc = desc.replace(marker, '')

  return desc.strip()

In [5]:
def long_word(desc, length=20):
  '''Return True if description contains a word of at least `length` characters.

  Args:
    desc: iterable of words (e.g. a list of tokens), or a plain string,
      which is split on whitespace first.
    length: minimum number of characters for a word to count as too long.

  Returns:
    True if any word satisfies len(word) >= length, otherwise False.
  '''
  # Iterating a str yields single characters, so a raw description would
  # effectively never be flagged; break it into words first.
  if isinstance(desc, str):
    desc = desc.split()

  return any(len(word) >= length for word in desc)

In [6]:
def remove_stop_words(desc, nlp):
  '''Remove default spacy stopwords and punctuation from a description.

  Args:
    desc: description text to tokenize.
    nlp: callable (e.g. a loaded spaCy pipeline) that turns text into an
      iterable of tokens exposing `text`, `is_stop` and `is_punct`.

  Returns:
    List of token texts, in document order, that are neither stop words
    nor punctuation.
  '''
  # Filtering relies on each token's own is_stop / is_punct flags, so no
  # separate stop-word list has to be looked up here.
  return [token.text
          for token in nlp(desc)
          if not (token.is_stop or token.is_punct)]

In [7]:
def remove_stop_phrases(desc):
  '''Remove phrases that encode little to no value in generated descriptions.

  Matching is a plain, case-sensitive substring replacement against
  lower-case phrases, so callers should lower-case the text first.

  Args:
    desc: description text.

  Returns:
    The text with every occurrence of each stop phrase deleted.
  '''
  # Every entry needs its own trailing comma: adjacent string literals are
  # silently concatenated into a single phrase that never matches.
  # 'u.s.a' is listed before 'u.s.' so the longer phrase is removed whole
  # instead of leaving a stray 'a' behind.
  stop_phrases = ('founded by', 'is based in', 'was founded in',
                  'is headquartered in', 'headquarters in', 'developed by',
                  'developed in', 'additional offices', 'germany',
                  'france', 'china', 'california', 'india', 'wholly-owned',
                  'silicon valley', 'san francisco', 'established in',
                  'mountain view', 'family owned', 'family-owned',
                  'clients include', 'argentina', 'brazil', 'chile', 'colombia',
                  'japan', 'korea', 'malaysia', 'mexico', 'subsidiary',
                  'formerly known as', 'venture capital', 'for more information',
                  'new york', 'united states', 'u.s.a', 'u.s.')

  for phrase in stop_phrases:
    desc = desc.replace(phrase, '')

  return desc

In [8]:
def word_freq(desc, nlp):
  '''Ratio of unique non-stop tokens to the description's total word count.

  Stop phrases and stop words/punctuation are stripped from the lower-cased
  text first. Returns 0 when fewer than 10 tokens remain or when any
  remaining token is too long.
  '''
  text = str(desc)
  total_words = len(text.split())

  tokens = remove_stop_words(remove_stop_phrases(text.lower()), nlp)

  # Too little content, or a suspiciously long token: score it as 0
  too_short = len(tokens) < 10
  if too_short or long_word(tokens):
    return 0

  return len(set(tokens)) / total_words

In [9]:
def reduce_by_word_freq(df):
  '''Keep rows whose word_freq lies strictly between the median and median + one std'''
  freq = df['word_freq']
  floor = freq.median()
  ceiling = floor + freq.std()

  in_range = freq.gt(floor) & freq.lt(ceiling)
  return df[in_range]

In [12]:
def entity_freq(text, nlp,
                labels=('ORG', 'DATE', 'PERSON', 'TIME', 'PERCENT', 'MONEY')):
  '''Return frequency of low value entities.

  Args:
    text: description text.
    nlp: callable (e.g. a loaded spaCy pipeline) returning an object whose
      `ents` are entities exposing `label_`.
    labels: entity labels that count as low value.

  Returns:
    Number of entities with a label in `labels` divided by the number of
    whitespace-separated words in `text`; 0.0 when `text` has no words.
  '''
  word_count = len(text.split())

  # Empty or whitespace-only text has nothing to count and would otherwise
  # divide by zero.
  if word_count == 0:
    return 0.0

  # Use SpaCy to find entities and count low value ones
  low_value = sum(1 for ent in nlp(text).ents if ent.label_ in labels)

  return low_value / word_count

In [13]:
# Size of the final sample, and a fixed seed so that re-running the
# notebook draws the same rows.
SAMPLE_SIZE = 100
SEED = 42

# Work on independent copies so the column assignments below never write
# into a filtered view of another frame.
df = get_csv('https://raw.githubusercontent.com/labs15-pain-point/Data-Science/master/generated/log(6).csv').copy()

df['word_freq'] = [word_freq(desc, nlp) for desc in df['descriptions']]
df = reduce_by_word_freq(df).copy()

df['ent_freq'] = [entity_freq(desc, nlp) for desc in df['descriptions']]
df = df[df['ent_freq'] < df['ent_freq'].median()]

# Cap the sample at the number of surviving rows so a small input does not
# raise.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SEED).reset_index(drop=True)

In [14]:
# Preview the first five rows of the filtered, sampled descriptions
df.head()

Unnamed: 0,descriptions,word_freq,ent_freq
0,"Reebee is a fast-casual, grab-and-go, family-o...",0.527559,0.031496
1,"Ocunexus Therapeutics, Inc., is a biopharmaceu...",0.586207,0.034483
2,RiminiActive Ltd. provides cloud-based softwar...,0.515152,0.030303
3,Omise is the leading digital-media business pl...,0.564516,0.0
4,Rockets of Awesome is dedicated to helping cre...,0.527273,0.027273
