# Post Output Culling

## Imports and Functions

In [0]:
import pandas as pd
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [0]:
# Function to make the output more readable
def print_line(line):
  while len(str(line)) > 90:
    print(line[:90])
    line = line[90:]
  print(line,'\n\n')

In [0]:
def cleanup(desc):
  '''Cleans output from gpt-2, start/end of description markers left in'''
  desc = desc.replace('<|startoftext|>','').replace('<|endoftext|>\n','')
  return desc.strip()

In [0]:
def word_freq(desc):
  '''Return frequency of unique words not in stopwords'''
  desc = str(desc).lower()
  full_len = len(desc.split())
  stopwords = ['founded by', 'is based in', 'was founded in', 
                'is headquartered in', 'headquarters in', 'developed by',
                'developed in', 'additional offices', 'germany',
                'france', 'china', 'california', 'india', 'wholly-owned'
                'silicon valley', 'san francisco', 'established in',
                'mountain view', 'family owned', 'family-owned', 
                'clients include', 'argentina', 'brazil', 'chile', 'colombia', 
                'japan', 'korea', 'malaysia', 'mexico', 'subsidiary',
                'formerly known as', 'venture capital', 'for more information']
  
  # Drop words that encode garbage info, thus penalizing on the frequency score
  for phrase in stopwords:
    desc = desc.replace(phrase,'')
  
  split_desc = str(desc).split()
  
  # Calculate unique word frequency, return 0 if description is too small
  if ((len(split_desc) < 10) | (long_word(split_desc))):
    return 0
  return len(set(split_desc)) / full_len

In [0]:
def entity_freq(text, nlp):
  '''Return frequency of low value entities'''
  doc = nlp(text)
  count = 0
  
  # Use Spacy to pull find entities and count them
  for X in doc.ents:
    if (X.label_ in ['ORG', 'DATE', 'PERSON', 'TIME', 'PERCENT', 'MONEY']):
      count += 1
      
  return count/len(text.split())

In [0]:
def long_word(desc, length=20):
  '''Return True if description contains words too long'''
  for word in desc:
    if len(word) >= length:
      return True
  
  return False

## Culling

In [0]:
# Read in the generated descriptions
df = pd.read_csv('https://raw.githubusercontent.com/labs15-pain-point/Data-Science/master/generated/log(6).csv')
df = df.drop('Unnamed: 0', axis=1)
df['descriptions'] = df['descriptions'].apply(cleanup)
df = df[df['descriptions'] != '']

In [146]:
# Begin culling and formatting the dataframe
start_sum = df['descriptions'].count()
print('Descriptions before any drops:', start_sum)

df['word_freq'] = df['descriptions'].apply(word_freq)
df = df[df['word_freq'] >.6]
df['ent_freq'] = [entity_freq(desc, nlp) for desc in df['descriptions']]

word_freq_cond = ((df['word_freq'] > .75) & (df['word_freq'] < .925))
ent_freq_cond = (df['ent_freq'] < .04)
df = df[word_freq_cond & ent_freq_cond].reset_index().drop('index', axis=1)

Descriptions before any drops: 28119


In [150]:
print('Descriptions after frequency windows:', df['descriptions'].count())
print('% Kept from Batch:', df['descriptions'].count()/start_sum)

Descriptions after frequency windows: 3303
% Kept from Batch: 0.11746505921263203


In [152]:
df.describe()

Unnamed: 0,word_freq,ent_freq
count,3303.0,3303.0
mean,0.816368,0.017012
std,0.046428,0.013921
min,0.751724,0.0
25%,0.777778,0.0
50%,0.807018,0.019608
75%,0.85,0.029199
max,0.923077,0.039604


In [149]:
# Look at all descriptions
for i in range(0,len(df)):
  print('Word Frequency:', df['word_freq'][i])
  print('Entity Frequency:', df['ent_freq'][i])
  print_line(df['descriptions'][i])

Word Frequency: 0.7959183673469388
Entity Frequency: 0.02040816326530612
Omise is a modern software company that offers a suite of guest analytics and compliance t
ools for hotels, industry leaders in the areas of guest engagement, guest satisfaction, an
d hotel productivity.  Omise's mission is to help improve the guest experience by enabling
 smarter guest decisions throughout the guest-centric economy. 


Word Frequency: 0.7777777777777778
Entity Frequency: 0.0
Dealroom.co is a SaaS platform that connects developers with businesses. It is revolutioni
zing how companies manage their digital operations. The platform helps companies to organi
ze, manage, and hire their teams. It also enables companies to easily hire, let, and manag
e additional employees through their apps. 


Word Frequency: 0.9090909090909091
Entity Frequency: 0.0
Riiid develop a digital platform of care that enables individuals and organisations to con
nect, communicate, and engage in commerce.  Its technology offer

# Exploration

In [0]:
def discipline_freq(desc, bow):
  import string
  
  desc = desc.translate(str.maketrans('', '', string.punctuation)).lower()
  desc = desc.split(' ')
  
  count = 0
  for word in desc:
    if word in bow:
      count += 1
      
  return count/len(desc)

## Nearest Neighbors Distance

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

real_desc = pd.read_csv('big_boy_df.csv')
real_desc = real_desc[real_desc['Full Description'].notnull()]['Full Description']

# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Create a vocabulary and get word counts per document
dtm = tfidf.fit_transform(real_desc)

# Instantiate the model
nn = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')

# Fit on TF-IDF Vectors
nn.fit(dtm.todense())

NearestNeighbors(algorithm='ball_tree', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=2, p=2,
                 radius=1.0)

In [0]:
def get_distance(desc, nn_model):
  new = tfidf.transform([desc])
  neighborhood = nn.kneighbors(new.todense())
  return neighborhood[0][0][0]

In [33]:
df['dist_to_real'] = [get_distance(desc, nn) for desc in df['descriptions']]

KeyboardInterrupt: ignored

In [0]:
df.head()

In [0]:
df.describe()

## Cosine Similarity

In [0]:
# Scikit Learn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

df_real = pd.read_csv('https://raw.githubusercontent.com/labs15-pain-point/Data-Science/master/crunchbase_csv/companies-8-25-2019-103.csv')
df_real = df_real['Full Description']
df_real = df_real[df_real.notnull()]
real_size = df_real.shape[0]
df_real = df_real.append(df_slim['descriptions'])

# Create the Document Term Matrix
count_vectorizer = CountVectorizer(stop_words='english')
count_vectorizer = CountVectorizer()

sparse = count_vectorizer.fit_transform(df_real)

cs_df = pd.DataFrame(sparse.todense())
cs = cosine_similarity(cs_df, cs_df)

(360,) (7417, 3)
(7777,)


In [0]:
cs = pd.DataFrame(cs)
df_slim['cos_avg'] = [(cs[i][0:real_size].sum()-1)/real_size 
                      for i in range(real_size, len(cs))]

In [0]:
df_slim.describe()

Unnamed: 0,index,freq,cos_avg
count,7417.0,7417.0,7417.0
mean,16855.713092,0.815565,0.232656
std,9686.573436,0.046334,0.054956
min,0.0,0.751678,-0.000858
25%,8454.0,0.776316,0.200365
50%,16759.0,0.80597,0.238121
75%,25178.0,0.848485,0.272422
max,33679.0,0.924528,0.35964


## Real Descriptions

Are the culling techniques dropping too much? What frequency score do real descriptions get? Etc.

In [0]:
import pandas as pd

In [0]:
with open('gpt2_gentext_20190823_223000.txt', 'r') as in_file:
  desc = [line for line in in_file]
df = pd.DataFrame({'descriptions':desc})
df.to_csv('log.csv')

In [0]:
df = pd.read_csv('log.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,descriptions
0,0,""" Robotics is working with leading organizatio..."
1,1,"12858,""REVIVIA suggests how cities build safe ..."
2,2,"12859,""Res434 builds software solutions that c..."
3,3,"12860,Providing a SaaS, Cloud computing suite ..."
4,4,"12861,""REMO is a high-growth iOS and Android d..."


In [0]:
df['word_freq'] = df['Full Description'].apply(word_freq)

In [0]:
df.describe()

Unnamed: 0,word_freq
count,1000.0
mean,0.752763
std,0.157793
min,0.0
25%,0.703704
50%,0.764257
75%,0.833333
max,1.0


In [0]:
for desc in df[df['word_freq'] < .70]['Full Description']:
  print_line(desc)

The mission of Meituan Dianping is “We help people eat better, live better”. As China's le
ading e-commerce platform for services, Meituan operates well-known mobile apps in China, 
including Meituan, Dianping, Meituan Waimai, Meituan Dache, Mobike and others. Meituan off
ers over 200 service categories, including catering, on-demand delivery, car-hailing, bike
-sharing, hotel and travel booking, movie ticketing, and other entertainment and lifestyle
 services, and covers 2800 cities and counties across China. The total transaction amount 
of Meituan reached RMB 5156.4 billion in the end of 2018, with an increase of 44.3% over t
he same period of last year. The total annual numbers of transaction users and active onli
ne merchants of Meituan reached 400 million and 5.8 million respectively in the past 12 mo
nths as of December 31, 2018. Meituan Dianping (stock code: 3690.HK) was officially listed
 on the Main Board of The Stock Exchange of Hong Kong Limited(HKEX) on September 20, 2018.