### Loading Jan Data & Packages

In [1]:
# Load packages
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import html5lib
import pandas as pd
import numpy as np
import base64
import datetime
from IPython.display import HTML
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('stopwords')
nltk.download('punkt')

!pip install unidecode
import unidecode

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load data
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cleaned_jds = pd.read_csv('/content/drive/My Drive/Data Science/Sentiment Analysis: Automating Search & Analysis/test run output/Jan 2020 from Indeed/Final Analysis files/jd_data_mined_2020-01-13_09.19.40.395290.csv')
print(cleaned_jds.shape, ": original shape of dataframe")

cleaned_jds = pd.DataFrame(cleaned_jds['text'].unique())
cleaned_jds.columns = ['text']
print(cleaned_jds.shape, ": shape of dataframe with only unique jds")

cleaned_jds.head(2)

(924, 2) : original shape of dataframe
(918, 1) : shape of dataframe with only unique jds


Unnamed: 0,text
0,caring for the world one person at a time has ...
1,facebook s mission is to give people the power...


### Overall Functions

In [0]:
# My dataminer:
def data_miner(url_root='',
               qs=[],
               tag='',
               tag_attribute={'':''}):
  
  print('Mining started at: ',
        str(datetime.datetime.now()),
        ' UTC time.')
  
  urls = []
  for q in qs:
    url = url_root+q
    urls.append(url)

  data_results = []
  for url in urls:
    r = urlopen(url)
    soup = BeautifulSoup(r, "html5lib")
    items_of_interest = soup.find_all(tag, tag_attribute)
    
    l = []
    for item_of_interest in items_of_interest:
      text = item_of_interest.get_text() # to avoid loading the tags
      text = str(text) # to force into a string if not one already
      l.append(text)
      
    data_results.append(l)
  
  print('Your data has '+str(len(data_results))+' entries.')
  print('Mining ended at: ',
        str(datetime.datetime.now()),
        ' UTC time.')
  
  return data_results

In [0]:
# A part of speech counter
def pos_counter(s = pd.Series(), part_of_speech = ''):
  print('N/B: This function should work with a Pandas series containing lists')
  pos_count = []
  for my_list in s:
    answer = my_list.count(part_of_speech)
    pos_count.append(answer)
  
  print('Resulting values are ', len(pos_count))
  new_s = pd.Series(pos_count)
  return new_s

In [0]:
# A word associator - connecting each item in a list of words to
# the words immediately preceding and following that item
unwanteds = ['  ', '   ', '    ', '     ', '      ', '       ', '        ',
             '         ', '          ', ' -- ', ' - ']
def word_associator(series=pd.Series(),
                    items=[],
                    unwanteds=unwanteds):
  # Remove any unwanted expressions
  # (this includes whitespace greater then 1 space but less than 11 spaces)
  for unwanted in unwanteds:
    series = series.str.replace(unwanted, ' ')
  
  # Get the word neighbours
  for item in items:
    pattern_a = ' ' + item + ' '
    pattern_b = '-' + item + '-'
    series = series.str.replace(pattern_a, pattern_b)
  
  return series

In [0]:
# Functions that combined, create a frequency distribution table
def token_creator(series, col_name='tokenized_text', df=pd.DataFrame()):
  tokenized_text = [word_tokenize(i) for i in series]
  df.loc[:, col_name] = tokenized_text
  return df

def word_frequency(df, col_name='tokenized_text',
                   allWords=[]): 
  for wordList in df[col_name]:
    allWords += wordList
  # Get their frequency
  frequency_distribution = nltk.FreqDist(allWords)
  # and save it as a frequency distribution table i.e. a df
  freq_df = pd.DataFrame.from_dict(frequency_distribution, orient='index')
  freq_df.reset_index(level=0, inplace=True)
  freq_df.columns = ['expression','frequency']
  freq_df = freq_df.sort_values(by='frequency', ascending=False)
  return freq_df


In [0]:
# phrase analyzer
def phrase_reporter(neighbour=pd.Series(), df=pd.DataFrame(),
                    unwanteds=unwanteds, pattern_a='', pattern_b=''):
  print('Warning: Restart raw data and function cells before running this instance of the function')
  neighbour
  neighbour = cleaned_jds['text'] # this should preferably have semi-clean text
  for unwanted in unwanteds:
    neighbour = neighbour.str.replace(unwanted, ' ')
  
  i = 0
  
  neighbour = neighbour.str.replace(pattern_a, pattern_b)
  
  tokens = token_creator(series=neighbour,
                         col_name='tokenized_words')
  
  freq_table = word_frequency(df=tokens,
                             col_name='tokenized_words')
  
  print('Top 5 expressions for: ', '"', pattern_a, '"')
  report = freq_table.loc[freq_table['expression'].str.contains(pattern_b)==True]
  print(report.head(5))
  print(' ')

  print('Total frequency: ', report.frequency.sum())
  print('Total number of expressions: ', report.expression.count())
  print('Minimum frequency: ', report.frequency.min())
  print('Maximum frequency: ', report.frequency.max())
  print(' ')

  average = report.frequency.sum()/report.expression.count()
  print('Therefore, the average expression of ', '"', pattern_a, '"',
        ' has a frequency of ', str(average), ';')
  
  median = report.frequency.median()
  condition = report.loc[report['frequency'] == median]
  print('some median expressions of ', '"', pattern_a, '"',
        ' are ', ' as shown below:')
  print(condition.head(5))
  print('(', '"', pattern_a, '"', ' had ', condition.shape[0],
        ' median expressions')
  print('-----END OF ANALYSIS-----')
  print(' ')
  print('---------------------------------------------------------------------')
  print(' ')
  return

In [0]:
# A data downloader
def create_download_link( df, title = "Download CSV file", filename = "data.csv"):
  csv = df.to_csv(index =True)
  b64 = base64.b64encode(csv.encode())
  payload = b64.decode()
  html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
  html = html.format(payload=payload,title=title,filename=filename)
  return HTML(html)

### Word Frequencies across all text

In [0]:
# We start by examining the frequency of all words
# Let's create a separate dataset examining this:
words = cleaned_jds['text']

In [0]:
# Next, we connect words with apostrophes to represent shortened words like:
shorts = ['m', 're', 's']
for short in shorts:
  short_a = ' ' + short + ' '
  short_b = '-' + short + ' '
  words = words.str.replace(short_a, short_b)

In [0]:
# And call on our functions to do the job for us!
word_tokens = token_creator(series=words, col_name='word_tokenized_text')
word_freq_table = word_frequency(df=word_tokens, col_name='word_tokenized_text')
word_freq_df = word_freq_table

In [0]:
# Let's test and see that shortened words are functional
test_copy = pd.DataFrame(word_freq_df['expression'])
test_copy.columns = ['word']
truth1 = test_copy['word'].str.contains("-m")
truth2 = test_copy['word'].str.contains("-re")
truth3 = test_copy['word'].str.contains("-s")
truths = [truth1, truth2, truth3]
i=1
for truth in truths:
  t = 'truth'+str(i)
  test_copy.loc[:, t] = truth
  print(test_copy.groupby(t)[t].count())
  print('')
  i+=1

truth1
False    15210
True        36
Name: truth1, dtype: int64

truth2
False    15229
True        17
Name: truth2, dtype: int64

truth3
False    14723
True       523
Name: truth3, dtype: int64



### Word Classification

In [0]:
# Our dataset of interest has the following shape:
word_freq_df = word_freq_df.reset_index(drop=True)
print(word_freq_df.shape)
word_freq_df.head(3)

(15246, 2)


Unnamed: 0,expression,frequency
0,and,30373
1,to,15382
2,the,14576


In [0]:
# The first 50 values of the 'expressions' series becomes a list of queries:
ndf = pd.DataFrame()
ndf = word_freq_df.head(50)
queries = ndf.expression.to_list()

print(len(queries), ' queries were selected')
percentile = len(queries)/word_freq_df.shape[0] * 100
print('These queries were in the top ', percentile, '%')

50  queries were selected
These queries were in the top  0.32795487340941887 %


In [0]:
# Next, mine the parts of speech for each word and save in a list
p_o_s = data_miner(url_root='https://dictionary.cambridge.org/dictionary/english/',
                   qs=queries,
                   tag='span',
                   tag_attribute={'class':'pos dpos'})

Mining started at:  2020-01-30 12:13:53.180656  UTC time.
Your data has 50 entries.
Mining ended at:  2020-01-30 12:14:50.959049  UTC time.


In [0]:
pos_df = pd.DataFrame(ndf['expression'])
pos_df.loc[:, 'Part of Speech'] = p_o_s
pos_df.head(3)

Unnamed: 0,expression,Part of Speech
0,and,"[conjunction, conjunction]"
1,to,"[preposition, adverb, preposition]"
2,the,"[determiner, definite article]"


In [0]:
# Count the number of occurrence of the 3 parts of speech that we wish to
# examine:
# 1. Noun
# 2. Verb
# 3. Pronoun
parts = ['noun', 'verb', 'pronoun']

for part in parts:
  string = part + '_count'
  pos_df.loc[:, string] = pos_counter(pos_df['Part of Speech'], part)

N/B: This function should work with a Pandas series containing lists
Resulting values are  50
N/B: This function should work with a Pandas series containing lists
Resulting values are  50
N/B: This function should work with a Pandas series containing lists
Resulting values are  50


In [0]:
# Now that we have classified the data, let us examine the first 50 results:
pos_df

Unnamed: 0,expression,Part of Speech,noun_count,verb_count,pronoun_count
0,and,"[conjunction, conjunction]",0,0,0
1,to,"[preposition, adverb, preposition]",0,0,0
2,the,"[determiner, definite article]",0,0,0
3,of,"[preposition, preposition]",0,0,0
4,in,"[preposition, adverb, adjective, noun, prefix,...",4,0,0
5,a,"[determiner, noun, preposition, noun]",2,0,0
6,with,"[preposition, preposition]",0,0,0
7,for,"[preposition, conjunction, preposition, conjun...",0,0,0
8,marketing,"[noun, noun, noun]",3,0,0
9,or,"[conjunction, noun, conjunction, noun, noun]",3,0,0


In [0]:
namey = 'pos_word_data_analysed_' + str(datetime.datetime.now()) + '.csv'
create_download_link(pos_df, filename=namey)

Therefore, the most frequent nouns out of the 50 most frequent words could be:
1.    'in.' - short form for inches / 'IN' - short form for the US state of Indiana

2.    'A' - short form for amperes / 'A' - the first letter of the English alphabet / 'A' - a musical note / 'A' - a test score

3.    'marketing'

4.    'OR' - short form for operating room / 'OR' - short form of the US state of Oregon / 'OR' - short form for Operations Research

5.    'research'

6.    'experience'

7.    'work'

8.    'will' - mental power / 'will' - an official document of what a person has decided should be done with their money and property after their death

9.    'team'

10.    'skills'

11.    'business'

12.    'ability'

13.    'data'

14.   'management'

15.   'I' - the ninth letter of the English alphabet / 'i' - the roman numeral of the number 1

16.   'the new' - new things

17.   'media'

18.   'development'

19.   'product'

20.   'support'

21.   'amp' - short form for amperes / 'amp' - an amplifier / 'amp' - the unicode representation of an ampersand

22.   'market'

23.   'job'

The most frequent verbs out of the 50 most frequent words could be:
1.   'research'

2.   'is'

3.   'experience'

4.   'work'

5.   'will'

6.   'be'

7.   'are'

8.   'team'

9.   'support'

10.  'have'

11.  'market'

12.  'to job' - to do work for different people without being employed by them permanently



And the most frequent pronouns out of the 50 most frequent words could be:
1.   'our'

2.   'you'

3.   'we'

4.   'that' - A [relative pronoun](https://dictionary.cambridge.org/grammar/british-grammar/this-that-these-those?q=this)

5.   'all' - We can use **all** as a pronoun in formal situations, for example: *All were happy with the outcome.* (less formal: *Everyone was happy with the outcome.*). Usually, all as a pronoun is premodified or postmodified, for instance: *More than 100 people came to the refugee centre. Almost all had lost family members or property or both.*

6.   'this' - A [relative pronoun](https://dictionary.cambridge.org/grammar/british-grammar/this-that-these-those?q=this)

7.   'other' - When used as a pronoun, it is the second of two things or people, or the thing or person that is left in a group or set of things, for example: *Hold the racquet in one hand and the ball in the other.*

8.   'I'

9.   'your'


### EDA: Words & Their Neighbours

#### Are our classifications of nouns, verbs and pronouns applicable in the data context?
In this section, word associations for each of the selected members of each class are conducted to find if each word is most frequently used as a noun, a pronoun or a verb.

In [0]:
tops = ['in', 'a', 'marketing', 'or', 'research' , 'experience', 'work', 'will',
        'team', 'skills', 'business', 'ability', 'data', 'management', 'i',
        'new', 'media', 'development', 'product', 'support', 'amp', 'market',
        'job', 'is', 'be', 'are', 'have', 'our', 'you', 'we', 'that', 'all',
        'this', 'other', 'your']

In [0]:
i=0
for i in range(len(tops)):
  str_a = ' '+tops[i]+' '
  str_b = '-'+tops[i]+'-'
  phrase_reporter(pattern_a=str_a,
                  pattern_b=str_b)

Top 5 expressions for:  "  in  "
               expression  frequency
935             work-in-a        109
842   degree-in-marketing         81
598       experience-in-a         79
1859        assist-in-the         74
1503    experience-in-the         53
 
Total frequency:  8203
Total number of expressions:  5167
Minimum frequency:  1
Maximum frequency:  109
 
Therefore, the average expression of  "  in  "  has a frequency of  1.5875749951616025 ;
some median expressions of   in   are   as shown below:
                 expression  frequency
16173          meals-in-our          1
16304     best-in-execution          1
16300        assist-in-data          1
16444     or-in-development          1
3195   schedules-in-nielsen          1
( "  in  "  had  4138  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  a  "
         expression  frequency
19728     in-a-fast        164
19800     is-a-plus   

#### What words are associated with the top nouns, verbs and pronouns?
In this section, word associations for each of the top noun expressions and the shortlisted nouns were studied jointly.

##### *Top Noun Expressions (Based on the top 5 values for each noun association independent of the other words)*

###### **Ability**
Top expression:  *'ability to'*

In [0]:
# Let's examine the noun and its descriptor
my_list_a = [' ability ','ability ',' ability']
my_list_b = ['-ability-','ability-','-ability']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# Let's examine the noun and its descriptor
my_list_a = ['ability to ','ability to multi ',' ability to',' ability to ']
my_list_b = ['ability-to-','ability-to-multi-','-ability-to','-ability-to-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# Let's examine the noun and its descriptor
my_list_a = ['ability to work ','ability to work with a ',
             'ability to work with all ','ability to work in ',
             'ability to work in a ','ability to work in an ',
             'ability to manage ','ability to manage multiple ',
             'ability to manage a ','ability to communicate ',
             'ability to prioritize ']
my_list_b = ['ability-to-work-','ability-to-work-with-a-',
             'ability-to-work-with-all-','ability-to-work-in-',
             'ability-to-work-in-a-','ability-to-work-in-an-',
             'ability-to-manage-','ability-to-manage-multiple-',
             'ability-to-manage-a-','ability-to-communicate-',
             'ability-to-prioritize-']

i=0
for i in range(11):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " ability to work  "
                         expression  frequency
312   ability-to-work-independently         67
3195             ability-to-work-in         57
1820           ability-to-work-with         26
1779    ability-to-work-effectively         24
5438           ability-to-work-well         23
 
Total frequency:  292
Total number of expressions:  51
Minimum frequency:  1
Maximum frequency:  67
 
Therefore, the average expression of  " ability to work  "  has a frequency of  5.7254901960784315 ;
some median expressions of  " ability to work  "  are   as shown below:
                        expression  frequency
14192  dictationability-to-work-as          1
12398     ability-to-work-multiple          1
14632         ability-to-work-from          1
11796      ability-to-work-without          1
12167           ability-to-work-10          1
( " ability to work  "  had  29  median expressions
-----END OF ANALYSIS-----
 
----------------------------------------

###### **Development**
Top expression:  *'the development of'*

In [0]:
# Let's examine the noun and its descriptor
my_list_a = [' development ','development ',' development']
my_list_b = ['-development-','development-','-development']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = ['the development of ',' the development of',' the development of ']
my_list_b = ['the-development-of-','-the-development-of','-the-development-of-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = ['the development of the ','the development of new ','the development of marketing ']
my_list_b = ['the-development-of-the-','the-development-of-new-','the-development-of-marketing-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = [' in the development of',' to the development of',' on the development of']
my_list_b = ['-in-the-development-of','-to-the-development-of','-on-the-development-of']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

###### **Media**
Top expression:  *'social media and'*

In [0]:
# Let's examine the noun and its descriptor
my_list_a = [' social media ','social media ',' social media']
my_list_b = ['-social-media-','social-media-','-social-media']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = ['and social media ',' and social media',' and social media ']
my_list_b = ['and-social-media-','-and-social-media','-and-social-media-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# What insights can we derive around media?
my_list_a = [' social media and ','social media and ',' social media and']
my_list_b = ['-social-media-and-','social-media-and-','-social-media-and']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# What insights can we derive around media?
my_list_a = ['social media and digital ','social media and content ']
my_list_b = ['social-media-and-digital-','social-media-and-content-']

i=0
for i in range(2):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# What insights can we derive around media?
my_list_a = [' of social media']
my_list_b = ['-of-social-media']

i=0
for i in range(1):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = [' the social media and content coordinator',
             '-position objectives the social media and content coordinator',
             ' and internal communications the social media and content coordinator']
my_list_b = ['-the-social-media-and-content-coordinator',
             '-position-objectives-the-social-media-and-content-coordinator',
             '-and-internal-communications-the-social-media-and-content-coordinator']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

###### **Experience**
Top expression:  *'of experience in'*









In [0]:
# Examine the noun and its descriptor
my_list_a = [' experience ','experience ',' experience',]
my_list_b = ['-experience-','experience-','-experience']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = [' of experience in ',' of experience in a ',' of experience in the ',]
my_list_b = ['-of-experience-in-','-of-experience-in-a-','-of-experience-in-the-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# What areas should the candidate be experienced in?
my_list_a = [' of experience in ',' of experience in a ',' of experience in the ',]
my_list_b = ['-of-experience-in-','-of-experience-in-a-','-of-experience-in-the-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])
# This report shows that years and 'in a' clearly stand out

In [0]:
# What areas should the candidate be experienced in?
my_list_a = ['experience in ','experience in a ','experience in the ']
my_list_b = ['experience-in-','experience-in-a-','experience-in-the-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])
# This report shows that marketing and fast clearly stand out

In [0]:
my_list_a = [' marketing ','experience in marketing ',
             'experience in marketing and ','experience in marketing or ',
             'experience in a marketing ','experience in a marketing or ',
             'experience in a marketing role ']
my_list_b = ['-marketing-','experience-in-marketing-',
             'experience-in-marketing-and-','experience-in-marketing-or-',
             'experience-in-a-marketing-','experience-in-a-marketing-or-',
             'experience-in-a-marketing-role-']

i=0
for i in range(7):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
# How many years of experience?
my_list_a = [' year ',' years ']
my_list_b = ['-year-','-years-']

i=0
for i in range(2):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])
# This report shows that years clearly stands out

In [0]:
# How many years of experience?
my_list_a = [' twelve year experience ',' 12 year experience ',
             ' twelve years experience ',' 12 years experience ',
             ' twelve year of experience ',' 12 year of experience ',
             ' twelve years of experience ',' 12 years of experience ']
my_list_b = ['-twelve-year-experience-','-12-year-experience-',
             '-twelve-years-experience-','-12-years-experience-',
             '-twelve-year-of-experience-','-12-year-of-experience-',
             '-twelve-years-of-experience-','-12-years-of-experience-']

i=0
for i in range(8):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

###### **Management**
Top expression:  *'project management and'*

In [10]:
# Let's examine the noun and its descriptor
my_list_a = ['management ',' management',' management ']
my_list_b = ['management-','-management','-management-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " management  "
                 expression  frequency
1070         management-and        196
2984      management-skills        104
1143          management-of         84
4290  management-experience         58
3021          management-or         27
 
Total frequency:  1349
Total number of expressions:  386
Minimum frequency:  1
Maximum frequency:  196
 
Therefore, the average expression of  " management  "  has a frequency of  3.494818652849741 ;
some median expressions of  " management  "  are   as shown below:
                     expression  frequency
13166  management-instructional          1
13168       management-benefits          1
13167      management-inclusion          1
13162           management-shrm          1
3260        management-blogging          1
( " management  "  had  234  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  management "
           

In [10]:
my_list_a = ['project management ','project management and ',
             ' management experience ','project management and organizational ']
my_list_b = ['project-management-','project-management-and-',
             '-management-experience-','project-management-and-organizational-']

i=0
for i in range(4):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " project management  "
                         expression  frequency
2970      project-management-skills         46
1318         project-management-and         34
4256  project-management-experience         19
3006          project-management-or          6
8469     project-management-support          6
 
Total frequency:  239
Total number of expressions:  93
Minimum frequency:  1
Maximum frequency:  46
 
Therefore, the average expression of  " project management  "  has a frequency of  2.5698924731182795 ;
some median expressions of  " project management  "  are   as shown below:
                              expression  frequency
14400      project-management-leadership          1
14405  project-management-prioritization          1
12286          project-management-social          1
14284      project-management-proficient          1
11858             project-management-sku          1
( " project management  "  had  67  median expressions
-----END OF ANALYSIS

In [11]:
my_list_a = [' and management']
my_list_b = ['-and-management']

i=0
for i in range(1):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  "  and management "
                       expression  frequency
14912  development-and-management          9
14917     planning-and-management          6
14918       design-and-management          3
14935     strategy-and-management          3
14942    marketing-and-management          3
 
Total frequency:  67
Total number of expressions:  39
Minimum frequency:  1
Maximum frequency:  9
 
Therefore, the average expression of  "  and management "  has a frequency of  1.7179487179487178 ;
some median expressions of  "  and management "  are   as shown below:
                            expression  frequency
14913           conduct-and-management          1
14932  analysis-and-managementdecision          1
14934        activities-and-management          1
14939         diagnosis-and-management          1
14940       measurement-and-management          1
( "  and management "  had  25  median expressions
-----END OF ANALYSIS-----
 
----------------------------------

###### **Team**
Top expression:  *'a team of'*

In [12]:
# Let's examine the noun and its descriptor
my_list_a = ['team ',' team',' team ']
my_list_b = ['team-','-team','-team-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " team  "
         expression  frequency
14969       team-to        200
14952  team-members        144
14951      team-and        144
14963       team-of        103
14956       team-in         96
 
Total frequency:  2111
Total number of expressions:  419
Minimum frequency:  1
Maximum frequency:  200
 
Therefore, the average expression of  " team  "  has a frequency of  5.0381861575179 ;
some median expressions of  " team  "  are   as shown below:
              expression  frequency
15150  team-additionally          1
15211       team-charged          1
15146           team-key          1
15148           team-but          1
15151        team-embody          1
( " team  "  had  266  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  team "
             expression  frequency
15382            a-team        271
15371    marketing-team        207
15394          the-team     

In [14]:
my_list_a = ['a team ','a team of ',
             ' our team',' marketing team to ',' the marketing team to ']
my_list_b = ['a-team-','a-team-of-',
             '-our-team','-marketing-team-to-','-the-marketing-team-to-']

i=0
for i in range(5):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " a team  "
               expression  frequency
16092           a-team-of        153
16125  a-team-environment        120
16214       a-team-player         60
16229          a-team-and         42
16606         a-team-that         33
 
Total frequency:  918
Total number of expressions:  121
Minimum frequency:  1
Maximum frequency:  153
 
Therefore, the average expression of  " a team  "  has a frequency of  7.586776859504132 ;
some median expressions of  " a team  "  are   as shown below:
              expression  frequency
16695  a-team-orientated          3
17345    a-team-composed          3
17039        a-team-self          3
16245     a-team-dynamic          3
16701      a-team-innate          3
( " a team  "  had  62  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " a team of  "
                  expression  frequency
17364        a-team-of-world         14
1738

In [15]:
my_list_a = ['a team of world ','a team of marketing ','a team of account ']
my_list_b = ['a-team-of-world-','a-team-of-marketing-','a-team-of-account-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " a team of world  "
                        expression  frequency
17483  a-team-of-world-researchers          7
 
Total frequency:  7
Total number of expressions:  1
Minimum frequency:  7
Maximum frequency:  7
 
Therefore, the average expression of  " a team of world  "  has a frequency of  7.0 ;
some median expressions of  " a team of world  "  are   as shown below:
                        expression  frequency
17483  a-team-of-world-researchers          7
( " a team of world  "  had  1  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " a team of marketing  "
                              expression  frequency
17485      a-team-of-marketing-personnel          1
17484            a-team-of-marketing-and          1
17486  a-team-of-marketing-professionals          1
 
Total frequency:  3
Total number of expressions:  3
Minimum frequency:  1
Maximum frequency:  1
 
There

###### **Skills**
Top expression:  *'knowledge skills and'*

In [0]:
# Let's examine the noun and its descriptor
my_list_a = [' skills ','skills ',' skills']
my_list_b = ['-skills-','skills-','-skills']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = ['knowledge skills and ',' knowledge skills and',' knowledge skills and ']
my_list_b = ['knowledge-skills-and-','-knowledge-skills-and','-knowledge-skills-and-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

In [0]:
my_list_a = ['communication skills ',' communication skills',' and communication skills']
my_list_b = ['communication-skills-','-communication-skills','-and-communication-skills']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

###### **Research**
Top expression:  *'market research and'*

###### **Business**
Top expression:  *'small business expand'*

###### **Marketing**
Top expression:  *'of marketing and'*

###### **Data**
Top expression:  *'of data and'*

##### *Top Verb Expressions (Based on the top 5 values for each verb association independent of the other words)*

###### **Is**
Top expression:  *'this is a'*

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['this is a ',' this is a',' this is a ']
my_list_b = ['this-is-a-','-this-is-a','-this-is-a-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " this is a  "
                expression  frequency
19176       this-is-a-full         15
19186       this-is-a-paid         10
19184       this-is-a-part          4
19179  this-is-a-wonderful          4
19199      this-is-a-great          3
 
Total frequency:  80
Total number of expressions:  42
Minimum frequency:  1
Maximum frequency:  15
 
Therefore, the average expression of  " this is a  "  has a frequency of  1.9047619047619047 ;
some median expressions of  " this is a  "  are   as shown below:
                expression  frequency
19215     this-is-a-chance          1
19214  this-is-a-permanent          1
19213      this-is-a-union          1
19204        this-is-a-key          1
19203  this-is-a-freelance          1
( " this is a  "  had  31  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  this is a "
           expression  frequency
19234      -this-is-a  

###### **Have**
Top expression:  *'you have a'*

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['have ',' have',' have ']
my_list_b = ['have-','-have','-have-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " have  "
           expression  frequency
963            have-a        255
74           have-the        136
546           have-an         57
3294  have-experience         37
1932      have-strong         35
 
Total frequency:  1081
Total number of expressions:  247
Minimum frequency:  1
Maximum frequency:  255
 
Therefore, the average expression of  " have  "  has a frequency of  4.376518218623482 ;
some median expressions of  " have  "  are   as shown below:
           expression  frequency
12336       behave-in          1
14099  have-satisfied          1
12467     have-lapsed          1
14925          have-2          1
11977     have-expert          1
( " have  "  had  146  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  have "
      expression  frequency
14969  must-have        178
14968   you-have        153
14970  will-have        146
14971    we-have        1

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['you have ','you have a ','you have an ','you have the ',
             'you have experience ','you have strong ','we have ','we have a ',
             'we have an ','we have the ','we have experience ','we have strong ']
my_list_b = ['you-have-','you-have-a-','you-have-an-','you-have-the-',
             'you-have-experience-','you-have-strong-','we-have-','we-have-a-',
             'we-have-an-','we-have-the-','we-have-experience-','we-have-strong-']

i=0
for i in range(12):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " you have  "
                expression  frequency
15143           you-have-a        102
15135         you-have-the         34
15180  you-have-experience         26
15338   you-have-questions         10
15195    you-have-inflated         10
 
Total frequency:  306
Total number of expressions:  49
Minimum frequency:  2
Maximum frequency:  102
 
Therefore, the average expression of  " you have  "  has a frequency of  6.244897959183674 ;
some median expressions of  " you have  "  are   as shown below:
               expression  frequency
15523          you-have-6          2
15396  you-have-marketing          2
15520       you-have-with          2
15322         you-have-at          2
15183  you-have-navigated          2
( " you have  "  had  34  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " you have a  "
                  expression  frequency
15655     you-have-a-pas



Top 5 expressions for:  " we have  "
            expression  frequency
15204        we-have-a         56
15139       we-have-an         30
15138  we-have-amazing          8
15140     we-have-some          8
15162      we-have-the          8
 
Total frequency:  220
Total number of expressions:  52
Minimum frequency:  2
Maximum frequency:  56
 
Therefore, the average expression of  " we have  "  has a frequency of  4.230769230769231 ;
some median expressions of  " we have  "  are   as shown below:
                expression  frequency
15621           we-have-in          2
15321          we-have-big          2
15631     we-have-assisted          2
15648         we-have-more          2
15308  we-have-researchers          2
( " we have  "  had  41  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " we have a  "
             expression  frequency
15721    we-have-a-well          3
15718    we-have-a

###### **Are**
Top expression:  *'we are looking'*

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['are ',' are',' are ']
my_list_b = ['are-','-are','-are-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " are  "
        expression  frequency
15776  are-looking        118
15765        are-a        108
15750      are-not         76
15784       are-an         71
15757      are-you         67
 
Total frequency:  3155
Total number of expressions:  1021
Minimum frequency:  1
Maximum frequency:  118
 
Therefore, the average expression of  " are  "  has a frequency of  3.090107737512243 ;
some median expressions of  " are  "  are   as shown below:
                 expression  frequency
16662  healthcare-providers          1
16307            are-backed          1
16309            are-iconic          1
16299       software-allows          1
16298               are-for          1
( " are  "  had  629  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  are "
      expression  frequency
16781     we-are        570
16775    you-are        260
16798    who-are         80
16768   tha

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['you are ','you are a ','you are an ','we are ','we are a ','we are an ']
my_list_b = ['you-are-','you-are-a-','you-are-an-','we-are-','we-are-a-','we-are-an-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " you are  "
               expression  frequency
17394           you-are-a         80
17459  you-are-interested         42
17462          you-are-an         22
17477      you-are-unable         20
17472    you-are-selected         20
 
Total frequency:  520
Total number of expressions:  88
Minimum frequency:  2
Maximum frequency:  80
 
Therefore, the average expression of  " you are  "  has a frequency of  5.909090909090909 ;
some median expressions of  " you are  "  are   as shown below:
              expression  frequency
18143       you-are-keen          2
18115      you-are-smart          2
18155  you-are-associate          2
17959  you-are-motivated          2
17979      you-are-proud          2
( " you are  "  had  47  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " you are a  "
               expression  frequency
18525   you-are-a-veteran          5
18541  y

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['you re ','you re a ','you re an ','we re ','we re a ','we re an ']
my_list_b = ['you-re-','you-re-a-','you-re-an-','we-re-','we-re-a-','we-re-an-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " you re  "
              expression  frequency
18568           you-re-a         10
18562         you-re-the          8
18584     you-re-looking          7
18563       you-re-ready          5
18565  you-re-interested          5
 
Total frequency:  102
Total number of expressions:  52
Minimum frequency:  1
Maximum frequency:  10
 
Therefore, the average expression of  " you re  "  has a frequency of  1.9615384615384615 ;
some median expressions of  " you re  "  are   as shown below:
               expression  frequency
18564  you-re-comfortable          1
18566    you-re-available          1
18567        you-re-eager          1
18569      you-re-hitting          1
18570     you-re-actually          1
( " you re  "  had  35  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " you re a  "
              expression  frequency
18617      you-re-a-good          2
18616  you-re-

In [0]:
my_list_a = [' skills are a must','what we are looking ',' are a ']
my_list_b = ['-skills-are-a-must','what-we-are-looking-','-are-a-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  "  skills are a must "
                            expression  frequency
25082  communication-skills-are-a-must          2
25085   proofreading-skills-are-a-must          2
25083     management-skills-are-a-must          1
25084        writing-skills-are-a-must          1
 
Total frequency:  6
Total number of expressions:  4
Minimum frequency:  1
Maximum frequency:  2
 
Therefore, the average expression of  "  skills are a must "  has a frequency of  1.5 ;
some median expressions of  "  skills are a must "  are   as shown below:
Empty DataFrame
Columns: [expression, frequency]
Index: []
( "  skills are a must "  had  0  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  " what we are looking  "
                    expression  frequency
25086  what-we-are-looking-for         14
 
Total frequency:  14
Total number of expressions:  1
Minimum frequency:  14
Maximum frequency:

###### **Be**
Top expression:  *'must be able'*

In [0]:
# Let's examine the verb and its descriptor
my_list_a = ['be ',' be',' be ']
my_list_b = ['be-','-be','-be-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " be  "
           expression  frequency
19377            be-a        207
19378         be-able        202
19385  be-responsible        145
19373           be-an        103
19388     be-required         81
 
Total frequency:  2594
Total number of expressions:  584
Minimum frequency:  1
Maximum frequency:  207
 
Therefore, the average expression of  " be  "  has a frequency of  4.441780821917808 ;
some median expressions of  " be  "  are   as shown below:
          expression  frequency
19872           be-i          1
19491      be-marked          1
19846  be-constantly          1
19497           be-o          1
19946    be-nurtured          1
( " be  "  had  337  median expressions
-----END OF ANALYSIS-----
 
---------------------------------------------------------------------
 
Top 5 expressions for:  "  be "
      expression  frequency
19955    will-be        698
19954      to-be        424
19956    must-be        404
19967     may-be        216
19970   the-b

In [0]:
my_list_a = ['be responsible for ','must be able to ','will be required to ']
my_list_b = ['be-responsible-for-','must-be-able-to-','will-be-required-to-']

i=0
for i in range(3):
  phrase_reporter(pattern_a=my_list_a[i],
                  pattern_b=my_list_b[i])

Top 5 expressions for:  " be responsible for  "
                          expression  frequency
22051         be-responsible-for-the         28
22040  be-responsible-for-supporting         16
22062  be-responsible-for-developing         16
22054   be-responsible-for-assisting         16
22089     be-responsible-for-driving         10
 
Total frequency:  274
Total number of expressions:  76
Minimum frequency:  2
Maximum frequency:  28
 
Therefore, the average expression of  " be responsible for  "  has a frequency of  3.6052631578947367 ;
some median expressions of  " be responsible for  "  are   as shown below:
                          expression  frequency
22044       be-responsible-for-their          2
22111   be-responsible-for-tradeshow          2
22043        be-responsible-for-fees          2
22052     be-responsible-for-setting          2
22050  be-responsible-for-conducting          2
( " be responsible for  "  had  53  median expressions
-----END OF ANALYSIS-----
 
----------