##Functions

In [1]:
import requests
from bs4 import BeautifulSoup
import html5lib
import pandas as pd
import numpy as np
import datetime
from urllib.request import urlopen

from IPython.display import HTML
import base64

import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
nltk.download('stopwords')
nltk.download('punkt')

!pip install unidecode
import unidecode

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Collecting unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 4.7MB/s 
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.1.1


In [0]:
# My dataminer:
def data_miner(url_root='',
               qs=[],
               tag='',
               tag_attribute={'':''},
               raw=True):
  
  print('Mining started at: ',
        str(datetime.datetime.now()),
        ' UTC time.')
  
  urls = []
  for q in qs:
    url = url_root+q
    urls.append(url)

  data_results = []

  for url in urls:
    r = urlopen(url)
    soup = BeautifulSoup(r, "html5lib")
    items_of_interest = soup.find_all(tag, tag_attribute)
  
    l = []

    if raw==True:
      raw_l = []
      for item_of_interest in items_of_interest:
        raw_text = item_of_interest
        raw_text = str(raw_text)
        raw_l.append(raw_text)

      data_results.append(raw_l)

    else:
      l = []
      for item_of_interest in items_of_interest:
        text = item_of_interest.get_text() # to avoid loading the tags
        text = str(text) # to force into a string if not one already
        l.append(text)
      
      data_results.append(l)
  
  print('Your data has '+str(len(data_results))+' entries.')
  print('Mining ended at: ',
        str(datetime.datetime.now()),
        ' UTC time.')
  
  return data_results

In [0]:
# My data downloader
def create_download_link( df, title = "Download CSV file", filename = "data.csv"):  
    csv = df.to_csv(index =True)
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

##Data mining: Marketing Researcher on Indeed.com

### *Raw data*

In [4]:
pgs = []
pgs.extend(range(0,10000,10))
string_pgs = []
for pg in pgs:
  pg = str(pg)
  string_pgs.append(pg)
len(string_pgs)

root = 'https://www.indeed.com/jobs?q=Marketing+research&start='

raw_results = data_miner(url_root=root, qs=string_pgs, tag='a',
                       tag_attribute={"class":"jobtitle turnstileLink "},
                       raw=True)

Mining started at:  2020-02-18 05:14:28.509598  UTC time.
Your data has 1000 entries.
Mining ended at:  2020-02-18 05:34:52.430094  UTC time.


In [0]:
results = []

i=0
j=0
for i in range(len(raw_results)):
  for j in range(len(raw_results[i])):
    new = raw_results[i][j].split(' href="', 1)
    new_new = new[1].split('" id=', 1)

    if new_new[0].startswith('/rc/clk') == True:
      link = new_new[0].replace('/rc/clk', 'https://www.indeed.com/viewjob')
      results.append(link)
    j+=1
  i+=1

In [7]:
links_df = pd.DataFrame(results)
links_df.columns = ['links']
print(len(links_df.links.unique()), ' unique links found from a dataframe of shape ', links_df.shape)
links_df.groupby(['links'])['links'].count()

685  unique links found from a dataframe of shape  (7682, 1)


links
https://www.indeed.com/viewjob?jk=00404e4ce127babd&amp;fccid=1800d2da1d520764&amp;vjs=3    2
https://www.indeed.com/viewjob?jk=00d71161883be75d&amp;fccid=9e215d88a6b33622&amp;vjs=3    2
https://www.indeed.com/viewjob?jk=00ee0bb7934a6677&amp;fccid=c1099851e9794854&amp;vjs=3    1
https://www.indeed.com/viewjob?jk=014bfb18b38abf40&amp;fccid=ce0a8eb75a5ecab9&amp;vjs=3    2
https://www.indeed.com/viewjob?jk=01574778d20ea057&amp;fccid=fc63f49b5b241cb1&amp;vjs=3    1
                                                                                          ..
https://www.indeed.com/viewjob?jk=fd3269b29ce5c0ba&amp;fccid=1639254ea84748b5&amp;vjs=3    1
https://www.indeed.com/viewjob?jk=fe28c269913d7507&amp;fccid=a4b7e90c6a891db3&amp;vjs=3    1
https://www.indeed.com/viewjob?jk=fe538a26182ac121&amp;fccid=74d116e6378c30fc&amp;vjs=3    2
https://www.indeed.com/viewjob?jk=ffc0109f8b16bdb2&amp;fccid=083eb9b1733c54a5&amp;vjs=3    1
https://www.indeed.com/viewjob?jk=ffebb8a7f2c41302&amp;fccid=dd3

In [8]:
links_list = links_df.links.unique()
raw_content = data_miner(qs=links_list, tag='div',
                       tag_attribute={"class":"jobsearch-jobDescriptionText"},
                       raw=False)
# \n in the data represents a new line/paragraph 
#(usually caused when the 'Enter' key is pressed during typing)

Mining started at:  2020-02-18 05:39:20.379951  UTC time.
Your data has 685 entries.
Mining ended at:  2020-02-18 05:42:33.093804  UTC time.


In [9]:
new_data_df = pd.DataFrame(raw_content)
new_data_df = new_data_df.astype('str')
new_data_df.columns = ['text']

stop_time = str(datetime.datetime.now())
print('Mining completed at: ', stop_time, 'UTC time')
new_data_df.shape

Mining completed at:  2020-02-18 05:56:01.614057 UTC time


(685, 1)

In [10]:
name1 = 'raw_content_mined_' + stop_time + '.csv'
create_download_link(new_data_df, filename=name1)

Output hidden; open in https://colab.research.google.com to view.

### *Clean data*

In [0]:
clean_jds = new_data_df

# Remove any accents present
i=0
for i in range(len(clean_jds['text'])):
  clean_jds['text'].iloc[i] = unidecode.unidecode(clean_jds['text'].iloc[i])

# Strip symbols
clean_jds['text'] = clean_jds['text'].str.replace('[^\w\s]', ' ')

# Make everything lowercase
clean_jds['text'] = clean_jds['text'].str.lower()

# Remove the paragraph/line marker
clean_jds['text'] = clean_jds['text'].str.replace('\n', ' ')

In [12]:
# Save the clean jds:
jd_df = pd.DataFrame(clean_jds['text'])

#Download the clean data to visualize offline:
name2 = 'clean_data_mined_' + stop_time + '.csv'
create_download_link(jd_df, filename=name2)

Output hidden; open in https://colab.research.google.com to view.

##Data mining: Marketing Data Scientist on Indeed.com

In [0]:
pgs = []
pgs.extend(range(0,50,10))
string_pgs = []
for pg in pgs:
  pg = str(pg)
  string_pgs.append(pg)
len(string_pgs)

root = 'https://www.indeed.com/jobs?q=Marketing+data+scientist&start='

raw_results = data_miner(url_root=root, qs=string_pgs, tag='a',
                       tag_attribute={"class":"jobtitle turnstileLink "},
                       raw=True)