In [3]:
import re
import string

!pip install -U nltk

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import sent_tokenize # Sentence Tokenizer
from nltk.tokenize import word_tokenize # Word Tokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Requirement already up-to-date: nltk in /anaconda3/lib/python3.6/site-packages (3.4)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/danielleromanoff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielleromanoff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 1) (optional) Scrape 100 Job Listings that contain the title "Data Scientist" from indeed.com

At a minimum, your final DataFrame of job listings should contain:
- Job Title
- Job Description

In [35]:
# Additional Imports for scraping Indeed

import requests
import time
from bs4 import BeautifulSoup

In [62]:
# Fetch the Indeed advanced-search results page for "data scientist" jobs around New Jersey.
# NOTE(review): `limit=50` presumably caps the results per page, so this single request
# would return at most 50 listings - confirm if 100 listings are required.

url = 'https://www.indeed.com/jobs?as_and=data+scientist&as_phr=&as_any=&as_not=&as_ttl=&as_cmp=&jt=all&st=&as_src=&salary=&radius=25&l=New+Jersey&fromage=any&limit=50&sort=&psf=advsrch#'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. a 403/429 when the scraper is blocked)
# right here instead of letting an error page be parsed as if it were search results.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Raw HTML of the results page as a str; the response object keeps its own name so that
# `page` always refers to one kind of value.
page = response.text



In [66]:
# Parse the downloaded HTML and collect the text of every element whose CSS class is "summary".
soup = BeautifulSoup(page, 'html.parser')

# find_all is the current Beautiful Soup 4 method name; findAll is only a legacy alias.
listings = [tag.text for tag in soup.find_all(class_='summary')]

# Stop here with a clear message if nothing matched, rather than letting an empty list
# flow into the tokenizing / vectorizing cells and fail there with an unrelated error.
assert listings, "No elements with class 'summary' were found - the page layout may have changed or the request may have been blocked."

listings

['\n                            Implement field and office data collection efforts, data validation, and data evaluation. Process data, and oversee the development of map figures, data tables,...',
 "\n                            Previous internship or relevant work experiences in scripting, software development, or data analytics. In Global Data, we're responsible for delivering this...",
 '\nData Scientists to work with partners to analyze, clean and identify gaps in data. Connect data from disparate sources to identify insights and patterns....',
 '\n                            Communication and presentation to external clients with relevance to the market and consumer insights. Consistently meets agreed upon project objectives....',
 '\n                            Leveraging your educational background in Science, Mathematics, Statistics, Computer Science, Data Science, or a related discipline, along with your relevant...',
 '\n                            Candidate MUST live in NJ,

In [68]:
def tokenize_jobs(jobs):
    """Delete ASCII punctuation from each job text, then split it into word tokens.

    Parameters
    ----------
    jobs : iterable of str
        The job texts to process.

    Returns
    -------
    list of list of str
        One token list per input text, in input order, as returned by ``word_tokenize``.

    Notes
    -----
    Punctuation is deleted (not replaced by a space) before tokenizing, so a
    contraction such as "we're" comes out as the single token "were".
    """
    # Translation table mapping every character of string.punctuation to None (deletion).
    strip_punctuation = str.maketrans(dict.fromkeys(string.punctuation))

    tokenized = []
    for text in jobs:
        cleaned = text.translate(strip_punctuation)
        tokenized.append(word_tokenize(cleaned))
    return tokenized

# Tokenize every scraped summary; the bare name on the last line displays the result.
tokens = tokenize_jobs(jobs=listings)
tokens

[['Implement',
  'field',
  'and',
  'office',
  'data',
  'collection',
  'efforts',
  'data',
  'validation',
  'and',
  'data',
  'evaluation',
  'Process',
  'data',
  'and',
  'oversee',
  'the',
  'development',
  'of',
  'map',
  'figures',
  'data',
  'tables'],
 ['Previous',
  'internship',
  'or',
  'relevant',
  'work',
  'experiences',
  'in',
  'scripting',
  'software',
  'development',
  'or',
  'data',
  'analytics',
  'In',
  'Global',
  'Data',
  'were',
  'responsible',
  'for',
  'delivering',
  'this'],
 ['Data',
  'Scientists',
  'to',
  'work',
  'with',
  'partners',
  'to',
  'analyze',
  'clean',
  'and',
  'identify',
  'gaps',
  'in',
  'data',
  'Connect',
  'data',
  'from',
  'disparate',
  'sources',
  'to',
  'identify',
  'insights',
  'and',
  'patterns'],
 ['Communication',
  'and',
  'presentation',
  'to',
  'external',
  'clients',
  'with',
  'relevance',
  'to',
  'the',
  'market',
  'and',
  'consumer',
  'insights',
  'Consistently',
  'meets

In [70]:
# Rebuild one cleaned, space-separated string per listing from its token list.
cells = list(map(' '.join, tokens))
cells

['Implement field and office data collection efforts data validation and data evaluation Process data and oversee the development of map figures data tables',
 'Previous internship or relevant work experiences in scripting software development or data analytics In Global Data were responsible for delivering this',
 'Data Scientists to work with partners to analyze clean and identify gaps in data Connect data from disparate sources to identify insights and patterns',
 'Communication and presentation to external clients with relevance to the market and consumer insights Consistently meets agreed upon project objectives',
 'Leveraging your educational background in Science Mathematics Statistics Computer Science Data Science or a related discipline along with your relevant',
 'Candidate MUST live in NJ Philadelphia PA area or NYC area Position will include assisting with the research data collection and reporting for Phase I',
 'Proficient with the use of advanced statistical analysis sof

In [73]:
# Put the cleaned listing texts into a one-column DataFrame and preview the first rows.
description = pd.DataFrame(cells, columns=['Summary'])
description.head()

Unnamed: 0,Summary
0,Implement field and office data collection eff...
1,Previous internship or relevant work experienc...
2,Data Scientists to work with partners to analy...
3,Communication and presentation to external cli...
4,Leveraging your educational background in Scie...


# 2) Use NLTK to tokenize / clean the listings

In [None]:
##### Your Code Here #####

# 3) Use Scikit-Learn's CountVectorizer to get word counts for each listing.

In [77]:
# Build bag-of-words counts for every listing.
# CountVectorizer is already imported in the notebook's first cell, so it is not re-imported here.

# Lowercase the text and drop scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

# Learn the vocabulary and count every term in every listing in one pass.
# The result is a sparse matrix with one row per listing and one column per vocabulary term.
count_matrix = vectorizer.fit_transform(cells)

# vocabulary_ maps each term to its column index in the count matrix.
# It does NOT hold word counts - those live in `count_matrix` / `word_counts`.
counted = vectorizer.vocabulary_

# Labelled, dense view of the counts. Terms are sorted by their vocabulary index so that
# the column labels line up with the matrix columns.
feature_names = sorted(counted, key=counted.get)
word_counts = pd.DataFrame(count_matrix.toarray(), columns=feature_names)

counted

{'implement': 171,
 'field': 150,
 'office': 248,
 'data': 89,
 'collection': 67,
 'efforts': 128,
 'validation': 397,
 'evaluation': 137,
 'process': 285,
 'oversee': 256,
 'development': 106,
 'map': 217,
 'figures': 151,
 'tables': 376,
 'previous': 283,
 'internship': 191,
 'relevant': 314,
 'work': 402,
 'experiences': 142,
 'scripting': 338,
 'software': 352,
 'analytics': 24,
 'global': 157,
 'responsible': 321,
 'delivering': 95,
 'scientists': 337,
 'partners': 260,
 'analyze': 25,
 'clean': 60,
 'identify': 168,
 'gaps': 154,
 'connect': 75,
 'disparate': 116,
 'sources': 355,
 'insights': 181,
 'patterns': 262,
 'communication': 70,
 'presentation': 281,
 'external': 147,
 'clients': 63,
 'relevance': 313,
 'market': 218,
 'consumer': 79,
 'consistently': 77,
 'meets': 226,
 'agreed': 17,
 'project': 294,
 'objectives': 247,
 'leveraging': 206,
 'educational': 124,
 'background': 43,
 'science': 334,
 'mathematics': 222,
 'statistics': 366,
 'computer': 73,
 'related': 311,


# 4) Visualize the most common word counts

# 5) Use Scikit-Learn's TfidfVectorizer to get a TF-IDF feature matrix

In [None]:
##### Your Code Here #####

## Stretch Goals

 - Scrape Job Listings for the job title "Data Analyst". How do these differ from Data Scientist Job Listings?
 - Try to identify requirements for experience with specific technologies that are asked for in the job listings. How are those distributed among the job listings?
 - Use a clustering algorithm to cluster documents by their most important terms. Do the clusters reveal any common themes?
  - **Hint:** K-means might not be the best algorithm for this. Do a little bit of research to see what might be good for this. Also, remember that algorithms that depend on Euclidean distance break down with high-dimensional data.