# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Geo-mapping Libraries
import geopandas
import geopy
import folium 

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### Download the JSON Files from AWS S3 Bucket

In [2]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")
s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

## Data Preparation

### Load JSON Files into Dataframe and Set the Date as the Index

In [3]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  1899
Number of web developer job postings in TX:  3544


In [4]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1899 entries, 2021-02-14 to 2020-12-22
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1899 non-null   object 
 1   company          1899 non-null   object 
 2   company_rating   1899 non-null   float64
 3   job_link         1899 non-null   object 
 4   job_description  1899 non-null   object 
 5   city             1899 non-null   object 
 6   state            1899 non-null   object 
 7   zipcode          1899 non-null   int64  
 8   clean            1899 non-null   object 
 9   tokenized        1899 non-null   object 
 10  stemmed          1899 non-null   object 
 11  lemmatized       1899 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 192.9+ KB


In [5]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3544 entries, 2021-02-14 to 2021-01-04
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3544 non-null   object 
 1   company          3544 non-null   object 
 2   company_rating   3544 non-null   float64
 3   job_link         3544 non-null   object 
 4   job_description  3544 non-null   object 
 5   city             3544 non-null   object 
 6   state            3544 non-null   object 
 7   zipcode          3544 non-null   int64  
 8   clean            3544 non-null   object 
 9   tokenized        3544 non-null   object 
 10  stemmed          3544 non-null   object 
 11  lemmatized       3544 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 359.9+ KB


In [6]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-14,Statistician - ELSI Research\nnew,Baylor College of Medicine,3.9,https://www.indeed.com/rc/clk?jk=8a0be9e2cb609...,Job Purpose\nThe Center for Medical Ethics and...,Houston,TX,77030,job purpose center medical ethic health policy...,job purpose\nthe center for medical ethics and...,job purpos the center for medic ethic and heal...,job purpose the center for medical ethic and h...
2021-02-14,Summer 2021 Computer Science & Data Science In...,Ericsson,4.1,https://www.indeed.com/rc/clk?jk=12c99183a6e6d...,Location: Multiple locations across United Sta...,Plano,TX,0,location multiple location across united state...,location multiple locations across united stat...,locat multipl locat across unit state as the t...,location multiple location across united state...
2021-02-14,"Senior Software Engineer, Machine Learning Pla...",Expedia Group,3.9,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,If you're the right person for the job you wil...,Austin,TX,78758,youre right person job joining growing machine...,if youre the right person for the job you will...,if your the right person for the job you will ...,if youre the right person for the job you will...
2021-02-14,Data Scientist Senior Associate - CIB Wholesal...,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=744a524cf178a...,J.P. Morgan's Corporate & Investment Bank (CIB...,Plano,TX,0,jp morgan corporate investment bank cib global...,jp morgans corporate investment bank cib is a ...,jp morgan corpor invest bank cib is a global l...,jp morgan corporate investment bank cib is a g...
2021-02-13,Senior Business Data Analyst\nnew,Intuit,4.2,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Overview\nWe are looking for a hands-on Senior...,Plano,TX,75023,overview looking handson senior analyst creati...,overview\nwe are looking for a handson senior ...,overview we are look for a handson senior anal...,overview we are looking for a handson senior a...


In [7]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-14,"Associate Developer/Developer, IT Applications...",American Airlines,4.0,https://www.indeed.com/rc/clk?jk=fca7c5f1f05bc...,Location: DFW Headquarters Building 8 (DFW-SV0...,Fort Worth,TX,0,location dfw headquarters building 8 dfwsv08 a...,location dfw headquarters building 8 dfwsv08\n...,locat dfw headquart build 8 dfwsv08 addit loca...,location dfw headquarters building 8 dfwsv08 a...
2021-02-14,Experienced Ruby Software Developer\nnew,Reynolds and Reynolds,3.2,https://www.indeed.com/rc/clk?jk=8940208725aae...,As an Experienced Ruby Software Developer you ...,College Station,TX,77840,experienced ruby software developer build some...,as an experienced ruby software developer you ...,as an experienc rubi softwar develop you will ...,a an experienced ruby software developer you w...
2021-02-14,Front End Software Engineer\nnew,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=396a20704fcef...,"We are looking for strong, well rounded develo...",Plano,TX,0,looking strong well rounded developer build st...,we are looking for strong well rounded develop...,we are look for strong well round develop to b...,we are looking for strong well rounded develop...
2021-02-14,Java Developer (with React)\nnew,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=dd309e6635efc...,"As an Application Developer, the candidate wil...",Plano,TX,0,application developer candidate required provi...,as an application developer the candidate will...,as an applic develop the candid will be requir...,a an application developer the candidate will ...
2021-02-14,Full Stack Java Software Engineer\nnew,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=08c28bec20422...,"Commercial Banking serves global clients, incl...",Plano,TX,0,commercial banking serf global client includin...,commercial banking serves global clients inclu...,commerci bank serv global client includ corpor...,commercial banking serf global client includin...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [8]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    59
Dell Technologies                 45
Deloitte                          38
Facebook                          38
USAA                              38
Name: company, dtype: int64

In [9]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         539
Dallas         312
Houston        243
San Antonio    150
Plano          141
Name: city, dtype: int64

In [10]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    187
2021-01-24    317
2021-01-31    280
2021-02-07    233
2021-02-14    142
Freq: W-SUN, Name: title, dtype: int64

In [11]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Digital Pharmacist    5.0
Rekruiters            5.0
Levelset              4.7
DRW Trading Group     4.6
Harnham               4.6
Name: company_rating, dtype: float64

#### Web Developer Position

In [12]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            145
Indeed                                 117
JPMorgan Chase Bank, N.A.              103
Infinity Consulting Solutions, Inc.     53
Cognizant Technology Solutions          49
Name: company, dtype: int64

In [13]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     1107
Dallas      472
Houston     400
Plano       288
Name: city, dtype: int64

In [14]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1054
2021-02-07     796
2021-02-14     493
Freq: W-SUN, Name: title, dtype: int64

In [15]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Box                    5.0
ALAANT                 5.0
Rekruiters             5.0
Royal & Ross           5.0
RightNow Ministries    5.0
Name: company_rating, dtype: float64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills
- General Skills (Combination of Tech+Soft)

In [16]:
# Create a tech library
ds_tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language']

In [17]:
# Print the number of skills in the library
print("Number of data science skills in tech skill library: ", len(ds_tech_library))

# Print the top 5 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 5, ds_tech_library)
ds_top_tech

Number of data science skills in tech skill library:  71


Unnamed: 0,frequency
machine learning,2958.0
python,1561.0
sql,1187.0
r,880.0
aws,841.0


In [18]:
# Create a soft skill library
ds_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [19]:
# Print the number of skills in the library
print("Number of data science skills in soft skill library: ", len(ds_soft_library))

# Print the top 5 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 5, ds_soft_library)
ds_top_soft

Number of data science skills in soft skill library:  20


Unnamed: 0,frequency
communication,1338.0
leadership,822.0
collaboration,401.0
problem solving,298.0
written communication,234.0


In [20]:
# Create a general library
ds_general_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language'
                'critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [21]:
# Print the number of skills in the library
print("Number of data science skills in general skills library: ", len(ds_general_library))

# Print the top 5 needed general skills

ds_top_general = MVP_explore.top_skills(df_ds, 5, ds_general_library)
ds_top_general

Number of data science skills in general skills library:  90


Unnamed: 0,frequency
machine learning,2958.0
python,1561.0
communication,1338.0
sql,1187.0
r,880.0


#### Web Developer Position
- Tech skills
- Soft skills
- General Skills (Combination of Tech+Soft)

In [22]:
# Create a tech library
wd_tech_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net']  

In [23]:
# Print the number of skills in the library
print("Number of web dev skills in tech skill library: ", len(wd_tech_library))

# Print the top 5 needed tech skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_tech_library)
wd_top_tech

Number of web dev skills in tech skill library:  67


Unnamed: 0,frequency
javascript,2817.0
java,1914.0
sql,1761.0
react,1523.0
net,1372.0


In [24]:
# Create a soft library
wd_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [25]:
# Print the number of skills in the library
print("Number of web dev skills in soft skill library: ", len(wd_soft_library))

# Print the top 5 needed soft skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_soft_library)
wd_top_tech

Number of web dev skills in soft skill library:  20


Unnamed: 0,frequency
communication,2033.0
leadership,759.0
collaboration,495.0
problem solving,429.0
written communication,313.0


In [26]:
# Create a general library
wd_general_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net''critical thinking',
'communication','problem solving','teamwork','ethics','business acumen','interpersonal skills','curiosity',
'storytelling','adaptability','team player','collaboration','time management','leadership','domain knowledge',
'creativity','decision making','verbal communication','written communication','teamwork']

In [27]:
# Print the number of skills in the library
print("Number of web dev skills in general skill library: ", len(wd_general_library))

# Print the top 5 needed general skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_general_library)
wd_top_tech

Number of web dev skills in general skill library:  86


Unnamed: 0,frequency
javascript,2817.0
communication,2033.0
java,1914.0
sql,1761.0
react,1523.0


### Uploads

In [28]:
def upload_to_S3_bucket(file_name, bucket='dspreparedjobpostings', object_name=None):
    """
    Upload a file to an S3 bucket
    
    ***Prepared data files must be in JSON format***

    Parameters
    ----------
    file_name: str
        Name of the file to upload.
    
    bucket: str, default="dspreparedjobpostings"
        S3 Bucket the file will be uploaded to.
    
    object_name: str, default=None
        The file name that will appear in AWS S3 bucket.
        If an object_name is not specified, the file will
        have the same name as the file_name
    
    Returns
    -------
    True or False: bool
        True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [None]:
upload_to_S3_bucket("df_ds_tx_prepared.json")