# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Geo-mapping Libraries
import geopandas
import geopy
import folium 

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### Download the JSON Files from AWS S3 Bucket

In [8]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of data scientist positions in TX
print("Last modified datetime for data scientist positions in TX: ", 
      s3.Object('dspreparedjobpostings', 'df_ds_tx_prepared_backup.json').last_modified)

s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of web developer positions in TX
print("Last modified datetime for web developer positions in TX: ", 
      s3.Object('wdpreparedjobpostings', 'df_wd_tx_prepared_backup.json').last_modified)

Last modified datetime for data scientist positions in TX:  2021-02-22 15:49:42+00:00
Last modified datetime for web developer positions in TX:  2021-02-22 20:34:22+00:00


## Data Preparation

### Load JSON Files into Dataframe and Set the Date as the Index

In [2]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  2188
Number of web developer job postings in TX:  4177


In [10]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2188 entries, 2021-02-22 to 2020-12-22
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            2188 non-null   object 
 1   company          2188 non-null   object 
 2   company_rating   2188 non-null   float64
 3   job_link         2188 non-null   object 
 4   job_description  2188 non-null   object 
 5   city             2188 non-null   object 
 6   state            2188 non-null   object 
 7   zipcode          2188 non-null   int64  
 8   clean            2188 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 170.9+ KB


In [11]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4177 entries, 2021-02-22 to 2021-01-04
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            4177 non-null   object 
 1   company          4177 non-null   object 
 2   company_rating   4177 non-null   float64
 3   job_link         4177 non-null   object 
 4   job_description  4177 non-null   object 
 5   city             4177 non-null   object 
 6   state            4177 non-null   object 
 7   zipcode          4177 non-null   int64  
 8   clean            4177 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 326.3+ KB


In [12]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-22,Machine Learning & Data Engineer,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=b3b1aca86cce5...,Corporate Banking Technology is hiring a multi...,Plano,TX,0,corporate banking technology hiring multiskill...
2021-02-22,Data Scientist Senior - Computer Vision/Deep L...,USAA,3.9,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Purpose of Job We are currently seeking a tale...,Leming,TX,78050,purpose job currently seeking talented data sc...
2021-02-22,"Director, Decision Science Analytics - Propert...",USAA,3.9,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Purpose of Job USAA is looking for an experien...,Rio Medina,TX,78066,purpose job usaa looking experienced decision ...
2021-02-22,Senior Catastrophe Modeling Analyst,USAA,3.9,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Purpose of Job We are currently seeking a Seni...,Helotes,TX,78023,purpose job currently seeking senior catastrop...
2021-02-22,Lead Decision Science Analyst – AML (Remote Wo...,USAA,3.9,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Purpose of Job We are currently seeking a tale...,San Antonio,TX,78206,purpose job currently seeking talented decisio...


In [13]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-22,Senior C#/.Net Product Developer,Gainwell Technologies,0.0,https://www.indeed.com/rc/clk?jk=4405a5112f08d...,Job Description:\nEssential Job Functions\nDes...,0,TX,0,job description essential job function design ...
2021-02-22,Salesforce Developer,Guidehouse,3.4,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Overview\nGuidehouse is a leading management c...,Austin,TX,78701,overview guidehouse leading management consult...
2021-02-22,Cyber Security Engineer,Paysafe Group,3.6,https://www.indeed.com/rc/clk?jk=b92b0c0fa5f83...,Paysafe Group (Paysafe) is a leading specialis...,Security,TX,0,paysafe group paysafe leading specialised paym...
2021-02-22,Java Software Engineer-Meta Data Management,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=c6a3494819249...,As an experienced member of our Software Engin...,Plano,TX,0,experienced member software engineering group ...
2021-02-22,CMS Web Developer,D.R. Horton,3.4,https://www.indeed.com/rc/clk?jk=3c87c2ce9a3e3...,"D.R. Horton, Inc., the largest homebuilder in ...",Arlington,TX,0,dr horton inc largest homebuilder u founded 19...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [14]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    63
Facebook                          49
USAA                              46
Dell Technologies                 46
Deloitte                          45
Name: company, dtype: int64

In [15]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         635
Dallas         361
Houston        275
Plano          175
San Antonio    169
Name: city, dtype: int64

In [16]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    187
2021-01-24    352
2021-01-31    292
2021-02-07    255
2021-02-14    254
2021-02-21    101
2021-02-28      7
Freq: W-SUN, Name: title, dtype: int64

In [17]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Digital Pharmacist    5.0
Rekruiters            5.0
Levelset              4.7
Onit                  4.7
Harnham               4.6
Name: company_rating, dtype: float64

In [18]:
df_ds.title.value_counts().head()

Data Scientist               230
Senior Data Scientist         83
Machine Learning Engineer     52
Senior Data Analyst           51
Sr. Data Scientist            38
Name: title, dtype: int64

#### Web Developer Position

In [19]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            170
Indeed                                 124
JPMorgan Chase Bank, N.A.              119
Infinity Consulting Solutions, Inc.     67
Cognizant Technology Solutions          62
Name: company, dtype: int64

In [20]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     1302
Dallas      577
Houston     472
Plano       339
Name: city, dtype: int64

In [21]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1054
2021-02-07     825
2021-02-14     761
2021-02-21     321
2021-02-28      15
Freq: W-SUN, Name: title, dtype: int64

In [22]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
The Evolvers Group    5.0
ALAANT                5.0
Box                   5.0
PEAKE                 5.0
Givelify              5.0
Name: company_rating, dtype: float64

In [23]:
df_wd.title.value_counts().head()

Software Engineer           92
Web Developer               77
Senior Software Engineer    73
Software Developer          71
Full Stack Developer        63
Name: title, dtype: int64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills
- General Skills (Combination of Tech+Soft)

In [3]:
# Create a tech library
ds_tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language']

In [19]:
# Print the number of skills in the library
print("Number of data science skills in tech skill library: ", len(ds_tech_library))

# Print the top 5 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 5, ds_tech_library, 'tech')
ds_top_tech

Number of data science skills in tech skill library:  71


Unnamed: 0,top5_tech_skills,frequency
0,machine learning,3326.0
1,python,1759.0
2,sql,1352.0
3,aws,1019.0
4,r,979.0


In [5]:
# Create a soft skill library
ds_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [6]:
# Print the number of skills in the library
print("Number of data science skills in soft skill library: ", len(ds_soft_library))

# Print the top 5 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 5, ds_soft_library, 'soft')
ds_top_soft

Number of data science skills in soft skill library:  20


Unnamed: 0,top5_soft_skills,frequency
0,communication,1510.0
1,leadership,940.0
2,collaboration,450.0
3,problem solving,335.0
4,written communication,253.0


In [7]:
# Create a general library
ds_general_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language'
                'critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [20]:
# Print the number of skills in the library
print("Number of data science skills in general skills library: ", len(ds_general_library))

# Print the top 5 needed general skills

ds_top_general = MVP_explore.top_skills(df_ds, 5, ds_general_library, 'general')
ds_top_general

Number of data science skills in general skills library:  90


Unnamed: 0,top5_general_skills,frequency
0,machine learning,3326.0
1,python,1759.0
2,communication,1510.0
3,sql,1352.0
4,aws,1019.0


#### Web Developer Position
- Tech skills
- Soft skills
- General Skills (Combination of Tech+Soft)

In [10]:
# Create a tech library
wd_tech_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net']  

In [12]:
# Print the number of skills in the library
print("Number of web dev skills in tech skill library: ", len(wd_tech_library))

# Print the top 5 needed tech skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_tech_library, 'tech')
wd_top_tech

Number of web dev skills in tech skill library:  67


Unnamed: 0,top5_tech_skills,frequency
0,javascript,3363.0
1,java,2265.0
2,sql,2049.0
3,react,1808.0
4,net,1619.0


In [13]:
# Create a soft library
wd_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [14]:
# Print the number of skills in the library
print("Number of web dev skills in soft skill library: ", len(wd_soft_library))

# Print the top 5 needed soft skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_soft_library, 'soft')
wd_top_tech

Number of web dev skills in soft skill library:  20


Unnamed: 0,top5_soft_skills,frequency
0,communication,2423.0
1,leadership,950.0
2,collaboration,608.0
3,problem solving,513.0
4,written communication,372.0


In [15]:
# Create a general library
wd_general_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net''critical thinking',
'communication','problem solving','teamwork','ethics','business acumen','interpersonal skills','curiosity',
'storytelling','adaptability','team player','collaboration','time management','leadership','domain knowledge',
'creativity','decision making','verbal communication','written communication','teamwork']

In [16]:
# Print the number of skills in the library
print("Number of web dev skills in general skill library: ", len(wd_general_library))

# Print the top 5 needed general skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_general_library, 'general')
wd_top_tech

Number of web dev skills in general skill library:  86


Unnamed: 0,top5_general_skills,frequency
0,javascript,3363.0
1,communication,2423.0
2,java,2265.0
3,sql,2049.0
4,react,1808.0
