# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Geo-mapping Libraries
import geopandas
import geopy
import folium 

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### Download the JSON Files from AWS S3 Bucket

In [2]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of data scientist positions in TX
print("Last modified datetime for data scientist positions in TX: ", 
      s3.Object('dspreparedjobpostings', 'df_ds_tx_prepared_backup.json').last_modified)

s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of web developer positions in TX
print("Last modified datetime for web developer positions in TX: ", 
      s3.Object('wdpreparedjobpostings', 'df_wd_tx_prepared_backup.json').last_modified)

Last modified datetime for data scientist positions in TX:  2021-02-25 20:31:40+00:00
Last modified datetime for web developer positions in TX:  2021-02-25 20:34:53+00:00


## Data Preparation

### Load JSON Files into Dataframe and Set the Date as the Index

In [3]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  2297
Number of web developer job postings in TX:  4470


In [4]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2297 entries, 2021-02-25 to 2020-12-22
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            2297 non-null   object 
 1   company          2297 non-null   object 
 2   company_rating   2297 non-null   float64
 3   job_link         2297 non-null   object 
 4   job_description  2297 non-null   object 
 5   city             2297 non-null   object 
 6   state            2297 non-null   object 
 7   zipcode          2297 non-null   int64  
 8   clean            2297 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 179.5+ KB


In [5]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4470 entries, 2021-02-25 to 2021-01-04
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            4470 non-null   object 
 1   company          4470 non-null   object 
 2   company_rating   4470 non-null   float64
 3   job_link         4470 non-null   object 
 4   job_description  4470 non-null   object 
 5   city             4470 non-null   object 
 6   state            4470 non-null   object 
 7   zipcode          4470 non-null   int64  
 8   clean            4470 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 349.2+ KB


In [6]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-25,Applied Scientist,Amazon.com Services LLC,3.6,https://www.indeed.com/rc/clk?jk=a9dbd4e1ed934...,"\nM.S. in Computer Science, Machine Learning, ...",Austin,TX,0,m computer science machine learning operationa...
2021-02-25,Applied Data Science Manager,dunnhumby,3.8,https://www.indeed.com/rc/clk?jk=2323bfe22c6e5...,Austin\n\nMost companies try to meet expectati...,Austin,TX,0,austin company try meet expectation dunnhumby ...
2021-02-25,Distinguished Data Scientist,Verizon,3.9,https://www.indeed.com/rc/clk?jk=c1a0bcd827fc1...,When you join Verizon\nVerizon is a leading pr...,Irving,TX,75038,join verizon verizon leading provider technolo...
2021-02-25,Data Engineer,Verizon,3.9,https://www.indeed.com/rc/clk?jk=18f2be848788e...,When you join Verizon\nYou’ll have the power t...,Irving,TX,75038,join verizon youll power go beyond work thats ...
2021-02-25,Developer - Mathematical Optimization,Six Flags Entertainment,3.7,https://www.indeed.com/rc/clk?jk=0d70e0a00cd3a...,Job Duties:\nGather business requirements and ...,Arlington,TX,0,job duty gather business requirement translate...


In [7]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-25,WEB DEVELOPER INTERN,Blanks Financial Solutions,0.0,https://www.indeed.com/rc/clk?jk=9a84b22936691...,The position\n\n\nWe are looking for a Full St...,Dallas,TX,75238,position looking full stack developer intern e...
2021-02-25,Software Engineer (Full Stack),Lightspeed Systems,3.9,https://www.indeed.com/company/Lightspeed-Syst...,Are you a highly experienced Full-stack JavaSc...,Austin,TX,0,highly experienced fullstack javascript softwa...
2021-02-25,Product Security Engineer II,Box,5.0,https://www.indeed.com/rc/clk?jk=b6f14ed796fe6...,WHAT IS BOX?\nBox is the market leader for Clo...,0,TX,0,box box market leader cloud content management...
2021-02-25,Full Stack Software Engineer,Onit,4.7,https://www.indeed.com/rc/clk?jk=0e035ff4a68a0...,"As a Full Stack Developer at Onit, you will he...",Houston,TX,77056,full stack developer onit help design develop ...
2021-02-25,Software Engineer,Abrigo,3.0,https://www.indeed.com/rc/clk?jk=5d90dda1355d0...,We provide technology that community financial...,Austin,TX,78727,provide technology community financial institu...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [8]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    63
USAA                              55
Facebook                          51
Deloitte                          48
Dell Technologies                 46
Name: company, dtype: int64

In [9]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         674
Dallas         377
Houston        284
Plano          183
San Antonio    178
Name: city, dtype: int64

In [10]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    187
2021-01-24    352
2021-01-31    294
2021-02-07    262
2021-02-14    274
2021-02-21    123
2021-02-28     65
Freq: W-SUN, Name: title, dtype: int64

In [11]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Rekruiters            5.0
Digital Pharmacist    5.0
Onit                  4.7
Levelset              4.7
Harnham               4.6
Name: company_rating, dtype: float64

In [12]:
df_ds.title.value_counts().head()

Data Scientist               236
Senior Data Scientist         84
Machine Learning Engineer     53
Senior Data Analyst           52
Sr. Data Scientist            39
Name: title, dtype: int64

#### Web Developer Position

In [13]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            176
JPMorgan Chase Bank, N.A.              129
Indeed                                 125
Infinity Consulting Solutions, Inc.     72
Cognizant Technology Solutions          70
Name: company, dtype: int64

In [14]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     1381
Dallas      623
Houston     502
Plano       356
Name: city, dtype: int64

In [15]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1054
2021-02-07     825
2021-02-14     785
2021-02-21     377
2021-02-28     228
Freq: W-SUN, Name: title, dtype: int64

In [16]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Box                    5.0
Givelify               5.0
RightNow Ministries    5.0
The Evolvers Group     5.0
MarketScale            5.0
Name: company_rating, dtype: float64

In [17]:
df_wd.title.value_counts().head()

Software Engineer           98
Web Developer               90
Senior Software Engineer    77
Software Developer          76
Full Stack Developer        63
Name: title, dtype: int64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills
- General Skills (Combination of Tech+Soft)

In [18]:
# Create a tech library
ds_tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language']

In [19]:
# Print the number of skills in the library
print("Number of data science skills in tech skill library: ", len(ds_tech_library))

# Print the top 5 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 5, ds_tech_library, 'tech')
ds_top_tech

Number of data science skills in tech skill library:  71
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_tech_skills,frequency
0,machine learning,3501.0
1,python,1844.0
2,sql,1412.0
3,aws,1074.0
4,r,1019.0


In [20]:
# Create a soft skill library
ds_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [21]:
# Print the number of skills in the library
print("Number of data science skills in soft skill library: ", len(ds_soft_library))

# Print the top 5 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 5, ds_soft_library, 'soft')
ds_top_soft

Number of data science skills in soft skill library:  20
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_soft_skills,frequency
0,communication,1579.0
1,leadership,975.0
2,collaboration,475.0
3,problem solving,351.0
4,written communication,262.0


In [22]:
# Create a general library
ds_general_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language'
                'critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [23]:
# Print the number of skills in the library
print("Number of data science skills in general skills library: ", len(ds_general_library))

# Print the top 5 needed general skills

ds_top_general = MVP_explore.top_skills(df_ds, 5, ds_general_library, 'general')
ds_top_general

Number of data science skills in general skills library:  90
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_general_skills,frequency
0,machine learning,3501.0
1,python,1844.0
2,communication,1579.0
3,sql,1412.0
4,aws,1074.0


#### Web Developer Position
- Tech skills
- Soft skills
- General Skills (Combination of Tech+Soft)

In [24]:
# Create a tech library
wd_tech_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net']  

In [25]:
# Print the number of skills in the library
print("Number of web dev skills in tech skill library: ", len(wd_tech_library))

# Print the top 5 needed tech skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_tech_library, 'tech')
wd_top_tech

Number of web dev skills in tech skill library:  67
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_tech_skills,frequency
0,javascript,3577.0
1,java,2394.0
2,sql,2148.0
3,react,1928.0
4,c,1729.0


In [26]:
# Create a soft library
wd_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [27]:
# Print the number of skills in the library
print("Number of web dev skills in soft skill library: ", len(wd_soft_library))

# Print the top 5 needed soft skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_soft_library, 'soft')
wd_top_tech

Number of web dev skills in soft skill library:  20
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_soft_skills,frequency
0,communication,2602.0
1,leadership,1020.0
2,collaboration,669.0
3,problem solving,548.0
4,written communication,405.0


In [28]:
# Create a general library
wd_general_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net''critical thinking',
'communication','problem solving','teamwork','ethics','business acumen','interpersonal skills','curiosity',
'storytelling','adaptability','team player','collaboration','time management','leadership','domain knowledge',
'creativity','decision making','verbal communication','written communication','teamwork']

In [29]:
# Print the number of skills in the library
print("Number of web dev skills in general skill library: ", len(wd_general_library))

# Print the top 5 needed general skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_general_library, 'general')
wd_top_tech

Number of web dev skills in general skill library:  86
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_general_skills,frequency
0,javascript,3577.0
1,communication,2602.0
2,java,2394.0
3,sql,2148.0
4,react,1928.0


### Top 5 Skills Function