# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Geo-mapping Libraries
import geopandas
import geopy
import folium 

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### Download the JSON Files from AWS S3 Bucket

In [2]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of data scientist positions in TX
print("Last modified datetime for data scientist positions in TX: ", 
      s3.Object('dspreparedjobpostings', 'df_ds_tx_prepared_backup.json').last_modified)

s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

# Print the last modified datetime of the prepared file of web developer positions in TX
print("Last modified datetime for web developer positions in TX: ", 
      s3.Object('wdpreparedjobpostings', 'df_wd_tx_prepared_backup.json').last_modified)

Last modified datetime for data scientist positions in TX:  2021-02-23 17:42:45+00:00
Last modified datetime for web developer positions in TX:  2021-02-23 17:43:26+00:00


## Data Preparation

### Load JSON Files into Dataframe and Set the Date as the Index

In [2]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  2234
Number of web developer job postings in TX:  4285


In [3]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 2234 entries, 2021-02-23 to 2020-12-22
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            2234 non-null   object 
 1   company          2234 non-null   object 
 2   company_rating   2234 non-null   float64
 3   job_link         2234 non-null   object 
 4   job_description  2234 non-null   object 
 5   city             2234 non-null   object 
 6   state            2234 non-null   object 
 7   zipcode          2234 non-null   int64  
 8   clean            2234 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 174.5+ KB


In [4]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 4285 entries, 2021-02-23 to 2021-01-04
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            4285 non-null   object 
 1   company          4285 non-null   object 
 2   company_rating   4285 non-null   float64
 3   job_link         4285 non-null   object 
 4   job_description  4285 non-null   object 
 5   city             4285 non-null   object 
 6   state            4285 non-null   object 
 7   zipcode          4285 non-null   int64  
 8   clean            4285 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 334.8+ KB


In [6]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-23,Senior Data Scientist - ProServe,"Amazon Web Services, Inc.",3.6,https://www.indeed.com/rc/clk?jk=244b52fdf1a25...,\nBachelor’s degree in a highly quantitative f...,San Antonio,TX,0,bachelor degree highly quantitative field comp...
2021-02-23,Supply Chain Data Analyst,EthosEnergy Group,3.3,https://www.indeed.com/rc/clk?jk=83cbe651fdfb2...,Job Overview\nThe Supply Chain Data Analyst se...,Houston,TX,77092,job overview supply chain data analyst seek co...
2021-02-23,"Sr. Technical Product Manager, Digital Machine...",Apple,4.2,https://www.indeed.com/rc/clk?jk=c62329422529e...,"Summary\nPosted: Feb 22, 2021\nWeekly Hours: 4...",Austin,TX,0,summary posted feb 22 2021 weekly hour 40 role...
2021-02-23,Associate Data Scientist,Southwest Airlines Co.,4.3,https://www.indeed.com/rc/clk?jk=ca6fb6db72448...,Overview:\n\nSouthwest will provide a stable w...,Dallas,TX,75235,overview southwest provide stable work environ...
2021-02-23,Senior Data Scientist,Hewlett Packard Enterprise,3.8,https://www.indeed.com/rc/clk?jk=6cfebd31701b4...,Hewlett Packard Enterprise (HPE) advances the ...,Austin,TX,78758,hewlett packard enterprise hpe advance way peo...


In [7]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2021-02-23,UI AngularJS Bootstrap Developer,iboss,3.2,https://www.indeed.com/rc/clk?jk=7b562f4350e26...,Company Overview\niboss is a cloud security co...,Austin,TX,0,company overview iboss cloud security company ...
2021-02-23,Senior UI Developer – React,Cognizant Technology Solutions,0.0,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Senior UI Developer – React Cognizant Interact...,Dallas-Fort Worth,TX,0,senior ui developer react cognizant interactiv...
2021-02-23,Security Engineer,Box,5.0,https://www.indeed.com/rc/clk?jk=de94237091d9c...,WHAT IS BOX?\nBox is the market leader for Clo...,Austin,TX,0,box box market leader cloud content management...
2021-02-23,Behavioral Health Therapist PHP - Full Time Da...,Texas Health Resources,3.9,https://www.indeed.com/rc/clk?jk=f010cad109fdd...,Texas Health H-E-B seeks to hire a Behavioral ...,Bedford,TX,76022,texas health heb seek hire behavioral health t...
2021-02-23,Full Stack Java Software Engineer,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=106f30609958b...,"Commercial Banking serves global clients, incl...",Plano,TX,0,commercial banking serf global client includin...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [8]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    63
Facebook                          50
USAA                              50
Dell Technologies                 46
Deloitte                          46
Name: company, dtype: int64

In [9]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         653
Dallas         367
Houston        279
Plano          180
San Antonio    171
Name: city, dtype: int64

In [10]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    187
2021-01-24    352
2021-01-31    292
2021-02-07    259
2021-02-14    263
2021-02-21    111
2021-02-28     30
Freq: W-SUN, Name: title, dtype: int64

In [11]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Rekruiters            5.0
Digital Pharmacist    5.0
Onit                  4.7
Levelset              4.7
Harnham               4.6
Name: company_rating, dtype: float64

In [12]:
df_ds.title.value_counts().head()

Data Scientist               231
Senior Data Scientist         84
Machine Learning Engineer     53
Senior Data Analyst           52
Sr. Data Scientist            38
Name: title, dtype: int64

#### Web Developer Position

In [13]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            176
Indeed                                 124
JPMorgan Chase Bank, N.A.              121
Infinity Consulting Solutions, Inc.     71
Cognizant Technology Solutions          65
Name: company, dtype: int64

In [14]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     1331
Dallas      599
Houston     479
Plano       346
Name: city, dtype: int64

In [15]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1054
2021-02-07     825
2021-02-14     772
2021-02-21     347
2021-02-28      86
Freq: W-SUN, Name: title, dtype: int64

In [16]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
The Evolvers Group     5.0
Royal & Ross           5.0
MarketScale            5.0
RightNow Ministries    5.0
PEAKE                  5.0
Name: company_rating, dtype: float64

In [17]:
df_wd.title.value_counts().head()

Software Engineer           92
Web Developer               79
Senior Software Engineer    75
Software Developer          72
Full Stack Developer        63
Name: title, dtype: int64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills
- General Skills (Combination of Tech+Soft)

In [5]:
# Create a tech library
ds_tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language']

In [6]:
# Print the number of skills in the library
print("Number of data science skills in tech skill library: ", len(ds_tech_library))

# Print the top 5 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 5, ds_tech_library, 'tech')
ds_top_tech

Number of data science skills in tech skill library:  71
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_tech_skills,frequency
0,machine learning,3392.0
1,python,1798.0
2,sql,1379.0
3,aws,1040.0
4,r,1000.0


In [7]:
# Create a soft skill library
ds_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [8]:
# Print the number of skills in the library
print("Number of data science skills in soft skill library: ", len(ds_soft_library))

# Print the top 5 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 5, ds_soft_library, 'soft')
ds_top_soft

Number of data science skills in soft skill library:  20
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_soft_skills,frequency
0,communication,1542.0
1,leadership,951.0
2,collaboration,455.0
3,problem solving,344.0
4,written communication,256.0


In [9]:
# Create a general library
ds_general_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language'
                'critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [10]:
# Print the number of skills in the library
print("Number of data science skills in general skills library: ", len(ds_general_library))

# Print the top 5 needed general skills

ds_top_general = MVP_explore.top_skills(df_ds, 5, ds_general_library, 'general')
ds_top_general

Number of data science skills in general skills library:  90
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
ds


Unnamed: 0,top5_general_skills,frequency
0,machine learning,3392.0
1,python,1798.0
2,communication,1542.0
3,sql,1379.0
4,aws,1040.0


#### Web Developer Position
- Tech skills
- Soft skills
- General Skills (Combination of Tech+Soft)

In [11]:
# Create a tech library
wd_tech_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net']  

In [12]:
# Print the number of skills in the library
print("Number of web dev skills in tech skill library: ", len(wd_tech_library))

# Print the top 5 needed tech skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_tech_library, 'tech')
wd_top_tech

Number of web dev skills in tech skill library:  67
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_tech_skills,frequency
0,javascript,3438.0
1,java,2316.0
2,sql,2090.0
3,react,1849.0
4,net,1657.0


In [13]:
# Create a soft library
wd_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [14]:
# Print the number of skills in the library
print("Number of web dev skills in soft skill library: ", len(wd_soft_library))

# Print the top 5 needed soft skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_soft_library, 'soft')
wd_top_tech

Number of web dev skills in soft skill library:  20
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_soft_skills,frequency
0,communication,2488.0
1,leadership,982.0
2,collaboration,623.0
3,problem solving,533.0
4,written communication,384.0


In [15]:
# Create a general library
wd_general_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net''critical thinking',
'communication','problem solving','teamwork','ethics','business acumen','interpersonal skills','curiosity',
'storytelling','adaptability','team player','collaboration','time management','leadership','domain knowledge',
'creativity','decision making','verbal communication','written communication','teamwork']

In [16]:
# Print the number of skills in the library
print("Number of web dev skills in general skill library: ", len(wd_general_library))

# Print the top 5 needed general skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_general_library, 'general')
wd_top_tech

Number of web dev skills in general skill library:  86
Do you want to save the dataframe as JSON and upload to AWS? (Y/N)
Y
Enter the INITIALS of the job title:
wd


Unnamed: 0,top5_general_skills,frequency
0,javascript,3438.0
1,communication,2488.0
2,java,2316.0
3,sql,2090.0
4,react,1849.0
