# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Geo-mapping Libraries
import geopandas
import geopy
import folium 

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Data Acquisition

### Download the JSON Files from AWS S3 Bucket

In [2]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")
s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

## Data Preparation

### Load JSON Files into Dataframe and Set the Date as the Index

In [3]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  1785
Number of web developer job postings in TX:  3190


In [4]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1785 entries, 2021-02-12 to 2020-12-22
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1785 non-null   object 
 1   company          1785 non-null   object 
 2   company_rating   1785 non-null   float64
 3   job_link         1785 non-null   object 
 4   job_description  1785 non-null   object 
 5   city             1785 non-null   object 
 6   state            1785 non-null   object 
 7   zipcode          1785 non-null   int64  
 8   clean            1785 non-null   object 
 9   tokenized        1785 non-null   object 
 10  stemmed          1785 non-null   object 
 11  lemmatized       1785 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 181.3+ KB


In [5]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3190 entries, 2021-02-11 to 2021-01-04
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3190 non-null   object 
 1   company          3190 non-null   object 
 2   company_rating   3190 non-null   float64
 3   job_link         3190 non-null   object 
 4   job_description  3190 non-null   object 
 5   city             3190 non-null   object 
 6   state            3190 non-null   object 
 7   zipcode          3190 non-null   int64  
 8   clean            3190 non-null   object 
 9   tokenized        3190 non-null   object 
 10  stemmed          3190 non-null   object 
 11  lemmatized       3190 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 324.0+ KB


In [6]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-12,Data Science Product Owner\nnew,Verizon,3.9,https://www.indeed.com/rc/clk?jk=0f466d3515fc2...,When you join Verizon\nVerizon is a leading pr...,Irving,TX,75038,join verizon verizon leading provider technolo...,when you join verizon\nverizon is a leading pr...,when you join verizon verizon is a lead provid...,when you join verizon verizon is a leading pro...
2021-02-12,Data Scientist Sr. Associate\nnew,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=8c3aa56c31f7b...,The Data Scientist is an individual contributo...,Lewisville,TX,0,data scientist individual contributor able app...,the data scientist is an individual contributo...,the data scientist is an individu contributor ...,the data scientist is an individual contributo...
2021-02-12,Data Scientist- LCT Analytics (Product Develop...,Blue Yonder,4.4,https://www.indeed.com/rc/clk?jk=2d96d697ed4bc...,Role: Data Scientist\nTeam : Product Developme...,Dallas,TX,0,role data scientist team product developmentlc...,role data scientist\nteam product developmentl...,role data scientist team product developmentlc...,role data scientist team product developmentlc...
2021-02-12,Data Scientist\nnew,Integrative Emergency Services,0.0,https://www.indeed.com/rc/clk?jk=842f40ce8bde2...,"Mission\nIntegrative Emergency Services, LLC (...",Fort Worth,TX,76104,mission integrative emergency service llc y de...,mission\nintegrative emergency services llc ie...,mission integr emerg servic llc i is dedic to ...,mission integrative emergency service llc y is...
2021-02-12,Data Scientist\nnew,CDK Global,3.2,https://www.indeed.com/rc/clk?jk=8c0b900116dec...,Accelerate Your Career\nDrive global technolog...,Austin,TX,78730,accelerate career drive global technology 2 bi...,accelerate your career\ndrive global technolog...,acceler your career drive global technolog wit...,accelerate your career drive global technology...


In [7]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-11,Front-End Web Developer (Wordpress/Shopify)\nnew,Explore Digital,0.0,https://www.indeed.com/pagead/clk?mo=r&ad=-6NY...,Who We're Looking ForA highly skilled Full-Tim...,0,TX,0,looking forum highly skilled fulltime web deve...,who were looking fora highly skilled fulltime ...,who were look fora highli skill fulltim web de...,who were looking forum highly skilled fulltime...
2021-02-11,Web Application Developer\nnew,Talenti Qo Workforce and RPO Services PVT LTD.,0.0,https://www.indeed.com/company/Talenti-Qo-Work...,US Citizen & Green-card Holder onlyAbout This ...,Dallas-Fort Worth,TX,0,u citizen greencard holder onlyabout position ...,us citizen greencard holder onlyabout this pos...,us citizen greencard holder onlyabout thi posi...,u citizen greencard holder onlyabout this posi...
2021-02-11,UI Software Engineer (Web React & Desktop Pyth...,"JPMorgan Chase Bank, N.A.",3.9,https://www.indeed.com/rc/clk?jk=de2fe567a395e...,We are searching for highly motivated front-en...,Houston,TX,0,searching highly motivated frontend software e...,we are searching for highly motivated frontend...,we are search for highli motiv frontend softwa...,we are searching for highly motivated frontend...
2021-02-11,Sr File Maker developer\nnew,Nava Software Solutions LLC,0.0,https://www.indeed.com/company/Nava-Software-S...,Sr File Maker developerLocation: Houston TX(On...,Houston,TX,0,sr file maker developerlocation houston txonly...,sr file maker developerlocation houston txonly...,sr file maker developerloc houston txonli loca...,sr file maker developerlocation houston txonly...
2021-02-11,Web Design Director\nnew,National Instruments,3.7,https://www.indeed.com/rc/clk?jk=1243b577bcf6f...,Why NI?\nThere are many reasons to consider jo...,Austin,TX,0,ni many reason consider joining company key am...,why ni\nthere are many reasons to consider joi...,whi ni there are mani reason to consid join a ...,why ni there are many reason to consider joini...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [8]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    59
Dell Technologies                 43
Deloitte                          36
Facebook                          35
USAA                              34
Name: company, dtype: int64

In [9]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         507
Dallas         287
Houston        228
San Antonio    141
Plano          133
Name: city, dtype: int64

In [10]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    174
2021-01-24    312
2021-01-31    274
2021-02-07    194
2021-02-14     91
Freq: W-SUN, Name: title, dtype: int64

In [11]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Digital Pharmacist    5.0
Rekruiters            5.0
Levelset              4.7
Harnham               4.6
DRW Trading Group     4.6
Name: company_rating, dtype: float64

#### Web Developer Position

In [12]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            134
Indeed                                 112
JPMorgan Chase Bank, N.A.               91
Infinity Consulting Solutions, Inc.     51
Cognizant Technology Solutions          42
Name: company, dtype: int64

In [13]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     1029
Dallas      431
Houston     350
Plano       255
Name: city, dtype: int64

In [14]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1050
2021-02-07     695
2021-02-14     244
Freq: W-SUN, Name: title, dtype: int64

In [15]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
MarketScale            5.0
Box                    5.0
RightNow Ministries    5.0
Royal & Ross           5.0
Rekruiters             5.0
Name: company_rating, dtype: float64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills
- General Skills (Combination of Tech+Soft)

In [16]:
# Create a tech library
ds_tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language']

In [17]:
# Print the number of skills in the library
print("Number of data science skills in tech skill library: ", len(ds_tech_library))

# Print the top 5 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 5, ds_tech_library)
ds_top_tech

Number of data science skills in tech skill library:  71


Unnamed: 0,frequency
machine learning,2820.0
python,1474.0
sql,1130.0
r,843.0
aws,795.0


In [18]:
# Create a soft skill library
ds_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [19]:
# Print the number of skills in the library
print("Number of data science skills in soft skill library: ", len(ds_soft_library))

# Print the top 5 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 5, ds_soft_library)
ds_top_soft

Number of data science skills in soft skill library:  20


Unnamed: 0,frequency
communication,1256.0
leadership,776.0
collaboration,376.0
problem solving,275.0
written communication,221.0


In [20]:
# Create a general library
ds_general_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision', 'deep neural networks','neural networks','amazon web services',
                'natural language processing','extract,transform,load','mysql','structured query language'
                'critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork']

In [21]:
# Print the number of skills in the library
print("Number of data science skills in general skills library: ", len(ds_general_library))

# Print the top 5 needed general skills

ds_top_general = MVP_explore.top_skills(df_ds, 5, ds_general_library)
ds_top_general

Number of data science skills in general skills library:  90


Unnamed: 0,frequency
machine learning,2820.0
python,1474.0
communication,1256.0
sql,1130.0
r,843.0


#### Web Developer Position
- Tech skills
- Soft skills
- General Skills (Combination of Tech+Soft)

In [22]:
# Create a tech library
wd_tech_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net']  

In [23]:
# Print the number of skills in the library
print("Number of web dev skills in tech skill library: ", len(wd_tech_library))

# Print the top 5 needed tech skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_tech_library)
wd_top_tech

Number of web dev skills in tech skill library:  67


Unnamed: 0,frequency
javascript,2533.0
java,1657.0
sql,1572.0
react,1375.0
net,1260.0


In [24]:
# Create a soft library
wd_soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication','teamwork'] 

In [25]:
# Print the number of skills in the library
print("Number of web dev skills in soft skill library: ", len(wd_soft_library))

# Print the top 5 needed soft skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_soft_library)
wd_top_tech

Number of web dev skills in soft skill library:  20


Unnamed: 0,frequency
communication,1823.0
leadership,655.0
collaboration,423.0
problem solving,387.0
written communication,288.0


In [26]:
# Create a general library
wd_general_library = ['html','css','javascript','debugging','git','github','libraries','frameworks','publishing web site',
'photoshop','bootstrap','jquery','search engine optimization','seo','basic graphic design','front end','back end',
'visual studio code','visual studio','figma','zeplin','sketch','gatspy','strapi','paint','canva','aws','azure',
'amazon web services','sql','mysql','nosql','node','node.js','js','json','api','google charts','d3','d3.js','react',
'angular','ember','vue','python','c','c++','ruby','ruby on rails','git','github','php','net','.net','java','c#',
'linux','go','gcp','troubleshooting','problem solving','ux','ui','ux/ui','cloud computing','netlify','net''critical thinking',
'communication','problem solving','teamwork','ethics','business acumen','interpersonal skills','curiosity',
'storytelling','adaptability','team player','collaboration','time management','leadership','domain knowledge',
'creativity','decision making','verbal communication','written communication','teamwork']

In [27]:
# Print the number of skills in the library
print("Number of web dev skills in general skill library: ", len(wd_general_library))

# Print the top 5 needed general skills

wd_top_tech = MVP_explore.top_skills(df_wd, 5, wd_general_library)
wd_top_tech

Number of web dev skills in general skill library:  86


Unnamed: 0,frequency
javascript,2533.0
communication,1823.0
java,1657.0
sql,1572.0
react,1375.0


### Uploads

In [28]:
def upload_to_S3_bucket(file_name, bucket='dspreparedjobpostings', object_name=None):
    """
    Upload a file to an S3 bucket
    
    ***Prepared data files must be in JSON format***

    Parameters
    ----------
    file_name: str
        Name of the file to upload.
    
    bucket: str, default="dspreparedjobpostings"
        S3 Bucket the file will be uploaded to.
    
    object_name: str, default=None
        The file name that will appear in AWS S3 bucket.
        If an object_name is not specified, the file will
        have the same name as the file_name
    
    Returns
    -------
    True or False: bool
        True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [None]:
upload_to_S3_bucket("df_ds_tx_prepared.json")