# MVP

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# AWS Libraries
import logging
import boto3
from botocore.exceptions import ClientError

# NLP Libraries
import unicodedata
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud

# Helper Functions
import MVP_acquire_ds, MVP_explore

import warnings
warnings.filterwarnings("ignore")

## Exploration

### Download Prepared Data from AWS S3 Bucket

In [2]:
# Create the s3 resource object
s3 = boto3.resource('s3')

# Download the json files from AWS

s3.Bucket("dspreparedjobpostings").download_file("df_ds_tx_prepared_backup.json", 
                                                 "df_ds_tx_prepared_backup.json")
s3.Bucket("wdpreparedjobpostings").download_file("df_wd_tx_prepared_backup.json", 
                                                 "df_wd_tx_prepared_backup.json")

### Load Preapred Data into Dataframe and Set the Date as the Index

In [2]:
# Read the json file and convert it to pandas dataframe

df_ds = pd.read_json('df_ds_tx_prepared_backup.json')
df_wd = pd.read_json('df_wd_tx_prepared_backup.json')

# Set the date column as the index and sort the index

df_ds.date = pd.to_datetime(df_ds.date)
df_ds = df_ds.set_index('date').sort_index(ascending=False)
df_wd.date = pd.to_datetime(df_wd.date)
df_wd = df_wd.set_index('date').sort_index(ascending=False)

# Print the number of job postings in each dataframe

print("Number of data scientist job postings in TX: ", df_ds.shape[0])
print("Number of web developer job postings in TX: ", df_wd.shape[0])

Number of data scientist job postings in TX:  1597
Number of web developer job postings in TX:  3026


In [3]:
# Print the concise summary of the dataframe df_ds
df_ds.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1597 entries, 2021-02-09 to 2020-12-22
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            1597 non-null   object 
 1   company          1597 non-null   object 
 2   company_rating   1597 non-null   float64
 3   job_link         1597 non-null   object 
 4   job_description  1597 non-null   object 
 5   city             1597 non-null   object 
 6   state            1597 non-null   object 
 7   zipcode          1597 non-null   int64  
 8   clean            1597 non-null   object 
 9   tokenized        1597 non-null   object 
 10  stemmed          1597 non-null   object 
 11  lemmatized       1597 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 162.2+ KB


In [4]:
# Print the concise summary of the dataframe df_wd
df_wd.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 3026 entries, 2021-02-10 to 2021-01-04
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   title            3026 non-null   object 
 1   company          3026 non-null   object 
 2   company_rating   3026 non-null   float64
 3   job_link         3026 non-null   object 
 4   job_description  3026 non-null   object 
 5   city             3026 non-null   object 
 6   state            3026 non-null   object 
 7   zipcode          3026 non-null   int64  
 8   clean            3026 non-null   object 
 9   tokenized        3026 non-null   object 
 10  stemmed          3026 non-null   object 
 11  lemmatized       3026 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 307.3+ KB


In [5]:
# Print the first 5 rows of the dataframe df_ds
df_ds.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-09,"Director, Data Science\nnew","EmployBridge, LLC.",3.2,https://www.indeed.com/rc/clk?jk=1f65699388974...,Your Opportunity:\n\nDevelop a team and mentor...,Farmers Branch,TX,75234,opportunity develop team mentor manager develo...,your opportunity\n\ndevelop a team and mentor ...,your opportun develop a team and mentor manag ...,your opportunity develop a team and mentor man...
2021-02-09,"Associate, Data Scientist, Intelligent Forecas...",KPMG,4.0,https://www.indeed.com/rc/clk?jk=5980735216bf6...,Innovate. Collaborate. Build. Create. Solve. T...,Dallas,TX,0,innovate collaborate build create solve kpmg d...,innovate collaborate build create solve the kp...,innov collabor build creat solv the kpmg digit...,innovate collaborate build create solve the kp...
2021-02-09,Data Scientist/Machine Learning Engineer\nnew,ConnectedX Inc.,0.0,"https://www.indeed.com/company/Connectedx,-Inc...",Machine Learning/ Data ScientistLocation: Plan...,Dallas,TX,75207,machine learning data scientistlocation plano ...,machine learning data scientistlocation plano ...,machin learn data scientistloc plano txdurat l...,machine learning data scientistlocation plano ...
2021-02-09,Head of Cancer Artificial Intelligence\nnew,Larvol,0.0,https://www.indeed.com/company/The-Larvol-Grou...,Head of Cancer Artificial IntelligenceFull-tim...,Austin,TX,0,head cancer artificial intelligencefulltime 10...,head of cancer artificial intelligencefulltime...,head of cancer artifici intelligencefulltim 10...,head of cancer artificial intelligencefulltime...
2021-02-09,"Applied Researcher, NLP\nnew",eBay Inc.,3.9,https://www.indeed.com/rc/clk?jk=f1c17d175e718...,"NLP Applied Researcher, Job Description\nDo yo...",Austin,TX,0,nlp applied researcher job description want hu...,nlp applied researcher job description\ndo you...,nlp appli research job descript do you want to...,nlp applied researcher job description do you ...


In [6]:
# Print the first 5 rows of the dataframe df_wd
df_wd.head()

Unnamed: 0_level_0,title,company,company_rating,job_link,job_description,city,state,zipcode,clean,tokenized,stemmed,lemmatized
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-02-10,Infrastructure Engineer- ASP.NET Full Stack En...,Wells Fargo,3.7,https://www.indeed.com/rc/clk?jk=6b2cdf2c3a690...,Job Description\nImportant Note: During the ap...,Dallas,TX,0,job description important note application pro...,job description\nimportant note during the app...,job descript import note dure the applic proce...,job description important note during the appl...
2021-02-10,Ruby on Rails Developer\nnew,Reynolds and Reynolds,3.2,https://www.indeed.com/company/The-Reynolds-an...,As a Ruby on Rails Developer you will build so...,College Station,TX,77845,ruby rail developer build something meaningful...,as a ruby on rails developer you will build so...,as a rubi on rail develop you will build somet...,a a ruby on rail developer you will build some...
2021-02-10,Full Stack Developer - Cognizant Digital Engin...,Cognizant Technology Solutions,3.9,https://www.indeed.com/rc/clk?jk=b80fecc823648...,We are Cognizant Digital Engineering\n\nCogniz...,Dallas,TX,75201,cognizant digital engineering cognizant digita...,we are cognizant digital engineering\n\ncogniz...,we are cogniz digit engin cogniz digit engin d...,we are cognizant digital engineering cognizant...
2021-02-10,Test Engineer\nnew,Tech mahindra,3.7,https://www.indeed.com/rc/clk?jk=a2ef5088b49fc...,Skill Set: DEVICE TESTING Total Experience: 3....,Dallas,TX,0,skill set device testing total experience 300 ...,skill set device testing total experience 300 ...,skill set devic test total experi 300 to 1000 ...,skill set device testing total experience 300 ...
2021-02-10,Senior Front End Engineer\nnew,Ad Hoc Team,0.0,https://www.indeed.com/rc/clk?jk=d0cda23c1ec27...,This is a remote position.\n\nThis position wi...,San Antonio,TX,0,remote position position open accepting applic...,this is a remote position\n\nthis position wil...,thi is a remot posit thi posit will be open an...,this is a remote position this position will b...


### Brief Summary of the Job Postings

#### Data Scientist Position

In [7]:
# Which companies hire the most data scientists in TX?
df_ds.company.value_counts().head()

Cognizant Technology Solutions    57
Dell Technologies                 41
Facebook                          34
Deloitte                          32
USAA                              31
Name: company, dtype: int64

In [8]:
# Which cities have the most data scientists postions in TX?
df_ds.city.value_counts().head()

Austin         450
Dallas         261
Houston        203
Plano          124
San Antonio    122
Name: city, dtype: int64

In [9]:
# How does the number of job postings change over time?
df_ds.resample('W').title.count()

date
2020-12-27    392
2021-01-03    136
2021-01-10    212
2021-01-17    130
2021-01-24    302
2021-01-31    258
2021-02-07    143
2021-02-14     24
Freq: W-SUN, Name: title, dtype: int64

In [10]:
# Which company has the best avereage rating? 
df_ds.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Rekruiters           5.0
Levelset             4.7
DRW Trading Group    4.6
Atlassian            4.6
Collage.com          4.5
Name: company_rating, dtype: float64

#### Web Developer Position

In [11]:
# Which companies hire the most web developers in TX?
df_wd.company.value_counts().head()

CyberCoders                            125
Indeed                                 109
JPMorgan Chase Bank, N.A.               89
Infinity Consulting Solutions, Inc.     51
Cognizant Technology Solutions          41
Name: company, dtype: int64

In [12]:
# Which cities have the most web developers postions in TX?
df_wd.city.value_counts().head(4)

Austin     980
Dallas     411
Houston    326
Plano      244
Name: city, dtype: int64

In [13]:
# How does the number of job postings change over time?
df_wd.resample('W').title.count()

date
2021-01-10      62
2021-01-17     346
2021-01-24     793
2021-01-31    1047
2021-02-07     634
2021-02-14     144
Freq: W-SUN, Name: title, dtype: int64

In [14]:
# Which company has the best avereage rating? 
df_wd.groupby('company').company_rating.mean().sort_values(ascending=False).head()

company
Royal & Ross           5.0
Box                    5.0
MarketScale            5.0
RightNow Ministries    5.0
Rekruiters             5.0
Name: company_rating, dtype: float64

### Top k Needed Skills

#### Data Scientist Position
- Tech Skills
- Soft Skills

In [15]:
# Create a tech library
tech_library = ['python','sql','pandas','numpy','matplotlib','scikit learn','scikitlearn','spark','hadoop',
                'aws','amazon web services','azure','microsoft word', 'microsoft excel','excel','tableau',
                'tensor flow','pytorch','hive','impala','matlab','etl','statistics','exploration',
                'extraction','data wrangling','math','machine learning','data visualization','java','js',
                'javascript','scala','r','c','c++','power bi','dashboard','linear algebra','calculus',
                'neural networks','eda','big data','frameworks','database management','testing hypotheses',
                'probability','data mining','perl','nosql','saas','git','github','natural language processing',
                'nlp', 'deep learning','agile','kanban','project management','julia','devops','google cloud',
                'pytorch','computer vision']

In [16]:
# Print the number of skills in the library
print("Number of skills in tech skill library: ", len(tech_library))

# Print the top 20 needed tech skills

ds_top_tech = MVP_explore.top_skills(df_ds, 20, tech_library)
ds_top_tech

Number of skills in tech skill library:  64


Unnamed: 0,frequency
machine learning,2521.0
python,1329.0
sql,1012.0
r,760.0
aws,689.0
big data,622.0
spark,569.0
hadoop,539.0
c,442.0
java,439.0


In [17]:
# Create a soft skill library
soft_library = ['critical thinking','communication','problem solving','teamwork','ethics','business acumen',
                'interpersonal skills','curiosity','storytelling','adaptability','team player','collaboration',
                'time management','leadership','domain knowledge','creativity','decision making',
                'verbal communication','written communication']

In [18]:
# Print the number of skills in the library
print("Number of skills in soft skill library: ", len(soft_library))

# Print the top 10 needed soft skills

ds_top_soft = MVP_explore.top_skills(df_ds, 10, soft_library)
ds_top_soft

Number of skills in soft skill library:  19


Unnamed: 0,frequency
communication,1122.0
leadership,707.0
collaboration,348.0
problem solving,244.0
written communication,199.0
decision making,146.0
verbal communication,109.0
curiosity,93.0
creativity,91.0
team player,84.0


#### Web Developer Position
- Tech skills
- Soft skills

In [6]:
def upload_to_S3_bucket(file_name, bucket='dspreparedjobpostings', object_name=None):
    """
    Upload a file to an S3 bucket
    
    ***Prepared data files must be in JSON format***

    Parameters
    ----------
    file_name: str
        Name of the file to upload.
    
    bucket: str, default="dspreparedjobpostings"
        S3 Bucket the file will be uploaded to.
    
    object_name: str, default=None
        The file name that will appear in AWS S3 bucket.
        If an object_name is not specified, the file will
        have the same name as the file_name
    
    Returns
    -------
    True or False: bool
        True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = file_name

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [7]:
upload_to_S3_bucket("df_ds_tx_prepared.json")

NoCredentialsError: Unable to locate credentials