In [1]:
import pprint
import re
import string
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction import text

In [2]:
# Load the raw postings table; the path is resolved relative to the kernel's working directory.
ds_df = pd.read_csv('ds3k.csv')

In [3]:
# Sanity check of the loaded frame: (rows, columns) first, then the column labels.
print(ds_df.shape)
print(ds_df.columns)

(3000, 12)
Index(['Job Title', 'Salary Estimate', 'Job Description', 'Rating',
       'Company Name', 'Location', 'Size', 'Founded', 'Type of ownership',
       'Industry', 'Sector', 'Revenue'],
      dtype='object')


#### Rename Columns

In [4]:
# Swap spaces for underscores so every column is reachable as an attribute (ds_df.Job_Title).
ds_df = ds_df.rename(columns=lambda label: label.replace(' ', '_'))
ds_df.columns

Index(['Job_Title', 'Salary_Estimate', 'Job_Description', 'Rating',
       'Company_Name', 'Location', 'Size', 'Founded', 'Type_of_ownership',
       'Industry', 'Sector', 'Revenue'],
      dtype='object')

In [5]:
# Preview the first rows after the column rename.
ds_df.head()

Unnamed: 0,Job_Title,Salary_Estimate,Job_Description,Rating,Company_Name,Location,Size,Founded,Type_of_ownership,Industry,Sector,Revenue
0,"Senior Data Scientist (San Francisco, CA)",$108K-$170K\n(Glassdoor est.),Earnest empowers people with the financial cap...,3.8,Earnest\n3.8,"San Francisco, CA",51 to 200 Employees,2013,Subsidiary or Business Segment,Lending,Finance,Unknown / Non-Applicable
1,Senior Machine Learning Data Scientist - Telec...,$80K-$129K\n(Glassdoor est.),Great outcomes are the result of strong purpos...,3.4,Unitedhealth Group\n3.4,"Eden Prairie, MN",10000+ Employees,1977,Company - Public,Health Care Services & Hospitals,Health Care,$10+ billion (USD)
2,Member Data Analyst,$43K-$77K\n(Glassdoor est.),"Blue Federal Credit Union employees work hard,...",4.1,Blue Federal Credit Union\n4.1,"Cheyenne, WY",201 to 500 Employees,-1,Other Organization,Banks & Credit Unions,Finance,Unknown / Non-Applicable
3,"Environmental - Geologist, Scientist or Engine...",-1,"Antea USA, Inc. (Antea®Group) is seeking motiv...",4.0,Antea Group\n4.0,United States,201 to 500 Employees,1986,Company - Private,Architectural & Engineering Services,Business Services,$50 to $100 million (USD)
4,Sr Data-Applied Scientist-PCCI,$12K-$112K\n(Glassdoor est.),"Looking for your next opportunity? If so, choo...",4.4,Parkland Center for Clinical Innovation\n4.4,"Dallas, TX",1 to 50 Employees,2012,Nonprofit Organization,IT Services,Information Technology,Unknown / Non-Applicable


#### Cleanup Salary_Estimate data
- Drop the rows that do not have salary estimate
- Remove additional text attached to salary value

In [6]:
# Drop the rows that do not have a salary estimate (stored as the string '-1').
# .copy() detaches the result from the original frame so the column
# assignments in later cells do not raise SettingWithCopyWarning.
ds_df = ds_df[ds_df['Salary_Estimate'] != '-1'].copy()
print(ds_df.shape)

(2317, 12)


In [7]:
# Remove the "Employer Provided Salary:" prefix line. regex=False makes the
# match a plain literal on every pandas version.
ds_df['Salary_Estimate'] = ds_df['Salary_Estimate'].str.replace(
    'Employer Provided Salary:\n', '', regex=False
)

In [8]:
# Keep only the text before the first newline, discarding trailing notes
# such as "(Glassdoor est.)".
ds_df['Salary_Estimate'] = ds_df['Salary_Estimate'].str.split('\n').str[0]

In [9]:
# Inspect the rows whose salary is quoted per hour rather than per year.
ds_df[ds_df['Salary_Estimate'].str.contains("Per Hour")]

Unnamed: 0,Job_Title,Salary_Estimate,Job_Description,Rating,Company_Name,Location,Size,Founded,Type_of_ownership,Industry,Sector,Revenue
39,Data Analyst,$24-$34 Per Hour,StarsHR has the following contract opportunity...,4.6,StarsHR\n4.6,"Tucson, AZ",1 to 50 Employees,2007,Company - Private,Staffing & Outsourcing,Business Services,Unknown / Non-Applicable
50,"Clinical Laboratory Scientist, Per Diem, Varia...",$38-$61 Per Hour,Adventist Health is more than an award-winning...,3.5,Adventist Health\n3.5,"Lancaster, CA",10000+ Employees,1973,Nonprofit Organization,Health Care Services & Hospitals,Health Care,Unknown / Non-Applicable
53,Medical Lab Scientist - MLS,$20-$28 Per Hour,We Are Hiring\n\nMedical Lab Scientist (MLS) -...,3.3,Ascension\n3.3,"Wichita, KS",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
56,Medical Technologist/Laboratory Scientist,$11-$21 Per Hour,We at Biomedical Laboratories are searching fo...,2.5,Biomedical Laboratories\n2.5,"Hubbard, OH",501 to 1000 Employees,1973,Company - Public,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Less than $1 million (USD)
57,MLS - Medical Lab Scientist,$20-$28 Per Hour,We Are Hiring\n\nMLS Medical Lab Scientist - G...,3.3,Ascension\n3.3,"Wichita, KS",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
66,MLS Medical Laboratory Scientist,$19-$26 Per Hour,We Are Hiring\n\nMLS Medical Laboratory Scient...,3.3,Ascension\n3.3,"Pittsburg, KS",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
69,MLS Medical Lab Scientist - Part Time Nights/E...,$19-$26 Per Hour,We Are Hiring\n\nMLS Medical Lab Scientist - G...,3.3,Ascension\n3.3,"Pittsburg, KS",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
70,MLS - Medical Lab Scientist - PRN,$20-$28 Per Hour,We Are Hiring\n\nMLS Medical Lab Scientist - G...,3.3,Ascension\n3.3,"Wichita, KS",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
73,Medical Lab Scientist- Freestanding ER,$20-$27 Per Hour,We Are Hiring\n\nMedical Lab Scientist (MLS) -...,3.3,Ascension\n3.3,"Pensacola, FL",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)
75,Medical Lab Scientist-Freestanding ER-FT,$20-$27 Per Hour,We Are Hiring\n\nMedical Lab Scientist (MLS) -...,3.3,Ascension\n3.3,"Pensacola, FL",10000+ Employees,1999,Nonprofit Organization,Health Care Services & Hospitals,Health Care,$100 to $500 million (USD)


In [10]:
# 1 when the estimate is an hourly rate, 0 otherwise (case-insensitive check).
ds_df['hourly'] = (
    ds_df['Salary_Estimate']
    .str.lower()
    .str.contains('per hour', regex=False)
    .astype(int)
)

In [11]:
# Strip the unit markers so only "<min>-<max>" remains.
# regex=False is required: '$' is the end-of-string anchor when interpreted
# as a regular expression, and the default for `regex` differs between
# pandas versions.
ds_df['Salary_Estimate'] = (
    ds_df['Salary_Estimate']
    .str.replace('Per Hour', '', regex=False)
    .str.replace('K', '', regex=False)
    .str.replace('$', '', regex=False)
)

In [12]:
# Repeats the 'K' / '$' stripping already done in the cell above, so this is
# a no-op when the notebook runs in order. regex=False keeps '$' from being
# read as the end-of-string anchor.
ds_df['Salary_Estimate'] = (
    ds_df['Salary_Estimate']
    .str.replace('K', '', regex=False)
    .str.replace('$', '', regex=False)
)

In [13]:
# Split "<min>-<max>" once and convert each side to an integer.
# int() tolerates the trailing space left behind when 'Per Hour' was removed.
salary_bounds = ds_df['Salary_Estimate'].str.split('-')
ds_df['min_salary'] = salary_bounds.str[0].map(int)
ds_df['max_salary'] = salary_bounds.str[1].map(int)

In [14]:
# Hourly pay to annual conversion: hourly rows are multiplied by 2, all other
# rows are left untouched.
# NOTE(review): presumably $/hour * ~2000 h = $K/year — confirm the factor.
# Re-running this cell doubles the hourly rows again.
is_hourly = ds_df['hourly'] == 1
for bound in ('min_salary', 'max_salary'):
    ds_df[bound] = ds_df[bound].where(~is_hourly, ds_df[bound] * 2)

In [15]:
# Midpoint of the salary range.
ds_df['avg_salary'] = ds_df[['min_salary', 'max_salary']].mean(axis=1)

In [16]:
# The company name is followed by "\n<rating>"; keep only the name line.
ds_df['Company_Name'] = ds_df['Company_Name'].str.split('\n').str[0]

In [17]:
# List locations that have no comma, i.e. cannot be split into "City, State".
ds_df[~ds_df["Location"].str.contains(",")]

Unnamed: 0,Job_Title,Salary_Estimate,Job_Description,Rating,Company_Name,Location,Size,Founded,Type_of_ownership,Industry,Sector,Revenue,hourly,min_salary,max_salary,avg_salary
6,Senior Azure Data Engineer - Minneapolis,90-165,Data & Analytics Professional: Transform Infor...,4.3,Magenic,St Louis Park,501 to 1000 Employees,1995,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),0,90,165,127.5


In [18]:
# Manually add the missing state to the one location found without a comma,
# selecting the row(s) by job title, then display the result to confirm.
ds_df.loc[ds_df['Job_Title'] == 'Senior Azure Data Engineer - Minneapolis', 'Location'] = 'St Louis Park, MN'
ds_df[ds_df["Location"].str.contains("St Louis Park")]

Unnamed: 0,Job_Title,Salary_Estimate,Job_Description,Rating,Company_Name,Location,Size,Founded,Type_of_ownership,Industry,Sector,Revenue,hourly,min_salary,max_salary,avg_salary
6,Senior Azure Data Engineer - Minneapolis,90-165,Data & Analytics Professional: Transform Infor...,4.3,Magenic,"St Louis Park, MN",501 to 1000 Employees,1995,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),0,90,165,127.5


In [19]:
# Split "City, ST" into its parts.
# - strip() removes the space that follows the comma (" CA" -> "CA"), so
#   state labels group and plot cleanly.
# - The state is taken from the last field, so "Area, City, ST" still yields
#   "ST"; the city remains the first field.
# - A location without any comma gets a missing State instead of an IndexError.
location_parts = ds_df['Location'].str.split(',')
ds_df['City'] = location_parts.str[0].str.strip()
ds_df['State'] = (
    location_parts.str[-1].str.strip().where(location_parts.str.len() > 1)
)

In [20]:
# Company age relative to 2020. Negative Founded values (the -1 placeholder)
# are passed through unchanged rather than turned into an age.
founded = ds_df['Founded']
ds_df['Years_Old'] = founded.where(founded < 0, 2020 - founded)

In [21]:
# Normalise the raw titles: every non-word character becomes a space and the
# text is upper-cased so the keyword checks below are case-insensitive.
# The pattern is a raw string with regex=True spelled out, because the default
# for `regex` changed in pandas 2.0 and '\W' would otherwise be read literally.
ds_df['Job_Title'] = (
    ds_df['Job_Title'].str.replace(r'\W', ' ', regex=True).str.upper()
)

# Ordered (keyword, category) rules, applied one after another to the current
# column. A row that has already been mapped holds a mixed-case category name,
# so it can no longer match any of the upper-case keywords further down.
title_rules = [
    ('SCIENTIST', 'Data Scientist'),
    ('DATA SCIENCE', 'Data Scientist'),
    ('MACHINE LEARNING', 'Machine Learning Engineer'),
    ('ANALYST', 'Data Analyst'),
    ('STATISTICIAN', 'Data Analyst'),
    ('ANALYTICS', 'Data Analyst'),
    ('DATA ENGINEER', 'Data Engineer'),
    ('MANAGER', 'Data Science Manager'),
    ('CONSULTANT', 'Data Science Manager'),
    ('DIRECTOR', 'Data Science Manager'),
    ('PRINCIPAL', 'Data Science Manager'),
]
for keyword, category in title_rules:
    matches_keyword = ds_df['Job_Title'].str.contains(keyword, regex=False)
    ds_df.loc[matches_keyword, 'Job_Title'] = category

# Anything that matched no rule becomes 'Others'. This replaces the previous
# assignment of a whole filtered DataFrame to a single column, which current
# pandas rejects.
known_titles = {category for _, category in title_rules}
ds_df['Job_Title'] = ds_df['Job_Title'].where(
    ds_df['Job_Title'].isin(known_titles), 'Others'
)
ds_df.head()

Unnamed: 0,Job_Title,Salary_Estimate,Job_Description,Rating,Company_Name,Location,Size,Founded,Type_of_ownership,Industry,Sector,Revenue,hourly,min_salary,max_salary,avg_salary,City,State,Years_Old
0,Data Scientist,108-170,Earnest empowers people with the financial cap...,3.8,Earnest,"San Francisco, CA",51 to 200 Employees,2013,Subsidiary or Business Segment,Lending,Finance,Unknown / Non-Applicable,0,108,170,139.0,San Francisco,CA,7
1,Data Scientist,80-129,Great outcomes are the result of strong purpos...,3.4,Unitedhealth Group,"Eden Prairie, MN",10000+ Employees,1977,Company - Public,Health Care Services & Hospitals,Health Care,$10+ billion (USD),0,80,129,104.5,Eden Prairie,MN,43
2,Data Analyst,43-77,"Blue Federal Credit Union employees work hard,...",4.1,Blue Federal Credit Union,"Cheyenne, WY",201 to 500 Employees,-1,Other Organization,Banks & Credit Unions,Finance,Unknown / Non-Applicable,0,43,77,60.0,Cheyenne,WY,-1
4,Data Scientist,12-112,"Looking for your next opportunity? If so, choo...",4.4,Parkland Center for Clinical Innovation,"Dallas, TX",1 to 50 Employees,2012,Nonprofit Organization,IT Services,Information Technology,Unknown / Non-Applicable,0,12,112,62.0,Dallas,TX,8
6,Data Engineer,90-165,Data & Analytics Professional: Transform Infor...,4.3,Magenic,"St Louis Park, MN",501 to 1000 Employees,1995,Company - Private,IT Services,Information Technology,$100 to $500 million (USD),0,90,165,127.5,St Louis Park,MN,25


In [22]:
# Number of postings per normalised job-title category.
ds_df['Job_Title'].value_counts()

Data Scientist               1451
Data Analyst                  338
Data Engineer                 277
Others                        232
Machine Learning Engineer      12
Data Science Manager            7
Name: Job_Title, dtype: int64

#### Job Description Analysis

In [23]:
# Print one full description as a sample. [0] is a lookup by index label 0,
# not by position.
print(ds_df['Job_Description'][0])

Earnest empowers people with the financial capital they need to live better lives.

We're an accomplished team of technology, finance, and design geeks who believe consumer lending can be radically improved and are doing something about it. We are disrupting the trillion dollar student loan industry by redefining what it means to be creditworthy. We created a company that combines data science, streamlined design, and technology to:
Build products that simplify the lending process.
Personalize loans to suit the needs of our customers.
Engage with our customers through more human experiences.
Our culture is one that values transparency and blameless problem solving. Earnest has a strong track record of employee growth and career progression. Earnies are empathetic, product-focused, proactive, and curious.

As the Senior Data Scientist, you will report to the Head of Analytics and:
Prototype machine learning models that encapsulate best practices around feature engineering, model selecti

In [24]:
# Combine the descriptions by job title: one row per title holding all of its
# descriptions joined with spaces.
# Only Job_Description is aggregated. Joining every column would call
# ' '.join on the numeric columns too, which raises TypeError on current pandas.
data = (
    ds_df.groupby('Job_Title')['Job_Description']
    .agg(' '.join)
    .to_frame()
)

In [25]:
# Create a function to clean text data
def clean_text(text):
    """Return a lower-cased, simplified copy of ``text``.

    Steps, in order:
      1. delete every ASCII punctuation character and lower-case the result;
      2. delete any word that contains a digit;
      3. delete curly quotes and the ellipsis character;
      4. replace each newline with a single space.

    Characters are deleted, not replaced by spaces, so "product-focused"
    becomes "productfocused" and runs of spaces are not collapsed.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw strings keep backslash sequences such as \w and \d intact without
    # triggering Python's invalid-escape warnings.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text).lower()
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub(r'[‘’“”…]', '', text)
    text = re.sub(r'\n', ' ', text)
    return text

In [26]:
# Clean each combined description, then drop the catch-all 'Others' title.
def clean(x):
    return clean_text(x)

df_clean = data['Job_Description'].map(clean).to_frame()
df_clean = df_clean.drop(index='Others', errors='ignore')

In [27]:
# Lemmatize the text: tokenize each description, lemmatize every token and
# join the tokens back into one space-separated string.
lemmer = WordNetLemmatizer()
df_clean['Job_Description'] = df_clean['Job_Description'].map(
    lambda description: ' '.join(
        lemmer.lemmatize(token) for token in word_tokenize(description)
    )
)

In [28]:
# Extend scikit-learn's English stop words with terms that appear in almost
# every description and therefore carry no signal.
extra_stopword = ['data', 'experience', 'work', 'team', 'will', 'skill', 'year', 'skills']
stop_words = text.ENGLISH_STOP_WORDS | frozenset(extra_stopword)

In [29]:
# Preview the cleaned, lemmatized description for each job title.
df_clean.head()

Unnamed: 0_level_0,Job_Description
Job_Title,Unnamed: 1_level_1
Data Analyst,blue federal credit union employee work hard h...
Data Engineer,data analytics professional transform informat...
Data Science Manager,who we are the analytic data and machine learn...
Data Scientist,earnest empowers people with the financial cap...
Machine Learning Engineer,position purpose this position is responsible ...


In [43]:
# Display the whole cleaned frame (one row per job title).
# NOTE(review): shows the same rows as the df_clean.head() cell above and was executed out of order.
df_clean

Unnamed: 0_level_0,Job_Description
Job_Title,Unnamed: 1_level_1
Data Analyst,blue federal credit union employee work hard h...
Data Engineer,data analytics professional transform informat...
Data Science Manager,who we are the analytic data and machine learn...
Data Scientist,earnest empowers people with the financial cap...
Machine Learning Engineer,position purpose this position is responsible ...


#### Years of experience

In [30]:
# All job descriptions as an array; the regex searches in the cells below iterate over it.
# NOTE(review): this rebinds `text`, shadowing the sklearn `text` module imported
# at the top. Re-running the stop-words cell (text.ENGLISH_STOP_WORDS) after
# this cell will fail until the kernel is restarted.
text = ds_df.Job_Description.values
text

array(["Earnest empowers people with the financial capital they need to live better lives.\n\nWe're an accomplished team of technology, finance, and design geeks who believe consumer lending can be radically improved and are doing something about it. We are disrupting the trillion dollar student loan industry by redefining what it means to be creditworthy. We created a company that combines data science, streamlined design, and technology to:\nBuild products that simplify the lending process.\nPersonalize loans to suit the needs of our customers.\nEngage with our customers through more human experiences.\nOur culture is one that values transparency and blameless problem solving. Earnest has a strong track record of employee growth and career progression. Earnies are empathetic, product-focused, proactive, and curious.\n\nAs the Senior Data Scientist, you will report to the Head of Analytics and:\nPrototype machine learning models that encapsulate best practices around feature engineeri

In [31]:
# Print out the first 5 examples of matches.
# The limit is checked right after each printed example, so exactly
# max_examples are shown even when one description holds several matches.
max_examples = 5
# 1-2 digits, an optional '+', then " year" (also matches "years").
year_pattern = re.compile(r"\d{1,2}\+? year")
limit = 0
for t in text:
    for sentence in t.split('\n'):
        if 'experience' not in sentence:
            continue
        year = year_pattern.findall(sentence)
        if len(year) != 1:
            # Skip sentences with no figure or with several competing figures.
            continue
        print(year[0])
        print(sentence)
        print("*"*123)
        limit += 1
        if limit >= max_examples:
            break
    if limit >= max_examples:
        break

2+ year
2+ years experience building production-ready machine learning models and systems
***************************************************************************************************************************
2+ year
2+ years of experience implementing Azure Data Solutions for high-availability, disaster recovery and global distribution
***************************************************************************************************************************
10 year
· 3 - 10 years of related experience
***************************************************************************************************************************
10+ year
· 10+ years of related experience
***************************************************************************************************************************
3+ year
3+ years' experience in data analysis and visualization
***************************************************************************************************************************


In [32]:
# Compile the year values found into a list of integers.
# A sentence contributes only when it mentions 'experience' and contains
# exactly one "<N> year" / "<N>+ year" figure.
year_pattern = re.compile(r"\d{1,2}\+? year")
experience_req = []
for t in text:
    for sentence in t.split('\n'):
        if 'experience' not in sentence:
            continue
        year = year_pattern.findall(sentence)
        if len(year) == 1:
            # "10+ year" -> "10+" -> 10
            experience_req.append(int(year[0].split(' ')[0].rstrip('+')))

# Remove outliers (20 years or more).
# A new list is built instead of calling pop() while enumerating: popping
# shifts the remaining items left, so the element after each removed outlier
# was never checked and consecutive outliers could survive.
experience_req = [years for years in experience_req if years < 20]

In [42]:
# (rows, columns) of the cleaned per-title description frame.
df_clean.shape

(5, 1)

#### Skill requirement

In [None]:
# Create a regex search function
def count_text(patt, text):
    """Count how many items of ``text`` contain a match for ``patt``.

    ``patt`` is compiled once and each item is tested with ``search``, so an
    item counts at most once however many matches it contains.
    """
    matcher = re.compile(patt)
    return sum(1 for document in text if matcher.search(document))

In [None]:
# Create a data frame with skill names and the regex pattern used to search for each.
skills = ['R', 'Python', 'Hadoop', 'SQL', 'Tableau', 'NoSQL', 'Power BI', 'SAS', 'Hive', 'Spark',
          'Java', 'JavaScript']

# Patterns are raw strings anchored with \b (word boundary) on both sides:
# - \b also matches at the very start or end of a description, where the old
#   leading \W could not;
# - the old optional trailing \W? enforced nothing, so "Hive" matched "hives"
#   and "SAS" matched "sass";
# - "Spark?" made the final k optional and matched words such as "sparse";
# - "Java\w*" also counted "JavaScript", which has its own row.
# 'R' stays case-sensitive. 'SQL' is unchanged and still matches any token
# containing "SQL" (MySQL, NoSQL, ...).
skill_patt = [r'\bR\b', r'(?i)\bPython\b', r'(?i)\bHadoop\b', r'(?i)SQL\w*', r'(?i)\bTableau\b',
              r'(?i)\bNoSQL\b', r'(?i)\bPower\s?BI\b',
              r'(?i)\bSAS\b', r'(?i)\bHive\b', r'(?i)\bSpark\b', r'(?i)\bJava\b', r'(?i)\bJavaScript\b']

skill_df = pd.DataFrame(
    {"skill": skills,
     "regex_pattern": skill_patt})

In [None]:
# Count, for every skill pattern, how many descriptions mention it, and
# express that as a share of all descriptions (rounded to 2 decimals).
skill_df['count'] = [count_text(skill_regex, text) for skill_regex in skill_df['regex_pattern']]
skill_df['pct'] = (skill_df['count'] / len(text)).round(2)
skill_df

#### Education requirement

In [None]:
# Count the descriptions that mention a PhD or a doctorate (each description counts once).
# Raw strings avoid Python's invalid-escape warning for '\W'.
# NOTE(review): the '.' in 'Ph.?D' is unescaped, so it accepts any single
# character between "Ph" and "D", not only a literal dot.
pattern = re.compile(r'(?i)\WPh.?D\W')
pattern2 = re.compile(r'(?i)\WDoctorate\W')
count = sum(1 for t in text if pattern.search(t) or pattern2.search(t))
degree = {"PhD": count}

In [None]:
# Count the descriptions that mention a Master's degree (each description counts once).
# Raw strings avoid Python's invalid-escape warning for '\W'.
# NOTE(review): 'M.?S' is case-insensitive with an unescaped '.', so it also
# accepts tokens such as "ms" or "M&S" — confirm this breadth is intended.
pattern = re.compile(r"(?i)\WMasters?'?s?\W")
pattern2 = re.compile(r'(?i)\WM.?S\W')
count = sum(1 for t in text if pattern.search(t) or pattern2.search(t))
degree.update({"Masters": count})

In [None]:
# Count the descriptions that mention a Bachelor's degree (each description counts once).
# Raw strings avoid Python's invalid-escape warning for '\W'.
pattern = re.compile(r"(?i)\WBachelors?'?s?\W")
pattern2 = re.compile(r'(?i)\WBachelor.?S\W')
count = sum(1 for t in text if pattern.search(t) or pattern2.search(t))
degree.update({"Bachelors Degree": count})

In [None]:
# Turn the {degree: count} dict into a frame indexed by degree name and add
# each count's share of all descriptions.
degree = pd.DataFrame.from_dict(degree, orient='index', columns=['count']).assign(
    pct=lambda counts: counts['count'] / len(text)
)

In [39]:
# Scratch check of the punctuation-stripping expression used in clean_text:
# every character in string.punctuation is deleted from the sample.
s = "string. With. Punctuation?" # Sample string 
out = re.sub('[%s]' % re.escape(string.punctuation), '', s)
out

'string With Punctuation'

In [None]:
# Build a second, coarser title grouping on a copy of ds_df, for the plots below.
data = ds_df.copy()
# Upper-case first so the keyword checks are case-insensitive.
data['Job_Title'] = data['Job_Title'].str.upper()

# Ordered (keyword, category) rules, applied one after another to the current
# column. A mapped row holds a mixed-case category name and therefore cannot
# match any later upper-case keyword.
title_rules = [
    ('SCIENTIST', 'Data Scientist'),
    ('ENGINEER', 'Machine Learning Engineer'),
    ('PRINCIPAL STATISTICAL PROGRAMMER', 'Machine Learning Engineer'),
    ('PROGRAMMER', 'Machine Learning Engineer'),
    ('DEVELOPER', 'Machine Learning Engineer'),
    ('ANALYST', 'Data Analyst'),
    ('STATISTICIAN', 'Data Analyst'),
    ('MANAGER', 'Data Science Manager'),
    ('CONSULTANT', 'Data Science Manager'),
    ('DATA SCIENCE', 'Data Science Manager'),
    ('DIRECTOR', 'Data Science Manager'),
]
for keyword, category in title_rules:
    matches_keyword = data['Job_Title'].str.contains(keyword, regex=False)
    data.loc[matches_keyword, 'Job_Title'] = category

# Anything that matched no rule becomes 'Others'. This replaces the previous
# assignment of a whole filtered DataFrame to a single column, which current
# pandas rejects.
known_titles = {category for _, category in title_rules}
data['Job_Title'] = data['Job_Title'].where(data['Job_Title'].isin(known_titles), 'Others')
data.head()

In [None]:
# Horizontal bar chart of posting counts per job title, smallest first.
title = data.groupby(['Job_Title']).count().sort_values('Company_Name')

ax = title['Company_Name'].plot(kind='barh', figsize=(10, 5))
ax.set_xlabel('Count', size=12)
ax.set_ylabel('')
ax.tick_params(axis='both', labelsize=10)
ax.set_title('Number of Positions by Job Title', size=20)
plt.show()

In [None]:
# Horizontal bar chart of the 20 companies with the most postings.
company = ds_df.groupby(['Company_Name']).count().sort_values('Job_Title').tail(20)

ax = company['Job_Title'].plot(kind='barh', figsize=(10, 5))
ax.set_xlabel('Count', size=12)
ax.set_ylabel('')
ax.tick_params(axis='both', labelsize=10)
ax.set_title('Number of Positions by Companies (Top 20)', size=20)
plt.show()

In [None]:
# Horizontal bar chart of the 20 locations with the most postings.
city = ds_df.groupby(['Location']).count().sort_values('Job_Title').tail(20)

ax = city['Job_Title'].plot(kind='barh', figsize=(10, 5))
ax.set_xlabel('Count', size=12)
ax.set_ylabel('')
ax.tick_params(axis='both', labelsize=10)
ax.set_title('Number of Positions by Cities (Top 20)', size=20)
plt.show()

In [None]:
# Vertical bar chart of posting counts per state, largest first.
state = ds_df.groupby('State').count().sort_values('Job_Title', ascending=False)

# rot=0 keeps the state labels horizontal (a 720-degree rotation is two full turns).
ax = state['Job_Title'].plot(kind='bar', figsize=(10, 5), width=0.85, rot=0)
ax.set_xlabel('')
ax.set_ylabel('Count', size=12)
ax.set_title('Number of Positions by State', size=20)
ax.tick_params(axis='both', labelsize=10)
plt.show()

In [None]:
# One bar chart per job title (2x2 grid) showing posting counts per state.
# 'Others' is excluded; the grid and the colour list both have four slots.
data = data[data['Job_Title'] != 'Others']
color = ['#A92420', '#8A6FDF', '#135390', '#FDA649']
fig = plt.figure(figsize=(20, 10))
for i, title in enumerate(data.Job_Title.unique(), start=1):
    x = (
        data[data['Job_Title'] == str(title)]
        .groupby(['State'])
        .count()
        .sort_values('Company_Name')
    )
    ax = fig.add_subplot(2, 2, i)
    ax.bar(x.index, x['Company_Name'], color=color[i - 1])
    ax.set_xlabel('')
    ax.tick_params(axis='x', labelsize=10)
    ax.set_title(str(title), size=15)
plt.show()

#### Years of experience plot

In [None]:
# Histogram of required years of experience in 2-year bins from 0 to 20,
# followed by the mean of the collected values.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(experience_req, bins=list(range(0, 21, 2)), align='left')
ax.set_title('Experience Required Distribution', size=15)
ax.set_ylabel('Bin Count')
ax.set_xlabel('Year of Expereience', size=12)
plt.show()
mean_years = round(np.mean(experience_req), 2)
print(f'The average year of experience required is {mean_years} years')

#### Skills plot

In [None]:
# Horizontal bar chart of the share of descriptions mentioning each skill,
# smallest first, with the x ticks rendered as percentages.
skills_ranked = skill_df.sort_values(by='pct')
ax = skills_ranked['pct'].plot(kind="barh", figsize=(10, 5))
ax.set_title('Skills as Percentage of Total Job Description', size=15)
ax.set_yticklabels(skills_ranked['skill'], size=12)
pct_labels = [f'{tick:.0%}' for tick in ax.get_xticks()]
ax.set_xticklabels(pct_labels)
plt.show()

#### Education qualification plot

In [None]:
# Bar chart of the share of descriptions mentioning each degree level,
# with the y ticks rendered as percentages.
ax = degree['pct'].plot(kind="bar", figsize=(10, 5))
ax.set_title('Percentage of Total Documents')
ax.set_xticklabels(degree.index)
pct_labels = [f'{tick:.0%}' for tick in ax.get_yticks()]
ax.set_yticklabels(pct_labels)
plt.show()