In [38]:
import pandas as pd
import sqlite3
import re

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import SGDClassifier, LogisticRegression

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

from sklearn import metrics

# Project 4: NLP, NLP, NLP

We will be doing three classifications and attempting to kill two ~~birds~~ questions with one ~~stone~~ classification model: question (2) and the BONUS question.

## Bringing in my tables from SQL

In [2]:
# Location of the SQLite database file that contains the `jobs` table.
DB_PATH = 'job_scraped.db.sqlite'
connection = sqlite3.connect(DB_PATH)

In [4]:
# Query that pulls every column of every row from the `jobs` table.
sql_query = """
SELECT *
FROM jobs
"""

In [5]:
# Run the query over the open SQLite connection and load the result set into a DataFrame.
df = pd.read_sql(sql=sql_query, con=connection)

In [6]:
# Preview the first five rows of the freshly loaded table.
df.head(n=5)

Unnamed: 0,description,id,title
0,Dubbed “The Most Innovative Streaming Video Se...,2abc0718a6987824,Data Analyst
1,Title: Data Analyst\n\nRole:\n\nReporting to t...,c95dd846041ca1c5,Data Analyst
2,What is the job ?\n\n\nThe role will involve m...,59df81c8a00dc826,Junior Data Analyst
3,Data Analyst - (180000E0)\nDescription\n\nUnde...,6d9971d09cb788a6,Data Analyst
4,Your Responsibilities :\n\nThe Role\n\n\nIn th...,b4c4860763098784,Data Insights Analyst


In [7]:
# Count the rows whose `id` repeats the `id` of an earlier row
# (the first occurrence is not counted); 0 means every id is unique.
df.duplicated(subset=['id'], keep='first').sum()

0

Hurray! No duplicates!

## 1) Data Scientists versus others
What components of a job posting distinguish data scientists from other data jobs?

### Data Scientist vs Data Analyst vs Data Engineer

In [23]:
# Keep only the postings whose title is exactly "Data Scientist" and tag them
# with the class label 'DS'. This is an exact match on the whole title, so a
# title that merely contains the phrase is not selected.
only_ds = df.loc[df['title'].eq('Data Scientist')].assign(label='DS')

In [24]:
# Postings whose title is exactly "Data Analyst", tagged with the class label 'DA'.
# Exact match: titles such as "Junior Data Analyst" are not selected.
only_da = df.loc[df['title'].eq('Data Analyst')].assign(label='DA')

In [25]:
# Postings whose title is exactly "Data Engineer", tagged with the class label 'DE'.
only_de = df.loc[df['title'].eq('Data Engineer')].assign(label='DE')

In [34]:
# Stack the three labelled subsets into one frame with a fresh 0..n-1 index,
# discarding `id` and `title` because we don't need those anymore.
ds_vs_others = (
    pd.concat([only_ds, only_da, only_de], ignore_index=True)
      .drop(columns=['id', 'title'])
)

In [35]:
# Preview the first five rows of the combined, labelled frame.
ds_vs_others.head(n=5)

Unnamed: 0,description,label
0,PURPOSE OF THE ROLE\nThis role of a data scien...,DS
1,Work cross-functionally with business managers...,DS
2,Serve as primary source of data insights suppo...,DS
3,"Big data, artificial intelligence and advanced...",DS
4,As the innovation leader in the logistics indu...,DS


In [36]:
# Class balance: number of postings per label, most frequent first.
ds_vs_others.loc[:, 'label'].value_counts()

DS    26
DE    23
DA    18
Name: label, dtype: int64

WOOT! I have a balanced dataset :) albeit small.... but let's see if we can infer anything from it using NLP.

### Simple cleaning: no stemming, lemmatization, or stop-word removal, because the dataset is so small (rows = 60+).

In [45]:
# Replace every character that is not an ASCII letter (punctuation, digits,
# newlines, ...) with a single space, then lowercase the whole description.
non_letter = re.compile("[^a-zA-Z]")
ds_vs_others['clean'] = ds_vs_others['description'].map(
    lambda text: non_letter.sub(" ", text).lower()
)

In [46]:
# Preview the first five rows, now including the `clean` text column.
ds_vs_others.head(n=5)

Unnamed: 0,description,label,clean
0,PURPOSE OF THE ROLE\nThis role of a data scien...,DS,purpose of the role this role of a data scient...
1,Work cross-functionally with business managers...,DS,work cross functionally with business managers...
2,Serve as primary source of data insights suppo...,DS,serve as primary source of data insights suppo...
3,"Big data, artificial intelligence and advanced...",DS,big data artificial intelligence and advanced...
4,As the innovation leader in the logistics indu...,DS,as the innovation leader in the logistics indu...


In [47]:
# Features are the cleaned description text; the target is the label column.
X, y = ds_vs_others['clean'], ds_vs_others['label']

In [48]:
# Hold out 33% of the postings for testing. The split is stratified on the
# label: with only a few dozen rows spread over three classes, an unstratified
# split can leave a class under-represented in train or test. The fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42, stratify=y
)

In [50]:
# Bag-of-words token counts fed into a linear classifier trained with
# stochastic gradient descent. SGD shuffles the training data between epochs,
# so the seed is pinned to make the fitted model reproducible across runs.
pipe = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('clf', SGDClassifier(random_state=42)),
])

In [51]:
# Fit the vectorizer and the classifier on the training split. `fit` returns
# the pipeline itself, so `model` and `pipe` name the same fitted object.
pipe.fit(X_train, y_train)
model = pipe



In [None]:
# Pull the fitted 'vectorizer' step back out of the trained pipeline.
vectorizer = model.named_steps["vectorizer"]

In [None]:
# Draft of an alternative pipeline that inserts a TF-IDF step between the
# count vectorizer and the classifier (kept disabled).
# NOTE: TfidfTransformer is not imported in the imports cell (only
# CountVectorizer and TfidfVectorizer are); import it from
# sklearn.feature_extraction.text before enabling this.
# pipe = Pipeline([
#     ('vectorizer', CountVectorizer()),
#     ('tfidfTrans', TfidfTransformer()),
#     ('clf', SGDClassifier())
# ])

## 2) What features are important for distinguishing junior vs. senior positions?

In [None]:
# Draft of a stemming step (kept disabled).
# NOTE(review): `review` is not defined anywhere in the visible notebook;
# replace `review.words` with the tokenized description text — TODO confirm.
# # initialize stemmer
# stemmer = SnowballStemmer('english')

# # stem each word
# print([stemmer.stem(word) for word in review.words])