In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):  # `_` discards the sub-directory list
    for filename in filenames:
        # Join directory and file name so the full path of every input file is printed
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

# NLP with Job Posting Data <br><br>

In this study we are going to develop a machine learning model to predict the skills that are important for each job. We will use a data set of job postings from indeed.ca.

This data set has the job postings information related to 4 job titles: Data Scientist, Carpenter, Registered Nurse, and Customer Service. Using this data set, we are going to predict the top 10 most important skills for each of these titles.

https://monkeylearn.com/keyword-extraction/

What is Keyword Extraction?

Keyword extraction (also known as keyword detection or keyword analysis) is a text analysis technique that consists of automatically extracting the most important words and expressions in a text. It helps summarize the content of a text and recognize the main topics which are being discussed. 

## Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [None]:

# Load the job postings. index_col=False tells pandas not to use the first column
# as the index, so a default integer index is created instead.
df = pd.read_csv("../input/jobs-ds-carp/Jobs.csv", index_col=False)
df.head()

# Data Cleaning

According to IBM Data Analytics you can expect to spend up to 80% of your time cleaning data. 
https://towardsdatascience.com/data-cleaning-with-python-and-pandas-detecting-missing-values-3e9c6ebcf78b

First we need to take care of the missing values.

In [None]:
# Show the column labels of the loaded frame.
df.keys()

In [None]:
# Keep only the six columns used by the analysis; every other column is dropped.
df = df[['Job Title', 'Company Name', 'Location', 'job URL', 'Job Description', 'Group']]
df.head(3)

In [None]:
# Check every column for missing values and report the outcome column by column.
for column in df.columns:
    # Test the column of the current iteration (not a fixed one); .any() collapses the
    # per-row null mask to a single bool, so the condition is never an ambiguous array.
    if df[column].isnull().any():
        print('column', column, 'has missing values')
    else:
        print('column', column, 'is valid and has no missing values')

# Getting the number of job postings by job title groups

In [None]:
# Count the postings in each job-title group; the result is a Series indexed by 'Group'.
# A Series has no columns to rename, so the Series itself is given a descriptive name.
Number_of_Jobs = df.groupby('Group')['Job Title'].count().rename('Number of Job Postings')
Number_of_Jobs

In [None]:
# Bar chart: number of job postings in each job-title group.
colors = ['green', '#006fb9','#006fb9','#006fb9']
# Display name of each group id (same order as the original tick labels).
group_names = {1: 'Data Scientist', 2: 'Carpenter', 3: 'Registered Nurse', 4: 'Customer Service Rep.'}

plt.figure(figsize=(10,8))
xvals = Number_of_Jobs.index
yvals = Number_of_Jobs.tolist()
print(xvals)
print(yvals)
plt.bar(xvals, yvals, color=colors)

plt.subplots_adjust(bottom=0.3, left=0.2)
# The chart is grouped by job title, so the title and x label say so (they used to say "City").
plt.title('Number of Job Postings by Job Title')
plt.xlabel('Job Title')
plt.ylabel('Number of Job Postings')
# One tick per plotted group, labelled with its job title and rotated so long names do not overlap.
plt.xticks(list(xvals), [group_names.get(g, str(g)) for g in xvals], rotation=90)

# Having the y axis formatted as thousand separated
ax = plt.gca()
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Write each count just above its bar; positions follow the real x value of the bar
# rather than the enumerate index, so they stay correct whatever the group ids are.
for x, v in zip(xvals, yvals):
    ax.text(x-0.1, v+10, str(v))

## First we will take only two job titles, Data Scientist and Carpenter

In [None]:
# Keep only the rows whose group id is below 3 (the heading above names these groups as
# Data Scientist and Carpenter). .copy() makes the result an independent frame, so
# assigning to one of its columns later does not trigger SettingWithCopyWarning.
df = df[df['Group']<3].copy()


In [None]:
# Recode the target to 0/1: rows with Group == 1 become 0, every other row becomes 1.
df['Group'] = np.where(df['Group']==1,0,1)

In [None]:
# With a negative n, head() returns every row except the last |n| rows: all but the final row here.
df.head(-1)

In [None]:
# Recount the postings per group now that 'Group' has been recoded to 0/1.
# (A Series has no columns to rename, so the former `.columns` assignment is dropped.)
Number_of_Jobs = df.groupby('Group')['Job Title'].count()

# Display name of each recoded group id, per the recoding cell above: 0 = Data Scientist, 1 = Carpenter.
group_names = {0: 'Data Scientist', 1: 'Carpenter'}
colors = ['green', '#006fb9']


plt.figure(figsize=(7,9))
xvals = Number_of_Jobs.index
yvals = Number_of_Jobs.tolist()
print(xvals)
print(yvals)
plt.bar(xvals, yvals, color=colors)

plt.subplots_adjust(bottom=0.3, left=0.2)
# The chart is grouped by job title, so the title and x label say so (they used to say "City").
plt.title('Number of Job Postings by Job Title')
plt.xlabel('Job Title')
plt.ylabel('Number of Job Postings')
# Exactly one label per plotted group: supplying more labels than ticks makes matplotlib raise.
plt.xticks(list(xvals), [group_names.get(g, str(g)) for g in xvals], rotation=90)

# Having the y axis formatted as thousand separated
ax = plt.gca()
ax.get_yaxis().set_major_formatter(plt.FuncFormatter(lambda x, loc: "{:,}".format(int(x))))

ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)

# Write each count just above its bar, anchored to the bar's own x position.
for x, v in zip(xvals, yvals):
    ax.text(x-0.05, v+10, str(v))

## As you see, the data set is unbalanced

# Machine Learning with Sklearn

In [None]:
# Preview the first three rows.
df.head(3)

In [None]:
# Distinct values present in the 'Group' column, which is used as the target below.
df.Group.unique()

In [None]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets: the job descriptions are the inputs, 'Group' is the target.
# random_state=0 fixes the shuffling so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(df['Job Description'],
                                                   df['Group'],
                                                   random_state=0)

In [None]:
# Show one raw training document, then the shape of the training inputs.
print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

In [None]:
# Peek at the first few training documents.
X_train.head()

## CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data only, so the vocabulary is learned from the training documents
vect = CountVectorizer().fit(X_train)

In [None]:
# Sample the learned vocabulary by taking every 1000th feature name.
# NOTE(review): get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out() — confirm which scikit-learn version this notebook runs on.
vect.get_feature_names()[::1000]

# Finding the number of features after the CountVectorizer

In [None]:
# Report how many distinct features (vocabulary entries) the fitted vectorizer holds.
print('We now have',len(vect.get_feature_names()),'features after fitting the CountVectorizer on training set')

In [None]:
# Transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

# The bare expression displays the matrix summary as the cell output.
X_train_vectorized

# Training the model

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier with default settings on the document-term matrix
model = LogisticRegression().fit(X_train_vectorized, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Vectorize the test documents once with the vocabulary learned on the training set.
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class labels for the test documents.
prediction = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks the documents, so it must be given the
# continuous decision values; thresholded 0/1 labels reduce the ROC curve to one point.
print('AUC:', roc_auc_score(y_test, model.decision_function(X_test_vectorized)))

In [None]:
# Get feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Indices that sort the model coefficients in ascending order
sorted_coef_index = model.coef_[0].argsort()
# Show the 20 smallest and the 20 largest coefficients.
# The 20 largest are read from the end of the ascending order with [:-21:-1].
# With the 0/1 recoding of 'Group' above, the most negative coefficients point to class 0
# and the most positive ones to class 1.

print('Coefficients related to job title 1 (Data Scientist):\n {}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Coefficients related to job title 2 (Carpenter):\n {}\n'.format(feature_names[sorted_coef_index[:-21:-1]]))

# Improving the results:

As you can see, the model was able to find the keywords related to each job title. But there are still some words that are not related, such as "with", "unknown", and "00".<br>
For improving the results, we are going to use Tfidf

Also you can see that the model was able to find the keywords "machine", and "learning" related to Data Scientist job title separately, but the model was not able to detect that Machine Learning is considered as one term. We will use n-grams to improve the model regarding this issue.

# Tfidf

1. Tfidf is a statistical approach<br><br>

**Tf–idf**, or term frequency–inverse document frequency, allows us to weight terms based on how important they are to a document.
High weight is given to terms that appear often in a particular document, but don't appear often in the corpus.
Features with low tf–idf are either commonly used across all documents or rarely used and only occur in long documents.
Features with high tf–idf are frequently used within specific documents, but rarely used across all documents. 

## There is a possibility that Tfidf is not a good approach, because this metric calculates the number of times a word appears in a text (term frequency) and compares it with the inverse document frequency (how rare or common that word is in the entire data set).

Also the words that appear more frequently in a group of documents are not necessarily the most relevant. Likewise, a word that appears in a single text but doesn’t appear in the remaining documents may be very important to understand the content of that text. 

# Now we are going to use RAKE (Rapid Automatic Keyword Extraction)



# Also if we find a way to consider names only, that might be helpful as well. (Linguistic Approach)
Most systems that use some kind of linguistic information outperform those that don’t do so. We strongly recommend that you try some of them when extracting keywords from your texts.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer (term frequency–inverse document frequency) to the training data,
# keeping only the terms that appear in at least 50 documents (min_df=50)
vect = TfidfVectorizer(min_df=50).fit(X_train)
# Number of features that survive the document-frequency cut-off
len(vect.get_feature_names())

Here you can see the number of features that remain when the Tfidf Vectorizer is fitted with a minimum document frequency of 50.

In [None]:
# Re-vectorize the training and test documents with the tf-idf vocabulary.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Hard 0/1 class labels for the test documents.
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks the documents, so it must be given the
# continuous decision values; thresholded 0/1 labels reduce the ROC curve to one point.
print('AUC: ', roc_auc_score(y_test, model.decision_function(X_test_vectorized)))

You can see that we could get the same AUC with about 1/4 of the features. 

In [None]:
feature_names = np.array(vect.get_feature_names())

# For each feature take its maximum tf-idf value over all training documents,
# then get the indices that sort those maxima in ascending order.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()


# The first 10 indices are the smallest maxima; [:-11:-1] walks back from the end for the 10 largest.
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
# Indices that sort the model coefficients in ascending order
sorted_coef_index = model.coef_[0].argsort()

# List the 40 features at each end of the coefficient ranking.
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:40]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-41:-1]]))

In [None]:
# Same ranking as the previous cell, shortened to the 20 features at each end.
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-21:-1]]))

# n-grams <br><br>

https://monkeylearn.com/keyword-extraction/
Word Collocations and Co-occurrences

Also known as N-gram statistics, word collocations and co-occurrences can help you understand the semantic structure of a text and count separate words as one.

Collocations are words that frequently go together. The most common types of collocations are bi-grams (two terms that appear adjacently, like ‘customer service’, ‘video calls’ or ‘email notification’) and tri-grams (a group of three words, like ‘easy to use’ or ‘social media channels’). 

In [None]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets (same inputs, target and random_state as before)
X_train, X_test, y_train, y_test = train_test_split(df['Job Description'],
                                                   df['Group'],
                                                   random_state=0)


# Fit the CountVectorizer to the training data specifying a minimum
# document frequency of 100 and extracting 1-grams, 2-grams and 3-grams
vect = CountVectorizer(min_df=100, ngram_range=(1,3)).fit(X_train)

X_train_vectorized = vect.transform(X_train)

# Number of n-gram features kept
len(vect.get_feature_names())

In [None]:
# Train a fresh classifier on the n-gram document-term matrix.
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Vectorize the test documents once with the fitted n-gram vocabulary.
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class labels for the test documents.
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks the documents, so it must be given the
# continuous decision values; thresholded 0/1 labels reduce the ROC curve to one point.
print('AUC: ', roc_auc_score(y_test, model.decision_function(X_test_vectorized)))

In [None]:
feature_names = np.array(vect.get_feature_names())

# Indices that sort the model coefficients in ascending order
sorted_coef_index = model.coef_[0].argsort()

# List the 40 n-gram features at each end of the coefficient ranking.
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:40]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-41:-1]]))

## It seems that the feature names are not sorted properly. Also we need to take care of stop words.

# Fixing the issue

## Stopwords Removal

In [None]:
# To see the list of English stop words shipped with NLTK
# NOTE(review): this needs the NLTK 'stopwords' corpus to be available — confirm it is
# installed in this environment.

import nltk
from nltk.corpus import stopwords
set(stopwords.words('english'))

# Lowercasing the text

In [None]:
# Lowercase the descriptions and turn commas and periods into spaces.
# regex=False makes every pattern a literal string: when the pattern is treated as a regular
# expression (the default before pandas 2.0), '.' matches ANY character and blanks the whole text.
df['Job Description'] = (
    df['Job Description']
    .str.lower()
    .str.replace(',', ' ', regex=False)
    .str.replace('.', ' ', regex=False)
    .str.replace('  ', ' ', regex=False)  # single pass: each pair of spaces becomes one space
)

In [None]:
# Spot-check the description at position 500.
df['Job Description'].iloc[500]

# Stopword Removal using NLTK

In [None]:
# The following code is to remove stop words from sentence using nltk
# Created by - ANALYTICS VIDHYA

# importing libraries
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
# NOTE(review): this expression is not the last statement of the cell, so the set is built and then discarded.
set(stopwords.words('english'))

def stop_word_remover(text, stop_words=None):
    """Return ``text`` with its stop words removed.

    The text is split into tokens with NLTK's ``word_tokenize``; tokens found in
    the stop-word collection are dropped and the rest are joined with single spaces.

    Parameters
    ----------
    text : str
        Text to filter. Matching is case-sensitive, so the text should already
        have the same casing as the stop words.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stop-word list is used; it
        is loaded on the first call and reused afterwards.

    Returns
    -------
    str
        The remaining tokens joined by a single space.
    """
    if stop_words is None:
        # Cache the default list on the function object: when the function is applied to a
        # whole column, re-reading the stop-word corpus for every row is wasted work.
        stop_words = getattr(stop_word_remover, '_english_stop_words', None)
        if stop_words is None:
            stop_words = frozenset(stopwords.words('english'))
            stop_word_remover._english_stop_words = stop_words

    return " ".join(w for w in word_tokenize(text) if w not in stop_words)

In [None]:
# Run the stop-word filter on every job description and store the result back in the column.
df['Job Description'] = df['Job Description'].apply(stop_word_remover)

In [None]:
# Spot-check the description at position 500 again, now with the stop-word filter applied.
df['Job Description'].iloc[500]

# Another approach: adding our set of words to the stop words<br><br>
['i','me','my','myself','we','our','ours','ourselves','you','your','yours','yourself','yourselves','he','him','his','himself','she','her','hers','herself','it','its','itself','they','them','their','theirs','themselves','what','which','who','whom','this','that','these','those','am','is','are','was','were','be','been','being','have','has','had','having','do','does','did','doing','a','an','the','and','but','if','or','because','as','until','while','of','at','by','for','with','about','against','between','into','through','during','before','after','above','below','to','from','up','down','in','out','on','off','over','under','again','further','then','once','here','there','when','where','why','how','all','any','both','each','few','more','most','other','some','such','no','nor','not','only','own','same','so','than','too','very','s','t','can','will','just','don','should','now']


# We should also apply stemming, so that words like ‘search’, ‘searched’ and ‘searching’ are all reduced to ‘search’. This process of reducing a word to its root is called stemming.



In [None]:
#df.to_csv('test02.csv')
# With the 0/1 recoding of 'Group' above, the mean is the share of rows labelled 1.
df['Group'].mean()

In [None]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets again, now from the cleaned descriptions.
# random_state=0 fixes the shuffling so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(df['Job Description'],
                                                   df['Group'],
                                                   random_state=0)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the CountVectorizer to the training data only, so the vocabulary is learned from the training documents
vect = CountVectorizer().fit(X_train)

In [None]:
# Sample the learned vocabulary by taking every 1000th feature name.
vect.get_feature_names()[::1000]

In [None]:
# Report how many distinct features (vocabulary entries) the fitted vectorizer holds.
print('We now have',len(vect.get_feature_names()),'features after fitting the CountVectorizer on training set')

# We are facing a wide data set (p>n issue) so we may need to add more samples to the data set

In [None]:
# Transform the documents in the training data to a document-term matrix
X_train_vectorized = vect.transform(X_train)

# The bare expression displays the matrix summary as the cell output.
X_train_vectorized

In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic regression classifier with default settings on the document-term matrix
model = LogisticRegression().fit(X_train_vectorized, y_train)

In [None]:
from sklearn.metrics import roc_auc_score

# Vectorize the test documents once with the vocabulary learned on the training set.
X_test_vectorized = vect.transform(X_test)

# Hard 0/1 class labels for the test documents.
prediction = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks the documents, so it must be given the
# continuous decision values; thresholded 0/1 labels reduce the ROC curve to one point.
print('AUC:', roc_auc_score(y_test, model.decision_function(X_test_vectorized)))

In [None]:
# Get feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Indices that sort the model coefficients in ascending order
sorted_coef_index = model.coef_[0].argsort()
# Show the 20 smallest and the 20 largest coefficients.
# The 20 largest are read from the end of the ascending order with [:-21:-1].
# With the 0/1 recoding of 'Group' above, the most negative coefficients point to class 0
# and the most positive ones to class 1.

print('Coefficients related to job title 1 (Data Scientist):\n {}\n'.format(feature_names[sorted_coef_index[:20]]))
print('Coefficients related to job title 2 (Carpenter):\n {}\n'.format(feature_names[sorted_coef_index[:-21:-1]]))

# Tfidf

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TfidfVectorizer (term frequency–inverse document frequency) to the training data,
# keeping 1- to 3-grams that appear in at least 100 documents (min_df=100, ngram_range=(1,3))
vect = TfidfVectorizer(min_df=100, ngram_range=(1,3)).fit(X_train)
# Number of features that survive the document-frequency cut-off
len(vect.get_feature_names())

In [None]:
# Re-vectorize the training and test documents with the tf-idf n-gram vocabulary.
X_train_vectorized = vect.transform(X_train)
X_test_vectorized = vect.transform(X_test)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

# Hard 0/1 class labels for the test documents.
predictions = model.predict(X_test_vectorized)

# ROC AUC measures how well the model ranks the documents, so it must be given the
# continuous decision values; thresholded 0/1 labels reduce the ROC curve to one point.
print('AUC: ', roc_auc_score(y_test, model.decision_function(X_test_vectorized)))

In [None]:
feature_names = np.array(vect.get_feature_names())

# For each feature take its maximum tf-idf value over all training documents,
# then get the indices that sort those maxima in ascending order.
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()


# The first 10 indices are the smallest maxima; [:-11:-1] walks back from the end for the 10 largest.
print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

In [None]:
# Indices that sort the model coefficients in ascending order
sorted_coef_index = model.coef_[0].argsort()

# List the 40 features at each end of the coefficient ranking.
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:40]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-41:-1]]))

# Testing

In [None]:
# Pull out the description at position 100 and display it.
text = df['Job Description'].iloc[100]
text

In [None]:
# Classify a hand-written sentence: vectorize it with the fitted vectorizer, then predict its group.
model.predict(vect.transform(['we need someone to work with analysis']))

In [None]:
# With a negative n, head() returns every row except the last |n| rows: all but the final row here.
df.head(-1)

In [None]:
# Sample phrase containing several inflected forms, used to try out the stemmers.
input1 = "Statistics statistical lists listing listings"
# Lowercase first, then break on single spaces to get the individual tokens.
words1 = str.split(str.lower(input1), ' ')
words1

In [None]:
from nltk.stem.snowball import SnowballStemmer
# Build an English Snowball stemmer and print the stems it produces for two sample words.
stemmer = SnowballStemmer("english")
print(stemmer.stem("sql"))

print(stemmer.stem("statistical"))


In [None]:
import nltk
# Stem every token of the sample phrase with the Porter stemmer.
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

In [0]:
import pandas as pd
# Load the job postings file into a separate frame named Jobs.
Jobs = pd.read_csv("../input/jobs-ds-carp/Jobs.csv")

In [0]:
import pandas as pd
# NOTE(review): this cell repeats the previous cell verbatim and reloads the same file.
Jobs = pd.read_csv("../input/jobs-ds-carp/Jobs.csv")