In [1]:
# textsumExample to get a feel for text summarization
# importing libraries for text summarization

import numpy as np
import pandas as pd
import nltk
nltk.download('punkt') # one time execution, check to see if you've installed module
import re

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acmoua87\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# create a dataframe from the article CSV so it can be manipulated

df = pd.read_csv("job_app.csv")

In [3]:
# check the dataframe contents to confirm it was created
# .head() shows the first 5 rows, .tail() shows the last 5 rows

df.head()

Unnamed: 0,article_id,article_text,source
0,1,Revature is the fastest growing employer of em...,https://www.glassdoor.com/job-listing/software...


In [4]:
# The displayed frame has the columns 'article_id', 'article_text' and 'source',
# plus an 'Unnamed: 0' column.
# We are most interested in the 'article_text' column as it contains the text of the articles.

# Look at the first value of that column to see what the text looks like.
df['article_text'][0]

"Revature is the fastest growing employer of emerging technology talent in the US and we are currently looking to hire over 100 new Software Engineers. Our Software Engineers design, analyze and build next-gen software systems, including business applications, games, computer applications, middleware, and network control systems across a variety of industries, including finance, insurance, retail, healthcare and government. Revature has been featured in the Wall Street Journal, Money, Time, on MSN, and was recently named as one of the 8 Cool Companies to Apply to With Awesome Benefits by Glassdoor. Join us and be part of the next generation of Software Engineers. Interviews are starting now! What We Are Looking For: 0-3 years experience Solid foundational knowledge of SQL A natural problem solver Strong communication and interpersonal skills Ability to relocate Eligible to work in the US Revature is not currently sponsoring work visas or transfers at this time. What We Offer: Competiti

In [5]:
#df['article_text'][1]

In [6]:
#df['article_text'][2]

In [7]:
# Split every article into individual sentences with nltk's sent_tokenize()
# and collect all of them in one flat list.

from nltk.tokenize import sent_tokenize

sentences = [
    sentence
    for article in df['article_text']
    for sentence in sent_tokenize(article)
]


In [8]:
# show the first five elements of the `sentences` list.

sentences[:5]

['Revature is the fastest growing employer of emerging technology talent in the US and we are currently looking to hire over 100 new Software Engineers.',
 'Our Software Engineers design, analyze and build next-gen software systems, including business applications, games, computer applications, middleware, and network control systems across a variety of industries, including finance, insurance, retail, healthcare and government.',
 'Revature has been featured in the Wall Street Journal, Money, Time, on MSN, and was recently named as one of the 8 Cool Companies to Apply to With Awesome Benefits by Glassdoor.',
 'Join us and be part of the next generation of Software Engineers.',
 'Interviews are starting now!']

In [9]:
# Extract word vectors
# using pre-trained Wikipedia 2014 + Gigaword 5 GloVe vectors
# you can download them from https://nlp.stanford.edu/data/glove.6B.zip

word_embeddings = {}
# The context manager closes the file even if parsing one of the lines raises.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            continue  # skip blank lines instead of failing on values[0]
        # first token is the word, the remaining tokens are its vector components
        word_embeddings[values[0]] = np.asarray(values[1:], dtype='float32')

In [10]:
# number of words for which an embedding vector was loaded
len(word_embeddings)

400000

In [11]:
# Replace everything that is not an ASCII letter (punctuation, digits, special
# characters) with a space. regex=True is required: since pandas 2.0
# Series.str.replace treats the pattern as a literal string by default, which
# would leave the sentences unchanged.
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)

# lowercase every sentence
clean_sentences = [s.lower() for s in clean_sentences]

In [12]:
# Download the NLTK stopwords corpus (needed before stopwords can be removed)
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acmoua87\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
# import the stopwords corpus reader and load the English stopword list
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [14]:
# function to remove these stopwords from our dataset
def remove_stopwords(sen):
    """Return the tokens of `sen` that are not stopwords, joined by single spaces.

    `sen` is an iterable of tokens; membership is checked against the
    module-level `stop_words` collection.
    """
    kept = []
    for token in sen:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [15]:
# strip the stopwords out of every cleaned sentence
clean_sentences = [
    remove_stopwords(sentence.split())
    for sentence in clean_sentences
]

In [16]:
# Create one vector per sentence: the (smoothed) mean of its word embeddings.
EMBEDDING_DIM = 100  # dimensionality of the glove.6B.100d vectors
# Fallback for words without an embedding; allocated once instead of once per lookup.
zero_vector = np.zeros((EMBEDDING_DIM,))

sentence_vectors = []
for sentence in clean_sentences:
    words = sentence.split()
    if words:
        # the +0.001 in the denominator is kept from the original averaging formula
        v = sum(word_embeddings.get(w, zero_vector) for w in words) / (len(words) + 0.001)
    else:
        # Empty or whitespace-only sentence: use a zero vector so every entry
        # has the same shape (a bare sum over no words would be the scalar 0).
        v = zero_vector.copy()
    sentence_vectors.append(v)

In [17]:
# square similarity matrix with one row and one column per sentence, zero-filled
n_sentences = len(sentences)
sim_mat = np.zeros((n_sentences, n_sentences))

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill the matrix with pairwise cosine similarity scores.
# One vectorized call over the stacked sentence vectors replaces a separate
# cosine_similarity call per (i, j) pair and no longer hard-codes the vector
# length. The diagonal is reset to 0 so a sentence is not linked to itself.
if len(sentence_vectors) > 0:
    sim_mat = cosine_similarity(np.vstack(sentence_vectors))
    np.fill_diagonal(sim_mat, 0.0)

In [19]:
# Apply the PageRank algorithm to the sentence-similarity matrix
import networkx as nx

# build a graph from the similarity matrix (one node per row/column of sim_mat)
nx_graph = nx.from_numpy_array(sim_mat)
# PageRank score per node; the summary step looks these up by sentence position
scores = nx.pagerank(nx_graph)


In [20]:
# Summary extraction: pair each sentence with its PageRank score and order the
# (score, sentence) pairs from highest to lowest.
ranked_sentences = [(scores[idx], text) for idx, text in enumerate(sentences)]
ranked_sentences.sort(reverse=True)

In [21]:
import math
# Print the top third (rounded up) of the ranked sentences as the summary

summary_size = math.ceil(len(ranked_sentences)/3)
for _score, sentence in ranked_sentences[:summary_size]:
    print(sentence)

What We Offer: Competitive Salary Relocation Assistance Corporate Housing Health, Vision and Dental Insurance Paid Time Off Enterprise level development training Life Insurance 401K Mentoring and on-going support throughout your entire Revature career Experience with one of the world's largest and most reputable companies in the US Suitable candidates are encouraged to apply immediately Not Mentioned
Revature is the fastest growing employer of emerging technology talent in the US and we are currently looking to hire over 100 new Software Engineers.
What We Are Looking For: 0-3 years experience Solid foundational knowledge of SQL A natural problem solver Strong communication and interpersonal skills Ability to relocate Eligible to work in the US Revature is not currently sponsoring work visas or transfers at this time.
