In [33]:
!pip install Keras-Preprocessing
!pip install rouge-score

[0mCollecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24954 sha256=c1f126387f1a5ee171029174cfa0bf8c3654bfc61a2e8b2d48eed7302064bf8d
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
[0m

Imports

In [2]:
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import pandas as pd
import numpy as np
import string, os 
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)



In [3]:
from bs4 import BeautifulSoup

Load Dataset

In [4]:
import os
import pandas as pd
combined_df = pd.read_csv('/kaggle/input/jobs-dataset/jd/job_descriptions_Marketing Intern.csv', usecols = ['description'], nrows=100)

HTML cleaning

In [6]:
for index in range(len(combined_df['description'])):
    html_string = combined_df['description'][index]
    soup = BeautifulSoup(html_string, "html.parser")
    combined_df['description'][index] = soup.get_text()

In [8]:
all_descriptions = list(combined_df.description.values)
len(all_descriptions)

100

Filtered Data

In [9]:
corpus = [x for x in all_descriptions]
corpus[:1]

["Marketing Intern (Summer 2022)\nTake your next career step at ABB with a global team that is energizing the transformation of society and industry to achieve a more productive, sustainable future. At ABB, we have the clear goal of driving diversity and inclusion across all dimensions: gender, LGBTQ+, abilities, ethnicity and generations. Together, we are embarking on a journey where each and every one of us, individually and collectively, welcomes and celebrates individual differences.\nABB’s Electrification organization is responsible for the go-to-market strategy and generating profitable growth for the Electrification Business Area. Our 10,000 strong commercial team represents the portfolio of all Electrification Business Area Divisions in over 100 countries. Our unmatched domain expertise across key industry verticals and channels combined with our truly global footprint makes us able to deliver extraordinary business results, supporting our customers with solutions which address

In [10]:
# print(t.word_counts)
# print(t.word_docs)
# print(t.document_count)
# print(t.word_index)

Tokenization

In [12]:
t = Tokenizer(num_words=vocabulary_size, filters='\n', lower=True, split=' ', char_level=False, oov_token=None, document_count=0)

def get_sequence_of_tokens(corpus):
    t.fit_on_texts(corpus)
    print('Found %s unique tokens.' % len(t.word_index))
    total_words = len(t.word_index) + 1
    
    input_sequences = []
    for line in corpus:
        token_list = t.texts_to_sequences([line])[0]
        for i in range(1, len(token_list)):
            n_gram_sequence = token_list[:i+1]
            input_sequences.append(n_gram_sequence)
            
    return input_sequences, total_words
input_sequences, total_words = get_sequence_of_tokens(corpus)
input_sequences[:10]

Found 7610 unique tokens.


[[9, 52],
 [9, 52, 3366],
 [9, 52, 3366, 3367],
 [9, 52, 3366, 3367, 331],
 [9, 52, 3366, 3367, 331, 29],
 [9, 52, 3366, 3367, 331, 29, 592],
 [9, 52, 3366, 3367, 331, 29, 592, 123],
 [9, 52, 3366, 3367, 331, 29, 592, 123, 1703],
 [9, 52, 3366, 3367, 331, 29, 592, 123, 1703, 32],
 [9, 52, 3366, 3367, 331, 29, 592, 123, 1703, 32, 1704]]

Pad Sequences

In [13]:
def generate_padded_sequences(input_sequences):
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen = max_sequence_len, padding = 'pre'))
    predictors, label = input_sequences[:,:-1],input_sequences[:,-1]
    label = ku.to_categorical(label, num_classes = total_words)
    
    return predictors, label, max_sequence_len

predictors, label, max_sequence_len = generate_padded_sequences(input_sequences)

W2V Embeddings Load

In [14]:
from gensim.models.keyedvectors import KeyedVectors
embs_path = '/kaggle/input/wikinews/wiki-news-300d-1M-subword.vec'
embeddings = KeyedVectors.load_word2vec_format(embs_path, binary=False)
dim = embeddings.vectors.shape[1]
pad = np.zeros(dim)
np.random.seed(3)
oov = np.random.uniform(-0.25, 0.25, dim)

W2V weight matrix

In [15]:
embedding_matrix_w2v = np.zeros((vocabulary_size, 300))
for word, index in t.word_index.items():
    if index > vocabulary_size - 1:
        break
    else:
        try:
            embedding_vector = embeddings[word]
            if embedding_vector is not None:
                embedding_matrix_w2v[index] = embedding_vector
        except:
            embedding_matrix_w2v[index] = oov

Model definition

In [40]:
def create_model(max_sequence_len, total_words):
    model = Sequential()
    
    # Add Input Embedding Layer
    model.add(Embedding(vocabulary_size, 300, input_length=max_sequence_len-1, weights=[embedding_matrix_w2v]))
    
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(100))
    model.add(Dropout(0.2))
    
    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')
    
    return model

model = create_model(max_sequence_len, total_words)
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 986, 300)          3000000   
                                                                 
 lstm_2 (LSTM)               (None, 100)               160400    
                                                                 
 dropout_2 (Dropout)         (None, 100)               0         
                                                                 
 dense_2 (Dense)             (None, 7611)              768711    
                                                                 
Total params: 3,929,111
Trainable params: 3,929,111
Non-trainable params: 0
_________________________________________________________________


Load model

In [58]:
from tensorflow import keras
model = keras.models.load_model('/kaggle/working/w2v_100')

Training

In [None]:
history = model.fit(predictors, label, epochs=100, verbose=1)

Save Model

In [65]:
model.save('w2v_100')

In [66]:
!zip -r w2v_100.zip /kaggle/working/w2v_100

  adding: kaggle/working/w2v_100/ (stored 0%)
  adding: kaggle/working/w2v_100/variables/ (stored 0%)
  adding: kaggle/working/w2v_100/variables/variables.data-00000-of-00001 (deflated 23%)
  adding: kaggle/working/w2v_100/variables/variables.index (deflated 56%)
  adding: kaggle/working/w2v_100/assets/ (stored 0%)
  adding: kaggle/working/w2v_100/fingerprint.pb (stored 0%)
  adding: kaggle/working/w2v_100/keras_metadata.pb (deflated 87%)
  adding: kaggle/working/w2v_100/saved_model.pb (deflated 90%)


Generate Text

In [18]:
def generate_text(seed_text, next_words, model, max_seq_len):
    for _ in range(next_words):
        token_list = t.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_seq_len-1, padding='pre')
        
        predict_x=model.predict(token_list)
        predicted=np.argmax(predict_x,axis=1)
#         predicted = model.predict_classes(token_list, verbose=0)
        
        output_word = ''
        
        for word,index in t.word_index.items():
            if index == predicted:
                output_word = word
                break
                
        seed_text = seed_text + " " + output_word
        
    return seed_text.title()

In [89]:
gen = generate_text("september", 100, model, max_sequence_len)



In [90]:
gen

'September 2022 - ( 220001G0 ) Description Is - The New Company Event That The Next Generation. And Be Better World. From Shared Your Services Through Our Community Manager To Ensure The Photo Team To Bring The Company. Other Hospital Stakeholders At All Levels. Ability To Work Independently And Efficiently In A Busy Environment Managing Multiple Projects, Shifting Priorities, And Tight Deadlines. Canada Summer Jobs Program Requirements: Placement Is Full Time Only (35 Hours Per Week) With A Minimum Duration Of Six Weeks And A Maximum Of 16 Weeks Placement Must Occur Between April 25, 2022 And September 3, 2022 Applicants'

In [37]:
from rouge_score import rouge_scorer
import numpy as np

def calculate_rouge_score(target, predicted):
    '''
    target:    a list of strings containing the summarizations as the ground truth
    predicted: a list of strings containing the summarizations from the model
    '''

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL', 'rougeLsum'], use_stemmer=True)
    rouge1_f1_scores = []
    rouge2_f1_scores = []
    rougeL_f1_scores = []
    rougeLsum_f1_scores = []
    for i in range(len(predicted)):
        scores = scorer.score(target[i], predicted[i])
        rouge1_f1_scores.append(scores['rouge1'][2])
        rouge2_f1_scores.append(scores['rouge2'][2])
        rougeL_f1_scores.append(scores['rougeL'][2])
        rougeLsum_f1_scores.append(scores['rougeLsum'][2])
    
    return {'rouge1': np.array(rouge1_f1_scores).mean(),
          'rouge2': np.array(rouge2_f1_scores).mean(),
          'rougeL': np.array(rougeL_f1_scores).mean(),
          'rougeLsum': np.array(rougeLsum_f1_scores).mean()}

In [69]:
corpus[1].replace('\n', '')

'Marketing InternThe MRG Group - HospitalityOttawa,ONThe MRG Group is looking for a Marketing Intern to join our team and gain valuable experience that pertains to their studies.The MRG Group is an industry leader in concerts, hospitality, live entertainment, lifestyle and events. Our mission is to create Positive Shareable Experiences for everyone involved with our businesses.The MRG Group by the numbers in 2021:8 Hospitality Properties across Canada1000+ live shows per year via the largest Independent Concert Promotions Company in Canada, MRG Live5 Live Entertainment Venues10+ Large Scale Events per year (2019)MRG Travel - Curating Travel ExperiencesAdmit One - Ticketing PlatformBeatroute - Global lifestyle digital media companyAs an important part of the marketing team, you will be responsible for assisting the hospitality marketing team with executing marketing and social media initiatives for properties in Ottawa, The Prescott and Par Tee Putt.Reporting into the Hospitality Market

In [48]:
calculate_rouge_score(corpus[1], gen)

{'rouge1': 0.044170890658942794,
 'rouge2': 0.0,
 'rougeL': 0.044170890658942794,
 'rougeLsum': 0.044170890658942794}