# Semantic Search Project 

## Finding hot keywords using feature extraction methods
-------------------
*By Fady Shehata*




In [48]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import os

import re
import nltk.corpus
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import digits
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

import spacy
from collections import Counter
from string import punctuation

from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF
import sklearn.metrics as metrics

import itertools
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences

from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.models import Sequential

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [49]:
#import data
train = pd.read_csv('BBC News Train.csv')
test = pd.read_csv('BBC News Test.csv')


# Exploratory Analysis (EDA)


In [51]:
train  #display training data records

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [52]:
train.info()  #check the type of data, null value counts and number of entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


In [None]:
train['Category'].unique()  #display data labels (Extra)

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [53]:
#data preprocessing

def clean_text(dataframe, text_col):

    #remove all punctuation
    dataframe['no_punct'] = dataframe[text_col].apply(lambda row: re.sub(r'[^\w\s]+', '', row))
    
    #remove numbers 
    dataframe['no_punct_num'] = dataframe['no_punct'].apply(lambda row: re.sub(r'[0-9]+', '', row))
    
    #remove stopwords
    stop_words = stopwords.words('english')
    dataframe['no_stopwords'] = dataframe['no_punct_num'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
    
    #remove extra spaces
    dataframe['clean_text'] = dataframe['no_stopwords'].apply(lambda x: re.sub(' +', ' ', x))
    return 

clean_text(train, 'Text')

In [54]:
# tokenize and lemmatize text  

wordnet_lemmatizer = WordNetLemmatizer()
def lemmatizer(text):
    lem = [wordnet_lemmatizer.lemmatize(word.lower()) for word in text] 
    return lem

def tokenize_lemmatize(dataframe, text_col):
    dataframe['tokenized'] = dataframe.apply(lambda row: nltk.word_tokenize(row[text_col]), axis=1)
    dataframe['lemmatized'] = dataframe['tokenized'].apply(lambda string: lemmatizer(string))
    dataframe['num_words'] = dataframe['lemmatized'].apply(lambda lst: len(lst))
    return

tokenize_lemmatize(train, 'clean_text')

In [56]:
#display final result words for one sample article

train['lemmatized'][1]

['german',
 'business',
 'confidence',
 'slide',
 'german',
 'business',
 'confidence',
 'fell',
 'february',
 'knocking',
 'hope',
 'speedy',
 'recovery',
 'europe',
 'largest',
 'economy',
 'munichbased',
 'research',
 'institute',
 'ifo',
 'said',
 'confidence',
 'index',
 'fell',
 'february',
 'january',
 'first',
 'decline',
 'three',
 'month',
 'study',
 'found',
 'outlook',
 'manufacturing',
 'retail',
 'sector',
 'worsened',
 'observer',
 'hoping',
 'confident',
 'business',
 'sector',
 'would',
 'signal',
 'economic',
 'activity',
 'picking',
 'surprised',
 'ifo',
 'index',
 'taken',
 'knock',
 'said',
 'dz',
 'bank',
 'economist',
 'bernd',
 'weidensteiner',
 'main',
 'reason',
 'probably',
 'domestic',
 'economy',
 'still',
 'weak',
 'particularly',
 'retail',
 'trade',
 'economy',
 'labour',
 'minister',
 'wolfgang',
 'clement',
 'called',
 'dip',
 'february',
 'ifo',
 'confidence',
 'figure',
 'mild',
 'decline',
 'said',
 'despite',
 'retreat',
 'index',
 'remained',
 're

# Feature Extraction

Apply feature extraction which helps in finding hot keywords

# - Pos Tagging method

In [57]:
'''
use pre-trained model which contains such as 
tokenization, part-of-speech tagging, and more 
to analyze text
'''
nlp = spacy.load("en_core_web_sm")

def get_hotwords(text):
    result = []
    pos_tag = ['PROPN', 'ADJ', 'NOUN'] 
    doc = nlp(text.lower()) 
    for token in doc:
        if(token.text in nlp.Defaults.stop_words or token.text in punctuation):
            continue    
        if(token.pos_ in pos_tag): #find pos taggs for text 
            result.append(token.text)
    return result


output = set(get_hotwords(train['clean_text'][1]))  #get unique value for hot keywords returned from the function

most_common_list = Counter(output).most_common(10)  #get most common 10 keywords

#display them
for item in most_common_list:
  print(item[0])

confident
unions
knock
unemployment
figures
wolfgang
confidence
volkswagen
things
dip


# - Word Embedding method

In [58]:
'''
use Word2Vec pre-trained model to represent the words as vectors
this method helps us to find the most similar words for a specific word in the atricle 
'''

#apply get word embeddings for one sample article 
model = gensim.models.Word2Vec([train['tokenized'][1]], size=100, window=5, min_count=1)
model.save("word2vec.model")

sims = model.wv.most_similar('economy', topn=10)  #display similar words to the word 'economy for example'
sims



[('fears', 0.2052503526210785),
 ('modest', 0.17909425497055054),
 ('probably', 0.16303302347660065),
 ('hopes', 0.15738312900066376),
 ('despite', 0.15249696373939514),
 ('grew', 0.14367607235908508),
 ('may', 0.14023339748382568),
 ('president', 0.13618412613868713),
 ('less', 0.13561111688613892),
 ('german', 0.13234589993953705)]

# - TF-IDF method

In [15]:
train_df = train.copy()

In [70]:
#create vectorizer
tfidvec = TfidfVectorizer(min_df = 2,
                          max_df = 0.95,
                          norm = 'l2',
                          stop_words = 'english')
tfidvec_train = tfidvec.fit_transform(train_df['clean_text'])


In [75]:
feature_names = tfidvec.get_feature_names_out()  #method is used to get the list of terms in the vocabulary

top_n = 10 # specify the number of top keywords to extract for each document


tfidf_scores = tfidvec_train[1].toarray().flatten()  #use one sample article to get hot keywords from it

top_indices = tfidf_scores.argsort()[-top_n:][::-1]  # get indices of top keywords

top_keywords = [feature_names[idx] for idx in top_indices]  #get the top keywords

print({', '.join(top_keywords)}) #displa them

{'german, economy, confidence, index, february, manufacturing, retail, business, decline, exports'}


# - NFM Model

In [82]:
#create model
nmf_model = NMF(n_components=5, 
                init='nndsvda', 
                solver = 'mu',
                beta_loss = 'kullback-leibler',
                l1_ratio = 0.5,
                random_state = 101)

nmf_model.fit(tfidvec_train) #fit the tf-idf feature data to the model

In [81]:
#find hot keywords in the 5 topics 
for topic_idx, topic in enumerate(nmf_model.components_):
    print("Topic %d:" % (topic_idx+1))
    print(" ".join([feature_names[i] for i in topic.argsort()[:-6:-1]]))

Topic 1:
game england win said cup
Topic 2:
mr said labour blair election
Topic 3:
mobile people said software technology
Topic 4:
film best awards music band
Topic 5:
bn said growth market economy


# - LSTM Model


Prepare data by handeling the data labels and balance classes, so the data could be ready to be fitted in the models



In [16]:
train_df['Category'].value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [59]:
#balanced classes
num_of_categories = 45000
shuffled = train_df.reindex(np.random.permutation(train_df.index))
sport    = shuffled[shuffled['Category'] == 'sport'][:num_of_categories]
business = shuffled[shuffled['Category'] == 'business'][:num_of_categories]
politics = shuffled[shuffled['Category'] == 'politics'][:num_of_categories]
entertainment = shuffled[shuffled['Category'] == 'entertainment'][:num_of_categories]
tech = shuffled[shuffled['Category'] == 'tech'][:num_of_categories]
concated = pd.concat([sport,business,politics,entertainment,tech], ignore_index=True)
#Shuffle the dataset
concated = concated.reindex(np.random.permutation(concated.index))
concated['LABEL'] = 0

In [60]:
#One-hot encode for the labels
concated.loc[concated['Category'] == 'sport', 'LABEL'] = 0
concated.loc[concated['Category'] == 'business', 'LABEL'] = 1
concated.loc[concated['Category'] == 'politics', 'LABEL'] = 2
concated.loc[concated['Category'] == 'entertainment', 'LABEL'] = 3
concated.loc[concated['Category'] == 'tech', 'LABEL'] = 4
print(concated['LABEL'][:10])
labels = to_categorical(concated['LABEL'], num_classes=5)
print(labels[:10])
if 'Category' in concated.keys():
    concated.drop(['Category'], axis=1)

958     3
1242    4
65      0
269     0
1444    4
784     2
828     2
594     1
260     0
791     2
Name: LABEL, dtype: int64
[[0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1.]
 [0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0.]]


In [61]:
#tokenize the cleaned text and  
n_most_common_words = 8000
max_len = 130
tokenizer = Tokenizer(num_words=n_most_common_words, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(concated['clean_text'].values)
sequences = tokenizer.texts_to_sequences(concated['clean_text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X = pad_sequences(sequences, maxlen=max_len) #pad the sequences with the same length to fit in the model

Found 25231 unique tokens.


In [62]:
#split the data into train and test so the model can be trained better

X_train, X_test, y_train, y_test = train_test_split(X , labels, test_size=0.25, random_state=42)

In [64]:
#intialize hyperparameters with random values 
epochs = 30
emb_dim = 128
batch_size = 64

In [65]:
#build a sequential DL model  
model = Sequential()
model.add(Embedding(n_most_common_words, emb_dim, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.7))
model.add(LSTM(64, dropout=0.7, recurrent_dropout=0.7))
model.add(Dense(5, activation='softmax'))

In [66]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [38]:
history = model.fit(X_train, y_train, 
                    epochs=epochs, 
                    batch_size=batch_size,
                    validation_split=0.2,
                    callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.01)])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [46]:
max_words = 10000
# Extract the relevant words using LSTM weights
weights = model.layers[0].get_weights()[0]  # get the weights of the LSTM layer
keywords = []

for word, idx in word_index.items():
    if idx < max_words and idx < len(weights[0]):  # only consider the top max_words words
        vec = np.expand_dims(weights[:, idx], axis=1)
        score = np.dot(vec.T, weights).squeeze()
        if score.any() > 0.5:  # set a threshold to determine if the word is relevant
            keywords.append(word)

print(keywords)


['said', 'mr', 'would', 'also', 'new', 'people', 'us', 'year', 'one', 'could', 'first', 'last', 'two', 'world', 'uk', 'time', 'government', 'years', 'film', 'bn', 'best', 'make', 'told', 'get', 'made', 'many', 'game', 'three', 'number', 'like', 'music', 'next', 'labour', 'back', 'bbc', 'set', 'may', 'take', 'well', 'added', 'way', 'market', 'says', 'good', 'home', 'election', 'england', 'firm', 'still', 'win', 'company', 'blair', 'use', 'show', 'going', 'since', 'say', 'week', 'games', 'party', 'work', 'top', 'million', 'go', 'much', 'play', 'want', 'mobile', 'minister', 'second', 'part', 'used', 'public', 'think', 'see', 'british', 'even', 'group', 'players', 'country', 'end', 'european', 'technology', 'plans', 'however', 'expected', 'news', 'sales', 'put', 'service', 'tv', 'brown', 'already', 'five', 'months', 'london', 'growth', 'net', 'six', 'former', 'four', 'deal', 'tax', 'britain', 'director', 'economy', 'services', 'help', 'come', 'money', 'big', 'according', 'chief', 'record',