In [1]:
import pandas as pd #Dataframe Manipulation library
import numpy as np #Data Manipulation library
from pathlib import Path

#sklearn modules for Feature Extraction & Modelling
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

#Libraries for Plotting 
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter

#Read files Iteratively
import glob
import os

In [2]:
def load_data(folder_names, root_path):
    fileNames = [path + "./BBC News Summary/News Articles/" + folder + '/' + "*.txt"
        for path,folder in zip([root_path]*len(folder_names), folder_names)]

    doc_list = []
    tags = folder_names
    for docs in fileNames:
        #print(docs)
        doc = glob.glob(docs)#glob method iterates through all files and reads the text in documents in the folders
        for text in doc:
            with open(text, encoding="latin-1") as f:
                topic = docs.split('/')[len(docs.split('/'))-2]
                lines = f.readlines()
                heading = lines[0].strip()#stripping the text by spaces and using first element into heading
                body = ' '.join([l.strip() for l in lines[1:]])
                doc_list.append([topic,heading,body])
        print(f"Loading data from \033[1m{topic}\033[0m directory")
    print("\nEntire Data is loaded successfully")
    
    return doc_list
folder_names = ['business','entertainment','politics','sport','tech']
docs = load_data(folder_names=folder_names,root_path=os.getcwd())

Loading data from [1mbusiness[0m directory
Loading data from [1mentertainment[0m directory
Loading data from [1mpolitics[0m directory
Loading data from [1msport[0m directory
Loading data from [1mtech[0m directory

Entire Data is loaded successfully


In [3]:
pd.read_csv('headings.csv')

Unnamed: 0.1,Unnamed: 0,Category,Heading,Article
0,0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...
...,...,...,...,...
2220,2220,tech,BT program to beat dialler scams,BT is introducing two initiatives to help bea...
2221,2221,tech,Spam e-mails tempt net shoppers,Computer users across the world continue to i...
2222,2222,tech,Be careful how you code,A new European directive could put software w...
2223,2223,tech,US cyber security chief resigns,The man making sure US computer networks are ...


In [4]:
data = pd.DataFrame(docs, columns = ['Category','Heading','Article'])
data.head()

Unnamed: 0,Category,Heading,Article
0,business,Ad sales boost Time Warner profit,Quarterly profits at US media giant TimeWarne...
1,business,Dollar gains on Greenspan speech,The dollar has hit its highest level against ...
2,business,Yukos unit buyer faces loan claim,The owners of embattled Russian oil giant Yuk...
3,business,High fuel prices hit BA's profits,British Airways has blamed high fuel prices f...
4,business,Pernod takeover talk lifts Domecq,Shares in UK drinks and food firm Allied Dome...


In [5]:
data.to_csv('headings.csv')

In [6]:
vectorizer = TfidfVectorizer(stop_words = 'english')

In [7]:
vectors = vectorizer.fit_transform(data["Heading"].values) # .values: convert DataFrame columns into List.List of data will be transformed into tfidf vector
print(f"The shape of the tfidf |matrix : {vectors.shape}")
print(f"There are {vectors.shape[0]} number of News Articles having {vectors.shape[1]} unique words in tfidf vectors")

The shape of the tfidf |matrix : (2225, 3623)
There are 2225 number of News Articles having 3623 unique words in tfidf vectors


In [8]:
new_query = ["Stock Market Rates are rising"]
new_query_vector = vectorizer.transform(new_query)
new_query_vector

<1x3623 sparse matrix of type '<class 'numpy.float64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [9]:
sim = cosine_similarity(X = vectors, Y = new_query_vector)

In [10]:
sim

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [0.]])

In [11]:
ind = np.argsort(sim,axis = 0)[::-1][:10]
print("Top 10 News Articles similar to new_query are : \n")
for i in ind:
    print(data['Heading'].values[i])

Top 10 News Articles similar to new_query are : 

['Stock market eyes Japan recovery']
['US interest rates increased to 2%']
['French consumer spending rising']
["Labour's core support takes stock"]
['Home loan approvals rising again']
["Small firms 'hit by rising costs'"]
['UK interest rates held at 4.75%']
['Australia rates at four year high']
['Bank set to leave rates on hold']
['Bank opts to leave rates on hold']


In [12]:
#Extract Index of Maximum Valued similar document
argmax = np.argmax(sim)
print(f"Index of the maximum valued similar document is : \033[1m{argmax}\033[0m")
print(f"Retrieved Document Header is : \033[1m{data.Heading[argmax]}\033[0m")

Index of the maximum valued similar document is : [1m282[0m
Retrieved Document Header is : [1mStock market eyes Japan recovery[0m


In [13]:
def retrieve_doc(new_query,raw_docs,colname = None): # inputs are new_query,corpus,colname from the dataframe to be used for raw document text
    vectorizer = TfidfVectorizer(stop_words = 'english') #convert to Tfidf Vectorizer
    vectors = vectorizer.fit_transform(raw_docs[colname]) #preprocess the document, fit the model of tfidf document, transform it
    print(f"The shape of the tfidf matrix : {vectors.shape}")
    print(f"There are {vectors.shape[0]} number of News Articles having {vectors.shape[1]} unique words in tfidf vectors")
    new_query = [new_query] #tfidf vectorizer accepts on list or an array(doesn't work on raw text)
    new_query_vector = vectorizer.transform(new_query) #just transforms/calculates the frequency(of new_query) against the tokens we already have in matrix 
    new_query_vector
    sim = cosine_similarity(X = vectors, Y = new_query_vector)#pairwise cosine similarity
    argmax = np.argmax(sim)
    print(f"\nIndex of the maximum valued similar document is : \033[1m{argmax}\033[0m")
    print(f"Retrieved Document Header is : \033[1m{data.Heading[argmax]}\033[0m")
    ind = np.argsort(sim,axis = 0)[::-1][:10] #sorts similarity scores in [::-1] descending order ,[:10] top 10 most similar articles
    print("\nTop 10 News Articles similar to new_query are : \n")
    for i in ind:
        print(data.Heading.values[i])#prints the Headings of the top 10 similar articles

In [14]:
new_query = "IBM Technology Services"
retrieve_doc(new_query,raw_docs=data,colname =  "Article")

The shape of the tfidf matrix : (2225, 28980)
There are 2225 number of News Articles having 28980 unique words in tfidf vectors

Index of the maximum valued similar document is : [1m2091[0m
Retrieved Document Header is : [1mIBM frees 500 software patents[0m

Top 10 News Articles similar to new_query are : 

['IBM frees 500 software patents']
['IBM puts cash behind Linux push']
['IBM puts cash behind Linux push']
['Why Cell will get the hard sell']
["Profits stall at China's Lenovo"]
['When invention turns to innovation']
['PlayStation 3 processor unveiled']
['Supercomputer breaks speed record']
["Mobiles 'not media players yet'"]
["Mobiles 'not media players yet'"]
