**Importing and downloading necessary libraries**

In [None]:
import numpy as np
import pandas as pd
import re
import random
from scipy import spatial
import networkx as nx

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec

In [None]:
# Fetch the NLTK resources used below: the sentence tokenizer data and the
# stop-word corpus. stopwords.words() raises LookupError when the 'stopwords'
# corpus has not been downloaded, so it is requested explicitly here.
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead of
# 'punkt' -- confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words('english')

**Importing a dataset containing gold summaries (for reference)**

In [None]:
# Read the articles CSV; later cells use its 'Text' and 'Summary' columns.
df = pd.read_csv(
    "../input/bbc-articles/BBCarticles_csv.csv",
    encoding="unicode_escape",
)
df.head()

**Preprocessing the dataset**

In [None]:
# Drop incomplete rows and renumber the rest (reset_index() keeps the previous
# index as an extra leading 'index' column), then flatten line breaks so every
# article and summary is a single line of text.
df = (
    df.dropna()
    .reset_index()
    .assign(
        Text=lambda frame: frame['Text'].apply(lambda text: text.replace('\n', ' ')),
        Summary=lambda frame: frame['Summary'].apply(lambda text: text.replace('\n', ' ')),
    )
)
df.head()

**Setting up inputs**

In [None]:
# Pick one article at random. random.randrange excludes the upper bound, so the
# row position is always valid; random.randint(0, n) could return n, which is
# one past the last row.
rand = random.randrange(df.shape[0])
print(rand)
# Select by column label rather than column position, so the lookup does not
# depend on where reset_index() placed its extra 'index' column.
sample_text = df['Text'].iloc[rand]
gold_summary = df['Summary'].iloc[rand]
print("Text: ", sample_text)
print("Gold Summary: ", gold_summary)

#sample_text = "Record year for Chilean copper  Chile's copper industry has registered record earnings of $14.2bn in 2004, the governmental Chilean Copper Commission (Cochilco) has reported.  Strong demand from China's fast-growing economy and high prices have fuelled production, said Cochilco vice president Patricio Cartagena. He added that the boom has allowed the government to collect $950m in taxes. Mr Cartagena said the industry expects to see investment worth $10bn over the next three years.  'With these investments, clearly we are going to continue being the principle actor in the mining of copper. It's a consolidation of the industry with new projects and expansions that will support greater production.' Australia's BHP Billiton - which operates La Escondida, the world's largest open pit copper mine - is planning to invest $1.9bn between now and 2007, while state-owned Codelco will spend about $1bn on various projects. Chile, the biggest copper producer in the world, is now analyzing ways of to keep prices stable at their current high levels, without killing off demand or leading customers to look for substitutes for copper. The copper price reached a 16-year high in October 2004. Production in Chile is expected rise 3.5% in 2005 to 5.5 million tonnes, said Mr Cartagena. Cochilco expects for 2005 a slight reduction on copper prices and forecasts export earnings will fall 10.7%. "

**Preprocessing the input**

Tokenizing sentences

In [None]:
# Split the sampled article into sentences; these are the units that get
# scored and selected further down.
tokenized_sentences = sent_tokenize(sample_text)
num_sentences = len(tokenized_sentences)
print(num_sentences)

Removing punctuation and stopwords, and converting to lower case

In [None]:
# Lower-case each sentence and strip every character that is neither a word
# character nor whitespace.
preprocessed_sentences = [
    re.sub(r'[^\w\s]', '', sentence.lower()) for sentence in tokenized_sentences
]
# Build a set once so each stop-word membership test is a hash lookup.
stop_word_set = set(stop_words)
# split() with no argument collapses runs of whitespace, so consecutive spaces
# do not produce empty-string tokens the way split(' ') does.
sentence_tokens = [
    [word for word in sentence.split() if word not in stop_word_set]
    for sentence in preprocessed_sentences
]

In [None]:
# Inspect the per-sentence token lists produced by the cleaning step.
print(sentence_tokens)

In [None]:
# Fit a Word2Vec model (sg=0 selects CBOW) on the tokenised sentences.
# min_count=1 keeps every word in the vocabulary, and vector_size=1 gives each
# word a one-dimensional vector.
w2v = Word2Vec(
    sentences=sentence_tokens,
    vector_size=1,
    min_count=1,
    sg=0,
    epochs=1000,
)
# View over the words known to the trained model.
vocab = w2v.wv.key_to_index.keys()

In [None]:
# Represent each sentence as the list of its words' vocabulary indices.
# NOTE(review): get_index() returns a word's position in the vocabulary, not
# its trained vector -- confirm this is intended rather than w2v.wv[word].
sentence_embeddings = [
    [w2v.wv.get_index(token) for token in tokens]
    for tokens in sentence_tokens
]
# Right-pad with zeros so every sentence has the length of the longest one.
max_len = max(len(tokens) for tokens in sentence_tokens)
sentence_embeddings = [
    np.pad(indices, (0, max_len - len(indices)), mode='constant')
    for indices in sentence_embeddings
]

In [None]:
# Pairwise cosine similarity between the padded sentence embeddings.
similarity_matrix = np.zeros([len(sentence_tokens), len(sentence_tokens)])
for i, row_embedding in enumerate(sentence_embeddings):
    for j, column_embedding in enumerate(sentence_embeddings):
        # Cosine distance is undefined for an all-zero vector (for example a
        # sentence left with no tokens). Leave the similarity at 0 in that
        # case so no non-finite weight reaches the ranking step.
        if not np.any(row_embedding) or not np.any(column_embedding):
            continue
        similarity_matrix[i, j] = 1 - spatial.distance.cosine(row_embedding, column_embedding)

In [None]:
# Build a graph from the similarity matrix and run PageRank over it.
# `scores` is looked up by sentence position further down.
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [None]:
# Show the sentence count as a reference point for choosing the summary length.
print("Number of sentences in text: ", num_sentences)

In [None]:
# Number of top-ranked sentences to keep in the extractive summary.
num_sent_sum = 5

In [None]:
# Map each sentence to its PageRank score, then keep the num_sent_sum
# highest-scoring entries.
top_sentence = dict(
    (sentence, scores[position])
    for position, sentence in enumerate(tokenized_sentences)
)
top = dict(
    sorted(top_sentence.items(), key=lambda item: item[1], reverse=True)[:num_sent_sum]
)

In [None]:
# Emit the selected sentences in their original document order, separated by a
# single space; plain concatenation ran consecutive sentences together.
extractive_summary = ' '.join(
    sent for sent in tokenized_sentences if sent in top
)

In [None]:
# Show the source article, the reference summary and the generated summary.
for label, content in (
    ("\nText: ", sample_text),
    ("\nGold Summary: ", gold_summary),
    ("\nExtractive Summary: ", extractive_summary),
):
    print(label, content)