# Import Libraries

In [1]:
import os
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

# Read Documents

In [2]:
# List the input root first, then the contents of the two dataset folders read below.
for input_dir in ("../input", "../input/glove6b300dtxt", "../input/wikiarticles"):
    print(os.listdir(input_dir))

['glove6b300dtxt', 'wikiarticles']
['glove.6B.300d.txt']
['SData.txt']


In [3]:
# Load the corpus: every line of SData.txt other than a bare newline is treated
# as one article and split into a list of sentences with NLTK's sent_tokenize.
# `documents` ends up as a list of articles, each a list of sentence strings.
documents = []
# The context manager releases the file handle even if tokenization raises
# part-way through. Undecodable bytes are dropped (errors='ignore').
with open('../input/wikiarticles/SData.txt', encoding="utf-8", errors='ignore') as f:
    for line in f:
        if line != '\n':
            print(line)
            article = sent_tokenize(line)
            documents.append(article)

RMS Titanic was a British passenger liner that sank in the North Atlantic Ocean in 1912 after the ship struck an iceberg during her maiden voyage from Southampton to New York City. Of the estimated 2,224 passengers and crew aboard, more than 1,500 died, making it one of modern history's deadliest peacetime commercial marine disasters. RMS Titanic was the largest ship afloat at the time she entered service and was the second of three Olympic class ocean liners operated by the White Star Line. She was built by the Harland and Wolff shipyard in Belfast. Thomas Andrews, chief naval architect of the shipyard at the time, died in the disaster.Titanic was under the command of Capt. Edward Smith, who also went down with the ship. The ocean liner carried some of the wealthiest people in the world, as well as hundreds of emigrants from Great Britain and Ireland, Scandinavia and elsewhere throughout Europe who were seeking a new life in the United States. The first class accommodation was designe

In [4]:
# Preview the first tokenized article (kept as a one-element list of sentence lists).
print(documents[:1])

[['RMS Titanic was a British passenger liner that sank in the North Atlantic Ocean in 1912 after the ship struck an iceberg during her maiden voyage from Southampton to New York City.', "Of the estimated 2,224 passengers and crew aboard, more than 1,500 died, making it one of modern history's deadliest peacetime commercial marine disasters.", 'RMS Titanic was the largest ship afloat at the time she entered service and was the second of three Olympic class ocean liners operated by the White Star Line.', 'She was built by the Harland and Wolff shipyard in Belfast.', 'Thomas Andrews, chief naval architect of the shipyard at the time, died in the disaster.Titanic was under the command of Capt.', 'Edward Smith, who also went down with the ship.', 'The ocean liner carried some of the wealthiest people in the world, as well as hundreds of emigrants from Great Britain and Ireland, Scandinavia and elsewhere throughout Europe who were seeking a new life in the United States.', 'The first class a

# Preprocessing

In [5]:
# Remove punctuation
# Each character listed in `punctuations` is replaced by a single space (not
# deleted), and the sentence is lower-cased afterwards. A translation table
# performs all the single-character substitutions in one pass per sentence.
clean_docs = []
punctuations = "\\`*_,{}[]()>#+-.\"\'!;:$"
punct_to_space = str.maketrans(punctuations, " " * len(punctuations))
for doc in documents:
    clean_docs.append([sentence.translate(punct_to_space).lower() for sentence in doc])
print(clean_docs)

[['rms titanic was a british passenger liner that sank in the north atlantic ocean in 1912 after the ship struck an iceberg during her maiden voyage from southampton to new york city ', 'of the estimated 2 224 passengers and crew aboard  more than 1 500 died  making it one of modern history s deadliest peacetime commercial marine disasters ', 'rms titanic was the largest ship afloat at the time she entered service and was the second of three olympic class ocean liners operated by the white star line ', 'she was built by the harland and wolff shipyard in belfast ', 'thomas andrews  chief naval architect of the shipyard at the time  died in the disaster titanic was under the command of capt ', 'edward smith  who also went down with the ship ', 'the ocean liner carried some of the wealthiest people in the world  as well as hundreds of emigrants from great britain and ireland  scandinavia and elsewhere throughout europe who were seeking a new life in the united states ', 'the first class a

In [6]:
# Remove stopwords
# Each cleaned sentence is split on whitespace, English stopwords are dropped,
# and the remaining words are re-joined with single spaces. A sentence made up
# only of stopwords becomes the empty string.
clean_docs2 = []
stopw = stopwords.words('english')
# Membership is tested once per word, so use a set for O(1) lookups instead of
# scanning the stopword list each time. `stopw` itself is left as a list.
stopw_lookup = set(stopw)
for d in clean_docs:
    clean_sents2 = []
    for s in d:
        kept_words = [w for w in s.split() if w not in stopw_lookup]
        clean_sents2.append(" ".join(kept_words))
    clean_docs2.append(clean_sents2)

# Fetch Word Embeddings

In [7]:
# Build the word -> vector lookup from the GloVe text file. Each line is split
# on whitespace: the first field is the token, the remaining fields are parsed
# as float32 vector components.
word2vec = {}
# The context manager closes the file even if a malformed line raises while parsing.
with open('../input/glove6b300dtxt/glove.6B.300d.txt', encoding='utf-8') as embeddings:
    for line in embeddings:
        components = line.split()
        if not components:
            # A blank line has no token; skip it rather than fail on components[0].
            continue
        word = components[0]
        word2vec[word] = np.asarray(components[1:], dtype='float32')

# Create Sentence Embeddings

In [8]:
# Build one 300-dimensional vector per sentence: the sum of its word vectors
# divided by (word count + 0.001). Words missing from word2vec contribute a
# zero vector; an empty sentence maps to the zero vector.
sentence_vectors = []
for doc_sentences in clean_docs2:
    doc_vectors = []
    for sentence in doc_sentences:
        if not sentence:
            doc_vectors.append(np.zeros((300,)))
            continue
        words = sentence.split()
        word_vectors = [word2vec.get(w, np.zeros((300,))) for w in words]
        doc_vectors.append(sum(word_vectors) / (len(words) + 0.001))
    sentence_vectors.append(doc_vectors)

In [9]:
# Number of sentences in each document, in document order.
l = [len(doc_sentences) for doc_sentences in clean_docs2]
    

# Similarity and Ranking

In [10]:
# For every document, build the sentence-by-sentence cosine-similarity matrix
# (float32) with a zero diagonal, so a sentence is never linked to itself.
similarity_matrices = []
for doc_vectors in sentence_vectors:
    if len(doc_vectors) == 0:
        # A document with no sentences gets an empty matrix, as np.zeros((0, 0)) would.
        sim_mat = np.zeros((0, 0), dtype='float32')
    else:
        # One pairwise call on the stacked (n_sentences, 300) array replaces a
        # separate cosine_similarity call for every ordered sentence pair.
        sim_mat = cosine_similarity(np.vstack(doc_vectors)).astype('float32')
        np.fill_diagonal(sim_mat, 0.0)
    similarity_matrices.append(sim_mat)

In [11]:
# Rank sentences per document: the similarity matrix is turned into a weighted
# graph, PageRank scores each sentence node, and the ORIGINAL (uncleaned)
# sentences are sorted by score, highest first.
doc_scores = []
doc_ranked = []
for doc_idx in range(len(l)):
    graph = nx.from_numpy_array(similarity_matrices[doc_idx])
    scores = nx.pagerank(graph)
    ranked_sentences = [
        (scores[pos], sentence) for pos, sentence in enumerate(documents[doc_idx])
    ]
    ranked_sentences.sort(reverse=True)
    doc_scores.append(scores)
    doc_ranked.append(ranked_sentences)

In [12]:
# Generate summary
# Each summary takes the top-ranked sentences of its document, in rank order.
SUMMARY_RATIO = 0.25  # fraction of a document's sentences kept in its summary
summaries = []
for x in range(len(l)):
    sn = int(SUMMARY_RATIO * l[x])
    if l[x] > 0:
        # Documents with fewer than 4 sentences would otherwise truncate to 0
        # and produce an empty summary; always keep at least the top sentence.
        sn = max(1, sn)
    # doc_ranked entries are (score, sentence) pairs; keep only the sentence text.
    summary = [sentence for _, sentence in doc_ranked[x][:sn]]
    summaries.append(summary)

In [13]:
# Write one summary per document to Summarized.txt, each followed by a blank line.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# summaries again rather than replacing the previous run's output.
# UTF-8 is set explicitly because the source articles were read as UTF-8.
with open("Summarized.txt", "a+", encoding="utf-8") as op:
    # The loop variable must not be called `sum`: that would shadow the builtin
    # used by the sentence-embedding cell and break it on a re-run.
    for summary_sentences in summaries:
        # Join with a space so consecutive sentences do not run together.
        op.write(" ".join(summary_sentences))
        op.write("\n\n")