Ex.1 TextRank

In [1]:
import string
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import contractions

nltk.download("stopwords")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/falaputin2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def load_glove_vectors(fn):
    print("Loading Glove Model")
    with open(fn, "r", encoding="utf8") as glove_vector_file:
        model = {}
        for line in glove_vector_file:
            parts = line.split()
            word = parts[0]
            embedding = np.array([float(val) for val in parts[1:]])
            model[word] = embedding
        print("Loaded {} words".format(len(model)))
    return model


glove_vectors = load_glove_vectors("glove.6B.50d.txt")

Loading Glove Model
Loaded 400000 words


In [3]:
articles = (
    pd.read_csv("./small_wikidump.csv")
    .sample(10, random_state=42)
    .reset_index(drop=True)
)
CLEAN_PATTERN = r"[^a-zA-z\s]"


def clean(word):
    return re.sub(CLEAN_PATTERN, "", word)


def clean_sentence(sentence):
    sentence = [clean(word) for word in sentence]
    if len(sentence) == 0:
        print("empty")
        sentence = ["PAD"]
    return [word for word in sentence if word]


def clean_sentences(sentences):
    return [clean_sentence(sentence) for sentence in sentences]


def lower(sentence):
    return [word.lower() for word in sentence]


def remove_stopwords(sentence):
    words = [word for word in sentence if word not in stop_words]
    return [word for word in words if len(word) > 0]


def tokenize_words(sentences):
    return [word_tokenize(sentence) for sentence in sentences]


def fix_contractions(sentences):
    return [contractions.fix(sentence) for sentence in sentences]


def remove_n(text):
    return text.replace("\n", " ")


def pad_empty(word_senteces):
    new_word_sentences = []
    for word_sentece in word_senteces:
        if len(word_sentece) == 0:
            new_word_sentences.append(["PAD"])
        else:
            new_word_sentences.append(word_sentece)
    return new_word_sentences


articles["text"] = articles.text.apply(remove_n)
articles["SentencesInArticle"] = articles.text.apply(sent_tokenize)
articles["WordsInSentences"] = (
    articles.SentencesInArticle.apply(fix_contractions)
    .apply(lower)
    .apply(tokenize_words)
    .apply(remove_stopwords)
    .apply(clean_sentences)
)

articles["WordsInSentences"] = articles["WordsInSentences"].apply(pad_empty)
articles = articles[["SentencesInArticle", "WordsInSentences"]]
articles.head()

Unnamed: 0,SentencesInArticle,WordsInSentences
0,[Humraaz () is a 2002 Indian Hindi-language mu...,"[[humraaz, is, a, indian, hindilanguage, music..."
1,[Yehoshua Glazer (29 December 1927 - 29 Decemb...,"[[yehoshua, glazer, december, december, was, a..."
2,"[On 27 October 2018, an AgustaWestland AW169 h...","[[on, october, an, agustawestland, aw, helicop..."
3,[Randolph County is a county in the U.S. state...,"[[randolph, county, is, a, county, in, the, yo..."
4,[The Real Adventures of Jonny Quest is an Amer...,"[[the, real, adventures, of, jonny, quest, is,..."


In [4]:
VECTOR_SIZE = 50
EMPTY_VECTOR = np.zeros(VECTOR_SIZE)


def sentence_vector(sentence):
    return sum([glove_vectors.get(word, EMPTY_VECTOR) for word in sentence]) / len(
        sentence
    )


def sentences_to_vectors(sentences):
    return [sentence_vector(sentence) for sentence in sentences]


def similarity_matrix(sentence_vectors):
    sim_mat = np.zeros([len(sentence_vectors), len(sentence_vectors)])
    for i in range(len(sentence_vectors)):
        for j in range(len(sentence_vectors)):
            element_i = sentence_vectors[i].reshape(1, VECTOR_SIZE)
            element_j = sentence_vectors[j].reshape(1, VECTOR_SIZE)
            sim_mat[i][j] = cosine_similarity(element_i, element_j)[0, 0]
    return sim_mat


def compute_graph(sim_matrix):
    nx_graph = nx.from_numpy_array(sim_matrix)
    scores = nx.pagerank(nx_graph)
    return scores


articles["SentenceVector"] = articles.WordsInSentences.apply(sentences_to_vectors)
articles["SimMatrix"] = articles.SentenceVector.apply(similarity_matrix)
articles["Graph"] = articles.SimMatrix.apply(compute_graph)

In [5]:
def get_ranked_sentences(sentences, scores, n=3):
    top_scores = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    top_n_sentences = [sentence for score, sentence in top_scores[:n]]
    return " ".join(top_n_sentences)


articles["Summary"] = articles.apply(
    lambda d: get_ranked_sentences(d.SentencesInArticle, d.Graph), axis=1
)

In [8]:
for i in range(3):
    print(articles.loc[i].Summary)

The film is loosely based on the 1998 film A Perfect Murder. Humraaz () is a 2002 Indian Hindi-language musical romantic thriller film directed by the duo Abbas-Mustan. This film received positive reviews and was extremely successful at the box office.
He was born in Tel Aviv. Glazer died on his 91st birthday on 29 December 2018 in Tel Aviv. He played for Maccabi Tel Aviv and for the Israel national football team his entire career.
Club owner Vichai Srivaddhanaprabha was on board, as well as two other passengers and two pilots. On 27 October 2018, an AgustaWestland AW169 helicopter crashed shortly after take-off from the King Power Stadium, the home ground of Leicester City in Leicester, United Kingdom. The Air Accidents Investigation Branch is currently leading an investigation into the accident.


Ex2. Word Frequency Text Summarization

In [3]:
import re
import nltk
from collections import Counter
import string
import pandas as pd

nltk.download("punkt")
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/falaputin2/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/falaputin2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
articles = (
    pd.read_csv("./small_wikidump.csv")
    .sample(10, random_state=42)
    .reset_index(drop=True)
)
CLEAN_PATTERN = r"[^a-zA-z\s]"


def clean(word):
    return re.sub(CLEAN_PATTERN, "", word)


def clean_sentence(sentence):
    sentence = [clean(word) for word in sentence]
    if len(sentence) == 0:
        print("empty")
        sentence = ["PAD"]
    return [word for word in sentence if word]


def clean_sentences(sentences):
    return [clean_sentence(sentence) for sentence in sentences]


def lower(sentence):
    return [word.lower() for word in sentence]


def remove_stopwords(sentence):
    words = [word for word in sentence if word not in stop_words]
    return [word for word in words if len(word) > 0]


def tokenize_words(sentences):
    return [word_tokenize(sentence) for sentence in sentences]


def fix_contractions(sentences):
    return [contractions.fix(sentence) for sentence in sentences]


def remove_n(text):
    return text.replace("\n", " ")


def pad_empty(word_senteces):
    new_word_sentences = []
    for word_sentece in word_senteces:
        if len(word_sentece) == 0:
            new_word_sentences.append(["PAD"])
        else:
            new_word_sentences.append(word_sentece)
    return new_word_sentences


articles["text"] = articles.text.apply(remove_n)

In [16]:
def preprocess_text(text):
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]
    words = [word for word in words if word not in stop_words]
    return words


def word_frequencies(texts):
    all_words = []
    for text in texts:
        words = preprocess_text(text)
        all_words.extend(words)
    frequency = Counter(all_words)
    max_freq = frequency.most_common(1)[0][1]  # find most common frequence
    word_freq = {word: freq / max_freq for word, freq in frequency.items()}
    return word_freq


def filter_frequencies(word_freq, min_prop=0.1, max_prop=0.9):
    filtered_words = {
        word: freq for word, freq in word_freq.items() if min_prop <= freq <= max_prop
    }
    return filtered_words


def summarize_text(text, word_freq, top_n=1):
    sentences = sent_tokenize(text)
    sentence_scores = {}
    for sentence in sentences:
        words = preprocess_text(sentence)
        score = sum(word_freq.get(word, 0) for word in words)
        sentence_scores[sentence] = score
    top_sentences = sorted(sentence_scores, key=sentence_scores.get, reverse=True)[
        :top_n
    ]
    return top_sentences


texts = articles.loc[[9, 8, 5]]["text"]
word_freq = word_frequencies(texts)
filtered_words = filter_frequencies(word_freq, min_prop=0.1, max_prop=0.9)

In [17]:
# Top 1
for text in texts:
    summary = summarize_text(text, filtered_words, top_n=1)
    print("Original Text:", text)
    print("Summarized Text:", summary)
    print()

Original Text: The economy of the USA grew quickly in the 1920s. This growth in the 1920s had a positive effect on the USA in later years. The country's main commercial rivals had been devastated by WWI which was fought almost entirely in Europe. Germany, Britain, France, and the Low Countries were all economically weakened. Natural resources and a growing population The USA has many natural resources. For example, there is oil in Texas, coal in the Appalachian Mountains, and ranching in the Midwest and Great Plains. This meant there was no need to buy natural resources from other countries. Moreover the population of the USA was growing fast, so making a market for natural resources inside the USA. Below is a table in which the population growth in millions is shown in the 40 years after the 1920s: Below is a table which shows the urban growth in the six largest cities by population, in 1870 and 1910: The First World War During the First World War, the USA was selling metal to Britain

In [18]:
# Top 2
for text in texts:
    summary = summarize_text(text, filtered_words, top_n=2)
    print("Original Text:", text)
    print("Summarized Text:", summary)
    print()

Original Text: The economy of the USA grew quickly in the 1920s. This growth in the 1920s had a positive effect on the USA in later years. The country's main commercial rivals had been devastated by WWI which was fought almost entirely in Europe. Germany, Britain, France, and the Low Countries were all economically weakened. Natural resources and a growing population The USA has many natural resources. For example, there is oil in Texas, coal in the Appalachian Mountains, and ranching in the Midwest and Great Plains. This meant there was no need to buy natural resources from other countries. Moreover the population of the USA was growing fast, so making a market for natural resources inside the USA. Below is a table in which the population growth in millions is shown in the 40 years after the 1920s: Below is a table which shows the urban growth in the six largest cities by population, in 1870 and 1910: The First World War During the First World War, the USA was selling metal to Britain

In [20]:
# Top 3
for text in texts:
    summary = summarize_text(text, filtered_words, top_n=3)
    print("Original Text:", text)
    print("Summarized Text:", summary)
    print()

Original Text: The economy of the USA grew quickly in the 1920s. This growth in the 1920s had a positive effect on the USA in later years. The country's main commercial rivals had been devastated by WWI which was fought almost entirely in Europe. Germany, Britain, France, and the Low Countries were all economically weakened. Natural resources and a growing population The USA has many natural resources. For example, there is oil in Texas, coal in the Appalachian Mountains, and ranching in the Midwest and Great Plains. This meant there was no need to buy natural resources from other countries. Moreover the population of the USA was growing fast, so making a market for natural resources inside the USA. Below is a table in which the population growth in millions is shown in the 40 years after the 1920s: Below is a table which shows the urban growth in the six largest cities by population, in 1870 and 1910: The First World War During the First World War, the USA was selling metal to Britain