In [1]:
import pandas as pd
import numpy as np
import nltk
import string
import math
import contractions
import re

import tensorflow as tf
from datasets import load_dataset
from sklearn.model_selection import train_test_split

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset('cnn_dailymail', '3.0.0', split='train[:5%]')
dataset_test = load_dataset('cnn_dailymail', '3.0.0', split='test[0:10]')
dataset = dataset.train_test_split(test_size=0.2)

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11484
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 2872
    })
})

In [4]:
df_train, df_val = dataset['train'], dataset['test']

In [5]:
def data_preprocessing(df):
    # clean punctuation using translate method
    # create translation table first to remove punctuation that are included in string.punctuation
    translator = str.maketrans('','',string.punctuation)
    for col in df.columns:
        df[col] = df[col].astype(str)
        df[col] = df[col].apply(lambda x : x.translate(translator))
        
    # remove numbers
    translator_numbers = str.maketrans('', '', string.digits)
    for col in df.columns:
        df[col] = df[col].apply(lambda x : x.translate(translator_numbers) )
    
    # lowercase string using .lower()
    for col in df.columns:
        df[col] = df[col].apply(lambda x : x.lower())
    
    # Contractions
    for col in df.columns:
        df[col] = df[col].apply(lambda x : contractions.fix(x))
    
    # remove stopwords, must be lowercase
    # create stopwords variable that will hold our reference to all stopwords. create lambda function for 1st column to join
    # the sentence if word is not in stopwords. If it is in stopwords, the word gets ignored and not joined in final sentence
    stopwords = nltk.corpus.stopwords.words('english')
    for col in df.columns:
        df[col] = df[col].apply(lambda x : ' '.join([word for word in x.split() if word not in stopwords]))
    
    # remove special characters
    for col in df.columns:
        df[col] = df[col].apply(lambda x : re.sub(r'[^a-zA-Z0-9]', ' ', x))
    
    #tokenize sentences using nltk.word_tokenize
    for col in df.columns:
        df[col] = df[col].apply(lambda x : nltk.word_tokenize(x))
    
    return df

In [6]:
df_train_pd = pd.DataFrame(df_train)
df_train_pd = df_train_pd.drop("id", axis=1)
(df_train_pd)

Unnamed: 0,article,highlights
0,"LOS ANGELES, California (CNN) -- The Transport...",NEW: TSA backs officers who made passenger rem...
1,Washington (CNN) -- The inspector general at t...,NEW: Airports must provide more worker informa...
2,(CNN) -- The Ivory Coast have reportedly sacke...,Ivory Coast have reportedly sacked coach Vahid...
3,(CNN) -- Israeli coach Avram Grant has been ap...,Avram Grant in the new manager of West Ham Uni...
4,(CNN) -- A British man convicted of having sex...,"Michelle Palmer and Vincent Acors, faced three..."
...,...,...
11479,NEW YORK (CNN) -- Legendary football coach and...,Legendary football coach and broadcaster John ...
11480,(CNN) -- The oil spill on the Gulf Coast has s...,"All of Florida's beaches are open, including t..."
11481,"Atlanta, Georgia (CNN) -- Walking through the ...","""Akira"" ushered in a new wave of sophisticated..."
11482,New York (CNN) -- Today's bipartisan health ca...,John Avlon says health care summit comes amid ...


In [28]:
print(df_train_pd.loc[[0]])

                                             article  \
0  [los, angeles, california, cnn, transportation...   

                                          highlights  
0  [new, tsa, backs, officers, made, passenger, r...  


In [26]:
print(df_train_pd.loc[[1]])

                                             article  \
1  [washington, cnn, inspector, general, departme...   

                                          highlights  
1  [new, airports, must, provide, worker, informa...  


In [7]:
df_train_pre = data_preprocessing(df_train_pd)

In [8]:
df_train_pre

Unnamed: 0,article,highlights
0,"[los, angeles, california, cnn, transportation...","[new, tsa, backs, officers, made, passenger, r..."
1,"[washington, cnn, inspector, general, departme...","[new, airports, must, provide, worker, informa..."
2,"[cnn, ivory, coast, reportedly, sacked, coach,...","[ivory, coast, reportedly, sacked, coach, vahi..."
3,"[cnn, israeli, coach, avram, grant, appointed,...","[avram, grant, new, manager, west, ham, united..."
4,"[cnn, british, man, convicted, sex, dubai, bea...","[michelle, palmer, vincent, acors, faced, thre..."
...,...,...
11479,"[new, york, cnn, legendary, football, coach, b...","[legendary, football, coach, broadcaster, john..."
11480,"[cnn, oil, spill, gulf, coast, states, visitor...","[floridas, beaches, open, including, tourist, ..."
11481,"[atlanta, georgia, cnn, walking, downtown, hot...","[akira, ushered, new, wave, sophisticated, div..."
11482,"[new, york, cnn, todays, bipartisan, health, c...","[john, avlon, says, health, care, summit, come..."


In [9]:
# https://medium.com/analytics-vidhya/simple-text-summarization-using-nltk-eedc36ebaaf8
# https://www.freecodecamp.org/news/what-is-rouge-and-how-it-works-for-evaluation-of-summaries-e059fb8ac840/

In [10]:
def word_frequencies(df, max_freq):
    word_freq = {}
    for review in df.iloc[:,1]:
        for word in review:
            if word not in word_freq.keys():
                word_freq[word] = 1
            else:
                word_freq[word] += 1
    max_freq = max(word_freq.values())
    return word_freq, max_freq

In [11]:
word_freq_raw = {}
word_freq_raw, max_freq = word_frequencies(df_train_pre, 0)

In [12]:
word_freq = {}
for word in word_freq_raw.keys():
    word_freq[word] = word_freq_raw[word]/max_freq

In [13]:
word_freq

{'new': 0.5364722668093455,
 'tsa': 0.004637060816836098,
 'backs': 0.003031924380238987,
 'officers': 0.02800071339397182,
 'made': 0.0629570180131978,
 'passenger': 0.0098091671125379,
 'remove': 0.002675227394328518,
 'nipple': 0.0003566969859104691,
 'rings': 0.0019618334225075798,
 'agency': 0.02603887997146424,
 'acknowledges': 0.0032102728731942215,
 'procedures': 0.004815409309791332,
 'need': 0.02800071339397182,
 'changed': 0.007668985197075085,
 'agent': 0.011949349028000713,
 'using': 0.023363652577135723,
 'handheld': 0.0008917424647761727,
 'metal': 0.0019618334225075798,
 'detector': 0.00017834849295523454,
 'lubbock': 0.0003566969859104691,
 'airport': 0.021401819154628143,
 'found': 0.10647405029427501,
 'piercings': 0.00017834849295523454,
 'woman': 0.05885500267522739,
 'says': 1.0,
 'heard': 0.010700909577314071,
 'male': 0.006598894239343678,
 'agents': 0.010522561084358837,
 'snicker': 0.00017834849295523454,
 'removed': 0.0071339397182093815,
 'airports': 0.00659

In [14]:
# apply dictionary to original dataset X_train and compare it to X_test

# Create a DataFrame from the collected data
df_new = pd.DataFrame(df_train)

In [15]:
for col in df_new.columns:
    df_new[col] = df_new[col].astype(str)
    df_new[col] = df_new[col].apply(lambda x : x.lower())
nltk.download('punkt')
df_new["sentences"] = df_new["article"].apply(nltk.sent_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Gurpreet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
import heapq

def summarize_row(row):
    sent_score = {}
    for sentence in row["sentences"]:
        for word in sentence:
            if word in word_freq.keys():
                if (len(sentence.split(' ')) > 2 and len(sentence.split(' ')) < 40):
                    # do not want the one word sentences like :"by" or "Mr" 
                    # can also add < 30-40 since larger sentences probably have higher score
                    if sentence not in sent_score.keys():
                        sent_score[sentence] = word_freq[word]
                    else:
                        sent_score[sentence] += word_freq[word]
    summary_sentences = heapq.nlargest(5, sent_score, key=sent_score.get)
    return ' '.join(summary_sentences)

df_new["Summarized"] = df_new.apply(summarize_row, axis=1)

In [19]:
from rouge_score import rouge_scorer
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

def rouge_score(row, scorer):
    reference_summary = row[1]
    generated_summary = row["Summarized"]
    scores = scorer.score(reference_summary, generated_summary)
    return scores['rouge1'].fmeasure, scores['rouge2'].fmeasure, scores['rougeL'].fmeasure

df_new["ROUGE1Scores"], df_new["ROUGE2Scores"], df_new["ROUGELScores"] = zip(*df_new.apply(lambda row: rouge_score(row, scorer), axis=1))


  reference_summary = row[1]


In [20]:
ROUGE1_AVG = df_new["ROUGE1Scores"].sum() / df_new["ROUGE1Scores"].count()
ROUGE2_AVG = df_new["ROUGE2Scores"].sum() / df_new["ROUGE2Scores"].count()
ROUGEL_AVG = df_new["ROUGELScores"].sum() / df_new["ROUGELScores"].count()
print("ROUGE1AVG: {:.3f}, ROUGE2AVG: {:.3f}, ROUGELAVG: {:.3f}".format(ROUGE1_AVG, ROUGE2_AVG, ROUGEL_AVG)) 

ROUGE1AVG: 0.246, ROUGE2AVG: 0.085, ROUGELAVG: 0.148
