In [1]:
import numpy as np
import pandas as pd
import warnings
import re
import nltk
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import string
from string import punctuation
from nltk.corpus import stopwords
from heapq import nlargest
from typing import List,Dict
stop_words = set(stopwords.words('english'))
punctuation = punctuation + '\n' + '—' + '“' + ',' + '”' + '‘' + '-' + '’'
warnings.filterwarnings('ignore')

## Load the data and get the articles 

In [138]:
df = pd.read_csv("./news.csv")
df.dropna(axis=0,inplace=True)
articles = df["content"]
articles.head()

0    After reaching his hotel in the city, RM revea...
1    RM aka Kim Namjoon was the first member to joi...
2    Billie Eilish's concert was held in Seoul, Sou...
3    BTS ARMY y'all would be missing the members a ...
4    BTS member Kim Seokjin aka Jin has the capacit...
Name: content, dtype: object

## Preprocess the articles by removing unnecessary words and punctuation

In [93]:
def prepocessing(articles):
    """
    This function takes in a pandas Series of articles as input and performs several preprocessing steps to clean and prepare the articles for further analysis.
    The steps include:
    
     1.   Converting all the articles to lowercase.
     2.   Removing URLs from the articles using regular expression.
     3.   Saving a copy of the original articles for sentence tokenization.
     4.   Removing trailing whitespaces from the articles.
     5.   Removing punctuations from the articles.
     6.  Removing stopwords from the articles.
    
    The function returns a pandas Series of preprocessed articles.
    
    Parameters:
    articles : pandas Series
    A series of articles in string format
    
    Returns:
    pandas Series
    A series of preprocessed articles
    """
    global article_original
    
    #Converting all the artical in lower case
    articles = articles.str.lower()
    
    # remove url
    url_pattern = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    articles = articles.apply(lambda x : re.sub(url_pattern,"" ,x))
    
    articles = articles.apply(lambda x : re.sub(r'\n+',". ",x))
    
    # Removing the '\xa0'
    articles = articles.apply(lambda x : re.sub(r'\xa0',' ', x))
    
    #saving articles (pandas Series) for sentence tokenization
    article_original = articles.copy()
    
    #removing trailing whitespaces
    articles = articles.apply(lambda x : re.sub(r' +',' ', x))
    
    
    #removing punctuations
    articles = articles.apply(lambda x : ''.join(char for char in x if char not in punctuation))
    
    #removing trailing whitespaces again
    articles = articles.apply(lambda x : re.sub(r' +',' ', x))
    
    #removing the stopword
    articles = articles.apply(lambda x : ' '.join(word for word in x.split() if word not in stop_words))
    
    return articles


## Creating word frequency for every article

In [94]:
def normalize_frequency(article_li: List[Dict[str, float]]) -> List[Dict[str, float]]:
    """
    Parameters:
    article_li (List[Dict[str, int]]): list of dictionaries, where each dictionary contains a word and its corresponding frequency in an article.
    
    Returns:
    article_li (List[Dict[str, float]]): list of dictionaries, where each dictionary contains a word and its corresponding normalized frequency in an article.
    
    This function takes in a list of dictionaries where each dictionary contains a word and its corresponding frequency in an article. 
    The function then iterates over each dictionary, finds the maximum frequency, 
    and normalizes all the other frequencies by dividing them by the maximum frequency.
    The function returns the list of normalized dictionaries.
    """
    for word_freq in article_li:
        maxi = max(word_freq.values())
        for key in word_freq:
            word_freq[key] = word_freq[key]/maxi
    return article_li

def word_frequency(articles: pd.Series) -> List[Dict[str, float]]:
    """
    Parameters:
    articles (pd.Series): Pandas series of articles.
    
    Returns:
    norm_freq (List[Dict[str, float]]): list of dictionaries, where each dictionary contains a word and its corresponding normalized frequency in an article.
    
    This function takes in a list of articles, tokenizes the words in each article, 
    creates a dictionary for each article containing the words as keys and their frequencies as values, 
    and appends these dictionaries to the article_li list. 
    The function then calls the normalize_frequency function to normalize the word frequencies and returns the list of normalized dictionaries.
    
    """
    article_li = []
    for article in articles:
        word_token = word_tokenize(article)
        word_freq ={}
        for word in word_token:
            word_freq[word] = word_freq.get(word,0)+1
        article_li.append(word_freq)
    norm_freq = normalize_frequency(article_li)
    return norm_freq

    

## Score the sentences in article based on mormalized word frequency for every article

In [95]:
def sentence_score(norm_freq: List[Dict[str,int]]) -> List[Dict[str,int]]:
    """
    Parameter:
    norm_freq : list of dict
    each dict contains word as key and their normalized frequency as value
    Result:
    score_li : list of dict
    each dict contains sentence as key and their score as value

    This function takes a list of normalized word frequencies and iterates over original articles and their corresponding word frequencies. 
    It tokenizes sentences and scores them based on the frequency of words present in them, using the provided normalized word frequencies. 
    The scored sentences are then returned as a list of dictionaries, where the key is the sentence and the value is the score.
    """
    score_li = []
    for article,word_freq in zip(article_original,norm_freq):
        sentence_token = sent_tokenize(article)
        article_score = {}
        
        # clean all the sentences again
        for sentence in sentence_token:
            token = "".join(char for char in sentence if char not in punctuation)
            token = re.sub(r' +',' ',token)
            #break all the token in word for scoring sentences
            for word in word_tokenize(token):
                article_score[sentence] = article_score.get(sentence,0) + word_freq.get(word,0)
        score_li.append(article_score)
    return score_li

## summarize the article based on sentence score for each article

In [146]:
def summary(score_li : List[Dict[str,int]]) -> List[str]:
    '''
    Parameters:
    score_li (List[Dict]): List of dictionaries, where each dictionary contains a sentence and its corresponding score.
    
    Returns:
    List[str]: List of summaries where each summary is a string
    
    This function takes in a list of dictionaries, where each dictionary contains a sentence and its corresponding score. 
    The function then iterates over each dictionary, calculates the summary length (25% of the total sentences), 
    finds the top sentences (based on their scores) using the nlargest() function, and 
    joins them together using a white space. 
    The final summary of each article is added to the summary_li list and the function returns this list.
    '''
    summary_li = []  #initialize an empty list to store summary sentences
    for article_score in score_li:
        
        #calculating the length of summary, assuming 25% of original text as summary
        summuary_length = 1 if int(len(article_score) *0.25) <= 1 else int(len(article_score) *0.25) 

        #getting top sentences from the article_score dictionary
        top_sentences = nlargest(summuary_length, article_score, key = article_score.get) 
        
        #joining the top sentences with a dot and append to summary_li
        summary_li.append(' '.join(top_sentences)) 
    
    return summary_li  #returning the final summary_li


In [130]:
def summarize(articles):
    
    #clean the article 
    articles = prepocessing(articles)
    
    #create normalized word_frequency
    norm_frequency = word_frequency(articles)
    
    #create score for every article
    score_li = sentence_score(norm_freq=norm_frequency)
    
    #create list of summary for every article
    summary_li = summary(score_li)
    
    return summary_li

## Spliting test data

In [145]:
def test_train_split(articles,size=0.1):
    size = int(size*articles.shape[0])
    np.random.seed(42)
    idx = np.random.choice(articles.index,size=size)
    return articles[idx]
test_data = test_train_split(df["content"])

In [154]:
summary_li = summarize(test_data)
final = pd.DataFrame({"Original Content":test_data,"New Content":summary_li})
final.head()

Unnamed: 0,Original Content,New Content
102,"BTS members - Taehyung, Suga, RM, Jin, J-Hope,...",there are also rumours of taehyung and suga fl...
439,BTS member J-Hope unveiled his personalised me...,bts member j-hope unveiled his personalised me...
273,"In the over 12 minute-long video, several old ...","""we all thought workout with bts would be bts ..."
106,BTS ARMY is one helluva massive ARMY with a wr...,and we came across some bts armys going crazy ...
71,"The teaser, titled ""Run BTS! 2022 Special Epis...","2022 special episode - telepathy part 0,"" star..."


## finding which sentences are removed from summary

In [155]:
def sent_remove():
    remove_list = []
    for article,summary in zip(final["Original Content"],final["New Content"]):
        art_sent  = set(sent.lower() for sent in sent_tokenize(article))
        summ_sent = set(sent_tokenize(summary))
        art_sent.difference_update(summ_sent)   
        remove_list.append(" ".join(art_sent))
    return remove_list
final["Removed Lines"] =sent_remove()

In [156]:
final.head()

Unnamed: 0,Original Content,New Content,Removed Lines
102,"BTS members - Taehyung, Suga, RM, Jin, J-Hope,...",there are also rumours of taehyung and suga fl...,both suga and taehyung are seen feeding each o...
439,BTS member J-Hope unveiled his personalised me...,bts member j-hope unveiled his personalised me...,"""there’s never a dull moment with vmin,"" said ..."
273,"In the over 12 minute-long video, several old ...","""we all thought workout with bts would be bts ...",bts recently wrapped their four concerts in la...
106,BTS ARMY is one helluva massive ARMY with a wr...,and we came across some bts armys going crazy ...,"in mid-october, big hit music announced that t..."
71,"The teaser, titled ""Run BTS! 2022 Special Epis...","2022 special episode - telepathy part 0,"" star...","j-hope added to this, ""we've been together for..."


In [151]:
from rouge import Rouge
import sys
sys.setrecursionlimit(10000)

rouge = Rouge()

def metric(row):
    scores = rouge.get_scores(row["New Content"], row["Original Content"])
    return scores[0]["rouge-1"]["f"]


ROUGE-1:  0.42424242001836554
ROUGE-2:  0.1621621584806429
ROUGE-L:  0.42424242001836554


In [157]:
final["ROUGE-1"] = final.apply(metric,axis=1)

In [162]:
final.rename_axis("Index", axis=0, inplace=True)
final.head()

Unnamed: 0_level_0,Original Content,New Content,Removed Lines,ROUGE-1
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
102,"BTS members - Taehyung, Suga, RM, Jin, J-Hope,...",there are also rumours of taehyung and suga fl...,both suga and taehyung are seen feeding each o...,0.448378
439,BTS member J-Hope unveiled his personalised me...,bts member j-hope unveiled his personalised me...,"""there’s never a dull moment with vmin,"" said ...",0.784029
273,"In the over 12 minute-long video, several old ...","""we all thought workout with bts would be bts ...",bts recently wrapped their four concerts in la...,0.505576
106,BTS ARMY is one helluva massive ARMY with a wr...,and we came across some bts armys going crazy ...,"in mid-october, big hit music announced that t...",0.522822
71,"The teaser, titled ""Run BTS! 2022 Special Epis...","2022 special episode - telepathy part 0,"" star...","j-hope added to this, ""we've been together for...",0.549828


In [163]:
final.to_csv("./result.csv")