# Python NLP Text Model for Extracting Key Sentences Using TF-IDF Algorithm

### Importing necessary libraries

In [244]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.translate.bleu_score import sentence_bleu

In [245]:
# Downloading necessary resources for NLTK
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Dataset Reading 

In [246]:
# Reading the dataset into df dataframe
df = pd.read_csv('news.csv')
df.head()

Unnamed: 0,title,content,published_at,source,topic
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2022-07-30T07:00:00Z,2,13
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2022-12-22T15:57:55Z,2,13
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",2022-08-16T07:00:00Z,1,7
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,2022-12-18T13:08:40Z,1,7
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,2022-11-21T08:00:00Z,1,8


### Data Cleaning

In [247]:
# Dropping any rows with empty content
df.dropna(subset=['content'], inplace=True)

# Dropping duplicate entries
df.drop_duplicates(subset=['content'], inplace=True)

# Resetting index of the dataframe
df.reset_index(inplace=True, drop=True)

df.head()

Unnamed: 0,title,content,published_at,source,topic
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2022-07-30T07:00:00Z,2,13
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2022-12-22T15:57:55Z,2,13
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",2022-08-16T07:00:00Z,1,7
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,2022-12-18T13:08:40Z,1,7
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,2022-11-21T08:00:00Z,1,8


### Info about news.csv dataset

In [248]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 806 entries, 0 to 805
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   title         806 non-null    object
 1   content       806 non-null    object
 2   published_at  806 non-null    object
 3   source        806 non-null    int64 
 4   topic         806 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 31.6+ KB


In [249]:
df.shape

(806, 5)

### Data Preprocessing

In [250]:
# Creating a set of stop words to remove from the text
stop_words = set(stopwords.words('english'))

# Creating instances of the WordNetLemmatizer and PorterStemmer() for lemmatization and stemming
lemmatizer = WordNetLemmatizer()
stemmer=PorterStemmer()

# Function to preprocess text
def preprocess_sent(sent):
    sent = re.sub('[^a-zA-Z]',' ',sent) # Removing special characters and digits

    sent = re.sub('\s+', ' ', sent).strip() # Removing extra whitespace

    sent = sent.lower() # Converting to lowercase

    # Tokenizing the text into words
    words = word_tokenize(sent)

    # Removing stop words from the text
    words = [word for word in words if word not in stop_words]

    # Stemming
    words = [stemmer.stem(word) for word in words]

    # Lemmatization
    words = [lemmatizer.lemmatize(word) for word in words]

    sent = ' '.join(words)
    
    return sent

# Applying the preprocessing function to the 'content' column of the dataframe
df['cleaned_content'] = df['content'].apply(preprocess_sent)

# Displaying preprocessed dataframe
df.head()


Unnamed: 0,title,content,published_at,source,topic,cleaned_content
0,BTS: RM is reminded of Bon Voyage as he travel...,"After reaching his hotel in the city, RM revea...",2022-07-30T07:00:00Z,2,13,reach hotel citi rm reveal stay would four day...
1,RM recalls wondering if he 'made right decisio...,RM aka Kim Namjoon was the first member to joi...,2022-12-22T15:57:55Z,2,13,rm aka kim namjoon first member join bt group ...
2,BTS: J-Hope and RM go bonkers at Billie Eilish...,"Billie Eilish's concert was held in Seoul, Sou...",2022-08-16T07:00:00Z,1,7,billi eilish concert held seoul south korea at...
3,"BTS: J-Hope proudly states he raised Jungkook,...",BTS ARMY y'all would be missing the members a ...,2022-12-18T13:08:40Z,1,7,bt armi would miss member lot right well one b...
4,BTS: Jin aka Kim Seokjin takes us through the ...,BTS member Kim Seokjin aka Jin has the capacit...,2022-11-21T08:00:00Z,1,8,bt member kim seokjin aka jin capac creat dive...


### Training and testing model distribution of dataset

### (test-train split of 10-90)

In [251]:
from sklearn.model_selection import train_test_split

# Spliting the data into train and test sets
# test_size=0.1 (10), train_size=0.9 (90)
train_data, test_data = train_test_split(df, test_size=0.1, random_state=23)

print('Number of rows in the test set:', len(test_data))
print('Number of rows in the train set:', len(train_data))

Number of rows in the test set: 81
Number of rows in the train set: 725


### Test Data

In [252]:
test_data.head()

Unnamed: 0,title,content,published_at,source,topic,cleaned_content
419,"The Godfather turns 50, The Offer trailer impr...",The world of Hollywood has been very exciting ...,2022-03-24T07:00:00Z,1,7,world hollywood excit today bring wrap trend h...
203,"BTS' V, his Wooga Squad friends take walk on b...",The video started with a view of the sea durin...,2022-07-09T07:00:00Z,2,14,video start view sea winter second long clip h...
478,BTS' Jungkook lashes out at a troll who asked ...,BTS Jungkook gave a hater the taste of his own...,2022-03-31T07:00:00Z,1,10,bt jungkook gave hater tast medicin onlin user...
112,BTS: Jungkook makes STAGGERING record on Spoti...,BTS' Golden Maknae Jungkook has has quite the ...,2022-12-29T10:06:31Z,1,1,bt golden makna jungkook quit year solo stay a...
576,NCT 127 announces repackaged album Ay-Yo: Here...,SM Entertainment recently confirmed the comeba...,2023-01-06T14:06:59Z,3,9,sm entertain recent confirm comeback date nct ...


### Train Data

In [253]:
train_data.head()

Unnamed: 0,title,content,published_at,source,topic,cleaned_content
359,BTS features on KBC 14: Amitabh Bachchan asks ...,"The options were--A. South Korea, B. Iran, C. ...",2022-10-13T07:00:00Z,2,11,option south korea b iran c sri lanka mongolia...
375,"BTS’ Jungkook, ASTRO’s Cha Eun Woo, GOT7’s Yug...",When it comes to becoming an avid listener of ...,2022-06-20T07:00:00Z,3,16,come becom avid listen k pop term inadvert end...
552,"Suchwita Ep 2 Highlights: BTS’ SUGA, Shin Dong...",BTS member SUGA has returned to our screens wi...,2023-01-05T17:28:13Z,3,11,bt member suga return screen fun insight chat ...
226,"BTS Proof Live: Snoop Dogg, Charlie Puth, HER,...",BTS' fame is such that there is no doubt that ...,2022-06-07T07:00:00Z,1,1,bt fame doubt k pop band far one popular music...
676,Pathaan meets BTS: Shah Rukh Khan-Deepika Padu...,BTS and their choreographies are like sorcery....,2023-01-09T11:31:02Z,1,1,bt choreographi like sorceri fit everi bollywo...


### List of indexes of train data

In [254]:
# Making list of indexes of train data
tdlist=list(train_data.index.values)
print(tdlist)

[359, 375, 552, 226, 676, 463, 725, 229, 522, 151, 270, 204, 404, 759, 589, 50, 562, 21, 597, 581, 300, 263, 25, 506, 344, 554, 346, 449, 261, 390, 765, 651, 63, 457, 79, 586, 357, 65, 234, 604, 668, 681, 364, 677, 746, 99, 45, 727, 366, 330, 755, 475, 734, 428, 379, 545, 652, 251, 127, 448, 118, 66, 343, 670, 174, 264, 420, 674, 721, 493, 396, 415, 91, 114, 569, 146, 213, 184, 412, 275, 166, 688, 394, 96, 155, 73, 563, 631, 500, 491, 431, 433, 730, 86, 140, 53, 188, 530, 51, 508, 572, 559, 584, 130, 666, 314, 673, 387, 301, 14, 542, 381, 208, 802, 273, 611, 427, 176, 293, 778, 743, 435, 131, 502, 489, 771, 793, 148, 152, 124, 64, 58, 775, 133, 260, 373, 659, 398, 319, 372, 48, 633, 682, 536, 752, 115, 209, 701, 769, 202, 305, 621, 90, 655, 794, 481, 568, 525, 335, 424, 556, 307, 30, 135, 341, 707, 183, 236, 165, 280, 766, 271, 787, 656, 598, 362, 639, 411, 302, 689, 782, 512, 13, 385, 799, 159, 349, 649, 409, 620, 313, 222, 729, 780, 624, 46, 164, 436, 326, 675, 378, 16, 704, 474, 89,

### Vectorization using TF-IDF (Term Frequency-Inverse Document Frequency)

In [255]:
# Importing TfidfVectorizer from sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Creating instance of TfidfVectorizer class
vectorizer = TfidfVectorizer()

# Vectorizing cleaned content of train data using fit_transform()
train_tfidf = vectorizer.fit_transform(train_data['cleaned_content'])

### Ranking of Sentences in Train Data using TF-IDF scores

In [256]:
from collections import defaultdict
top_n=5    # Top 5 sentences

# Ranking the sentences in each news article based on their TF-IDF scores
train_tfidf_ranked_indices = defaultdict(list)

# Traversing through each news article in train data
for i in range(train_tfidf.shape[0]):

    # Retrieving the non-zero feature indices (i.e., the indices of the words that appear in the article) from the corresponding row of the train_tfidf matrix using the nonzero() method. 
    feature_index = train_tfidf[i,:].nonzero()[1]

    # Creating a list of tuples where the first element of each tuple is a feature index and the second element is the corresponding TF-IDF score for that feature index in the current article.
    # Using zip()
    tfidf_scores = zip(feature_index, [train_tfidf[i, x] for x in feature_index])

    # Sorting tfidf_scores list in decreasing order
    sorted_tfidf_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

    # Retrieving the first element of first n tuples in sorted list
    top_n_indices = [sorted_tfidf_scores[i][0] for i in range(top_n)]

    # Adding to train_tfidf_ranked_indices
    train_tfidf_ranked_indices[i] = top_n_indices
    
print("train tfidf ranked indices= ")
train_tfidf_ranked_indices

train tfidf ranked indices= 


defaultdict(list,
            {0: [322, 1630, 5561, 3906, 5515],
             1: [4650, 5942, 7803, 3821, 8056],
             2: [6626, 4742, 8830, 7541, 2252],
             3: [3326, 1019, 7301, 2730, 2208],
             4: [5722, 876, 725, 6277, 6637],
             5: [3910, 1485, 5258, 6008, 4381],
             6: [1763, 4182, 3664, 8515, 6650],
             7: [4381, 5838, 4119, 6246, 7678],
             8: [6210, 1515, 806, 6823, 6459],
             9: [2978, 604, 2930, 6886, 931],
             10: [8663, 2649, 2867, 7854, 6618],
             11: [4691, 6740, 833, 6672, 2836],
             12: [4692, 6111, 1063, 1019, 4906],
             13: [8418, 3664, 2938, 4809, 1448],
             14: [7086, 4201, 1093, 8714, 6561],
             15: [790, 1112, 6059, 3152, 4165],
             16: [7678, 5844, 4999, 1697, 5531],
             17: [4264, 3428, 790, 7291, 3664],
             18: [2892, 5181, 4575, 1662, 193],
             19: [4505, 4129, 2516, 3904, 5088],
             20: [769,

### Function to get the most important sentences in a news article based on their TF-IDF scores

In [257]:
# Function to get the most important sentences in a news article based on their TF-IDF scores
def get_sentences_by_indices(text, indices):
    sentences = text.split('. ')
    selected_sentences = []
    for j in indices:
        if j < len(sentences):
            selected_sentences.append(sentences[j])
    return '. '.join(selected_sentences)

### Generating summary of train data

In [258]:
# Generating summary of most important sentences from each news article based on their TF-IDF scores
train_summary = []

# Iterating over each document in the training data
for i in range(train_data.shape[0]):

    # Checking if the document index is in the list of indices with the highest TF-IDF scores
    if i in train_tfidf_ranked_indices:
        summary_indices = train_tfidf_ranked_indices[i]
        if(i in tdlist):
          summary = get_sentences_by_indices(train_data['content'][i], summary_indices)
          train_summary.append(summary)
    else:
        train_summary.append('')

### Test Data

In [259]:
test_data

Unnamed: 0,title,content,published_at,source,topic,cleaned_content
419,"The Godfather turns 50, The Offer trailer impr...",The world of Hollywood has been very exciting ...,2022-03-24T07:00:00Z,1,7,world hollywood excit today bring wrap trend h...
203,"BTS' V, his Wooga Squad friends take walk on b...",The video started with a view of the sea durin...,2022-07-09T07:00:00Z,2,14,video start view sea winter second long clip h...
478,BTS' Jungkook lashes out at a troll who asked ...,BTS Jungkook gave a hater the taste of his own...,2022-03-31T07:00:00Z,1,10,bt jungkook gave hater tast medicin onlin user...
112,BTS: Jungkook makes STAGGERING record on Spoti...,BTS' Golden Maknae Jungkook has has quite the ...,2022-12-29T10:06:31Z,1,1,bt golden makna jungkook quit year solo stay a...
576,NCT 127 announces repackaged album Ay-Yo: Here...,SM Entertainment recently confirmed the comeba...,2023-01-06T14:06:59Z,3,9,sm entertain recent confirm comeback date nct ...
...,...,...,...,...,...,...
526,Krackdown: Bicycle by BTS’ RM on World Bicycle...,BTS’ RM has time and again shown his love for ...,2022-06-03T07:00:00Z,3,13,bt rm time shown love ride bicycl much went ah...
107,BTS: SUGA aka Min Yoongi reveals how RM reacte...,BTS members RM aka Kim Namjoon and SUGA had th...,2022-12-05T08:00:00Z,1,11,bt member rm aka kim namjoon suga telecast suc...
442,BTS x KGF 2 fan combines the Bangtan Boys with...,BTS fans in India are next level pros when it ...,2022-04-19T07:00:00Z,1,1,bt fan india next level pro come edit yash kgf...
147,BTS' V reacts as Choi Woo-shik asks 'how do yo...,The new clip started with Peakboy brewing coff...,2022-07-26T07:00:00Z,2,14,new clip start peakboy brew coffe everyon earl...


### List of indexes of test data

In [260]:
tslist=list(test_data.index.values)
print(tslist)

[419, 203, 478, 112, 576, 541, 503, 228, 138, 268, 201, 692, 117, 154, 134, 517, 615, 392, 423, 410, 161, 763, 207, 157, 276, 197, 696, 564, 408, 257, 583, 248, 74, 434, 454, 253, 158, 324, 284, 661, 422, 640, 95, 430, 309, 19, 371, 24, 334, 196, 690, 748, 671, 195, 167, 521, 497, 663, 178, 710, 461, 368, 139, 211, 59, 22, 26, 654, 29, 143, 401, 93, 580, 680, 738, 153, 526, 107, 442, 147, 329]


### Vectorization of Test Data using TF-IDF

In [261]:
tfidf_vectorizer = TfidfVectorizer(min_df=1,max_df=5)

# Vectorizing cleaned content of test data using fit_transform()
test_tfidf = vectorizer.fit_transform(test_data['cleaned_content'])

### Generation of new content in Test Data 
### (Removal of insignificant words)

In [262]:
# Creating final result dataframe
# Columns would be Original Content, New Content, Removed Lines, BLEU Score and Cosine Similarity
result = pd.DataFrame(columns=['Original Content', 'New Content', 'Removed Lines','BLEU Score', 'Cosine Similarity'])

# Iterating through test data
for i in tslist:
  original_content = test_data['content'][i]
  copy_original=original_content

  # Lowercase all words
  original_content = original_content.lower()

  # Tokenizing words
  words = word_tokenize(original_content)

  # Removing stopwords
  stop_words = set(stopwords.words('english'))
  filtered_words = [word for word in words if not word in stop_words]

  # Rejoining filtered words into sentences
  filtered_sentences = []
  for sentence in sent_tokenize(original_content):
      sentence_words = word_tokenize(sentence.lower())
      filtered_sentence_words = [word for word in sentence_words if not word in stop_words]
      filtered_sentence = ' '.join(filtered_sentence_words)
      filtered_sentences.append(filtered_sentence)

  # Using TfidfVectorizer to get important words and scores
  if len(filtered_sentences) > 1:
      tfidf_vectorizer = TfidfVectorizer(use_idf=True)
      tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(filtered_sentences)

      # Getting sum of tf-idf scores for each sentence
      sums = tfidf_vectorizer_vectors.sum(axis=1)

      # Getting the sentence with the highest score
      max_score = max(sums)
      max_score_index = sums.tolist().index(max_score)

      # Geting the top 10 sentences with highest score
      ranked_sentences = sorted(((sums[i],s) for i,s in enumerate(filtered_sentences)), reverse=True)
      top_sentences = [ranked_sentences[i][1] for i in range(min(10, len(ranked_sentences)))]

      # Joining the top sentences to create the new content
      new_content = " ".join(top_sentences)
  else:
      new_content = filtered_sentences[0]

  # print("\nOriginal Content:\n", original_content)
  # print("\nNew Content:\n", new_content)

  # Removed lines
  removed_lines = original_content.replace(new_content, '')

  result.at[i,'Original Content']= copy_original
  result.at[i,'New Content']= new_content
  result.at[i,'Removed Lines']= removed_lines

## Further Metrics

### BLEU Score Calculation

In [263]:
# Importing sentence_bleu from NLTK
from nltk.translate.bleu_score import sentence_bleu

# BLEU Score calculation
for i in tslist:
  reference = [test_data['content'][i].split()]
  candidate = result['New Content'][i].split()
  weights = (1.0/1.0,)
  bleu_score = sentence_bleu(reference, candidate, weights)
  result.at[i,'BLEU Score']= bleu_score

### Cosine Similarity Calculation using TF-IDF

In [264]:
# Cosine Similarity calculation
for i in tslist:
  tfidf_vectorizer = TfidfVectorizer()
  tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform([test_data['content'][i], result['New Content'][i]])
  cosine_similarity = ((tfidf_vectorizer_vectors * tfidf_vectorizer_vectors.T).A)[0,1]
  result.at[i,'Cosine Similarity']= cosine_similarity

### Result Dataframe

In [265]:
result.head()

Unnamed: 0,Original Content,New Content,Removed Lines,BLEU Score,Cosine Similarity
419,The world of Hollywood has been very exciting ...,j-hope aka jung hoseok tests positive covid-19...,the world of hollywood has been very exciting ...,0.270011,0.349926
203,The video started with a view of the sea durin...,"30-second long clip , hyung-sik seen running t...",the video started with a view of the sea durin...,0.136006,0.263688
478,BTS Jungkook gave a hater the taste of his own...,"currently quarantine us recently , k-pop super...",bts jungkook gave a hater the taste of his own...,0.235473,0.404352
112,BTS' Golden Maknae Jungkook has has quite the ...,"well , army cloud nine jungkook two songs top ...",bts' golden maknae jungkook has has quite the ...,0.178008,0.411889
576,SM Entertainment recently confirmed the comeba...,sm entertainment recently confirmed comeback d...,sm entertainment recently confirmed the comeba...,0.404279,0.456806
