## Import Libs

In [1]:
from nltk.tokenize import sent_tokenize
import nltk
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.corpus import stopwords
import re

nltk.download('punkt')


df = pd.read_csv(r"Z:\Projects\NLP\ Txt_Summarization\cnn_dailymail\train.csv")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PrideGod\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
df.head(5)
print(df['highlights'])

0         Bishop John Folda, of North Dakota, is taking ...
1         Criminal complaint: Cop used his role to help ...
2         Craig Eccleston-Todd, 27, had drunk at least t...
3         Nina dos Santos says Europe must be ready to a...
4         Fleetwood top of League One after 2-0 win at S...
                                ...                        
287108    Chelsea Clinton said question of running for o...
287109    Vanilla Ice, 47 - real name Robert Van Winkle ...
287110    America's most lethal sniper made comment in i...
287111    A swarm of more than one million has crossed b...
287112    Other 2016 hopefuls maintain that Bush's annou...
Name: highlights, Length: 287113, dtype: object


## Data Preprocessing

In [32]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def preprocess(text):
    
    text = re.sub(r'[^a-zA-Z\s]', '', text, re.I|re.A)
    text = text.lower()
    
    
    tokens = [word for word in text.split() if word not in stop_words]
    return ' '.join(tokens)


df['clean_text'] = df['article'].apply(preprocess)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PrideGod\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Summarize

In [3]:
def summarize(text):
    sentences = sent_tokenize(text)
    
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(sentences)
    
    sentence_scores = np.array(tfidf_matrix.mean(axis=1)).flatten()  
    
    
    top_n = 3  
    top_sentences_idx = np.argsort(sentence_scores)[::-1][:top_n]
    
    
    summary = [sentences[i] for i in top_sentences_idx]
    return ' '.join(summary)


df['summary'] = df['article'].apply(summarize)

print(df['summary'])

0         The bishop of the Fargo Catholic Diocese in No...
1         Court documents released by investigators do n...
2         ‘Mr Eccleston-Todd took the decision to pick u...
3         And such a strategy might also turn out to hav...
4         Bristol City, who beat Nigel Clough’s men on t...
                                ...                        
287108    'If at some point that weren't the case, and I...
287109    The rapper turned renovation show reality star...
287110    Oscar-tipped: Clint Eastwood's (left) movie is...
287111    'Some insects, especially in the larval stage,...
287112    That moderation on immigration reform — and hi...
Name: summary, Length: 287113, dtype: object


In [4]:
print(df[['article', 'summary']].head())

                                             article  \
0  By . Associated Press . PUBLISHED: . 14:11 EST...   
1  (CNN) -- Ralph Mata was an internal affairs li...   
2  A drunk driver who killed a young woman in a h...   
3  (CNN) -- With a breezy sweep of his pen Presid...   
4  Fleetwood are the only team still to have a 10...   

                                             summary  
0  The bishop of the Fargo Catholic Diocese in No...  
1  Court documents released by investigators do n...  
2  ‘Mr Eccleston-Todd took the decision to pick u...  
3  And such a strategy might also turn out to hav...  
4  Bristol City, who beat Nigel Clough’s men on t...  


In [5]:
df[['article', 'summary']].head().to_csv('summary.csv', index=False)