In [1]:
import pandas as pd
import numpy as np
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Resources used below: the Punkt sentence-tokeniser data (for
# nltk.sent_tokenize) and the English stop-word list.
# Recent NLTK releases (3.9+) load the tokeniser data from "punkt_tab", while
# older releases use the pickled "punkt" package, so fetch both to keep the
# notebook runnable on either.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to C:\Users\Deepam
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Deepam
[nltk_data]     Shah\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [3]:
import pandas as pd
import pathlib

# NOTE(review): absolute, machine-specific path. Point this at the local copy
# of the dataset: one sub-folder per category, one .txt file per article.
root_dir = pathlib.Path(r"D:\Deepam\bbc")

data = []

# iterdir() and glob() yield entries in arbitrary, filesystem-dependent order.
# Sorting makes the row order of `df` deterministic, so the seeded
# `.sample(random_state=...)` calls further down select the same article on
# every machine.
for category_dir in sorted(root_dir.iterdir()):
    if category_dir.is_dir():
        # The folder name doubles as the class label.
        label = category_dir.name

        for file_path in sorted(category_dir.glob("*.txt")):
            # latin-1 maps every byte to a character, so decoding cannot fail.
            text = file_path.read_text(encoding="latin-1")
            data.append({"text": text, "labels": label})

# Naming the columns explicitly keeps `df.text` / `df.labels` available even
# when no article files were found.
df = pd.DataFrame(data, columns=["text", "labels"])

In [4]:
# Quick look at the first rows: raw article text plus its category label.
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Draw a single business article; the fixed seed keeps the choice repeatable.
doc = df.loc[df["labels"] == "business", "text"].sample(random_state=42)

In [6]:
def wrap(x):
    """Re-flow ``x`` to textwrap's default line width and return one string.

    Whitespace characters already present in ``x`` (newlines included) are
    kept rather than converted to spaces, and sentence endings are
    normalised so that they are followed by two spaces.
    """
    wrapper = textwrap.TextWrapper(
        replace_whitespace=False,
        fix_sentence_endings=True,
    )
    return wrapper.fill(x)

In [7]:
# Show the sampled article (headline and body), re-flowed for readability.
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [8]:
# Drop the article's first line (its headline) and split the remaining body
# text into sentences.
sents = nltk.sent_tokenize(doc.iloc[0].split("\n", 1)[1])

* Step-by-step Breakdown:

Assume:
```python
doc = pd.Series([
    "140702\nIndia is a beautiful country. It has diverse culture and history."
])
```
So,
```python
doc.iloc[0] == "140702\nIndia is a beautiful country. It has diverse culture and history."
```
🪛 **Part 1: `doc.iloc[0]`**

This accesses the first element (row) of the pandas Series `doc`.
Result:
```python
"140702\nIndia is a beautiful country. It has diverse culture and history."
```
🪛 **Part 2: `.split("\n", 1)`**

This splits the string once, at the first newline character `\n`.
```python
doc.iloc[0].split("\n", 1)
```
Gives:
```python
['140702', 'India is a beautiful country. It has diverse culture and history.']
```
🪛 **Part 3: `[1]`**

This selects the second part of the split, i.e. the actual text content (after the ID or number):
```python
'India is a beautiful country. It has diverse culture and history.'
```
🪛 **Part 4: `nltk.sent_tokenize(...)`**

This breaks the paragraph into individual sentences using NLTK’s sentence tokenizer.
```python
nltk.sent_tokenize("India is a beautiful country. It has diverse culture and history.")
```
Gives:
```python
["India is a beautiful country.", "It has diverse culture and history."]
```
✅ **Final Output:**
```python
sents = ["India is a beautiful country.", "It has diverse culture and history."]
```
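* Purpose of This Line:
    * You're skipping the first line (everything before the first `\n`) — an ID in the example above, the article headline in this notebook — keeping just the body text, and splitting it into sentences for further processing (here, TF-IDF sentence scoring; in other settings, e.g., sentence-level predictions, attention visualization, etc.)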



In [11]:
# TF-IDF vectoriser; below it is fitted on sentences, so each sentence plays
# the role of one "document".
#   stop_words: NLTK's English stop-word list is excluded from the vocabulary.
#   norm='l1':  every row is scaled so that its entries sum to 1.
# NOTE(review): TF-IDF values are non-negative, so for an L1-normalised row the
# mean of its non-zero entries (what get_sentence_score computes) is exactly
# 1 / (number of non-zero entries). The ranking therefore depends only on how
# many distinct non-stop-word terms a sentence has, not on the TF-IDF weights.
# Consider norm=None if the weights themselves should drive the ranking.
featurizer = TfidfVectorizer(
    stop_words=stopwords.words('english'),
    norm='l1',
)

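### What does norm='l1' mean?
* It means L1 normalization (the L1 norm is also called the Manhattan norm or taxicab norm).
* For each document vector (here, each sentence), the sum of the absolute values of all elements = 1.
* TF-IDF values are never negative, so this simply scales each row (i.e. each document) so that all its TF-IDF values sum to 1.
* Consequence for the scoring below: if a row sums to 1 and has $k$ non-zero values, the average of those non-zero values is exactly $1/k$. The sentence score is therefore the inverse of the number of distinct non-stop-word terms in the sentence, so sentences with fewer distinct terms score higher.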

In [16]:
# Learn the vocabulary from this article's sentences and vectorise them:
# one row of X per sentence, one column per vocabulary term.
X = featurizer.fit_transform(sents)

In [17]:
def get_sentence_score(tfidf_row):
    """Score a sentence as the mean of the non-zero entries of its TF-IDF row.

    Parameters
    ----------
    tfidf_row
        One row of the TF-IDF matrix (e.g. ``X[i, :]``). Anything that
        supports ``row[row != 0]`` works, such as a SciPy sparse row or a
        NumPy array.

    Returns
    -------
    float
        Mean of the non-zero values. Returns 0.0 when the row has no
        non-zero entries (e.g. a sentence consisting only of stop words),
        instead of the NaN plus RuntimeWarning that averaging an empty
        selection would produce.

    Notes
    -----
    If the row is L1-normalised with non-negative entries, this mean equals
    ``1 / (number of non-zero entries)``.
    """
    nonzero_values = tfidf_row[tfidf_row != 0]
    if nonzero_values.size == 0:
        # No scorable terms: rank this sentence last with a well-defined score.
        return 0.0
    return float(nonzero_values.mean())

In [19]:
# One score per sentence: row i of X is the TF-IDF vector of sents[i].
scores = np.zeros(len(sents))
for i in range(scores.shape[0]):
    scores[i] = get_sentence_score(X[i, :])

In [20]:
# Sentence indices ordered from highest to lowest score
# (argsort sorts ascending, hence the negation).
sort_idx = np.argsort(-scores)

In [23]:
# Many options for choosing which sentences to include:

# 1) top N sentences
# 2) top N words or characters
# 3) top X% sentences or top X% words
# 4) sentences with scores > average score
# 5) sentences with scores > factor * average score

# You also don't have to present them sorted by score; keeping the original
# document order may read better.

# Here: option 1 with N = 5, printed from highest to lowest score.
print("Generated summary:")
for i in sort_idx[:5]:
    print(wrap("%.2f: %s" % (scores[i], sents[i])))

Generated summary:
0.14: A number of retailers have already reported poor figures for
December.
0.13: However, reports from some High Street retailers highlight the
weakness of the sector.
0.12: The ONS revised the annual 2004 rate of growth down from the
5.9% estimated in November to 3.2%.
0.10: "Our view is the Bank of England will keep its powder dry and
wait to see the big picture."
0.10: And a British Retail Consortium survey found that Christmas 2004
was the worst for 10 years.


* `"%.2f: %s" % (scores[i], sents[i])`

  This is string formatting using the `%` operator.

* `"%.2f"` → formats the float value to 2 decimal places
(e.g., 0.845245 → 0.85)

* `"%s"` → inserts the sentence as-is

* Final result looks like:
```python
"0.85: India has seen economic growth."
```
✅ Equivalent using f-string:
```python
f"{scores[i]:.2f}: {sents[i]}"
```
* `wrap(...)`

  This uses the `textwrap.fill()` function (from the `textwrap` module).

  It wraps long text into multiple lines (70 characters wide by default) so it doesn’t overflow when printing in the terminal.

  Useful when sentences are long.

Example:

Without wrap:
```python
print("0.85: India has seen economic growth. GDP rose steadily in the last 3 quarters across all sectors.")
```
With wrap:
```python
print(wrap("0.85: India has seen economic growth. GDP rose steadily in the last 3 quarters across all sectors."))
```
Output (wrapped neatly; note the two spaces after the sentence end, added by `fix_sentence_endings=True`):
```
0.85: India has seen economic growth.  GDP rose steadily in the last 3
quarters across all sectors.
```

In [24]:
# The article's first line is its headline; compare it with the summary above.
doc.iloc[0].split("\n", 1)[0]

'Christmas sales worst since 1981'

In [27]:
def summarize(text, n_sentences=5):
    """Print an extractive summary of ``text``: its highest-scoring sentences.

    The text is split into sentences, each sentence is vectorised with the
    notebook-level ``featurizer`` (which is re-fitted on this text), scored
    with ``get_sentence_score``, and the best-scoring sentences are printed
    from highest to lowest score, each prefixed by its score to two decimals.

    Parameters
    ----------
    text : str
        The document body to summarise.
    n_sentences : int, default 5
        Maximum number of sentences to print. Values below zero are treated
        as zero.

    Returns
    -------
    None
        The summary is printed, not returned. Nothing is printed when
        ``text`` contains no sentences.
    """
    # extract sentences
    sents = nltk.sent_tokenize(text)
    if not sents:
        # The vectoriser cannot build a vocabulary from zero sentences.
        return

    # perform tf-idf
    X = featurizer.fit_transform(sents)

    # compute scores for each sentence (row i of X belongs to sents[i])
    scores = np.array([get_sentence_score(X[i, :]) for i in range(len(sents))])

    # sentence indices from highest to lowest score
    sort_idx = np.argsort(-scores)

    # print summary
    for i in sort_idx[:max(n_sentences, 0)]:
        print(wrap("%.2f: %s" % (scores[i], sents[i])))

In [28]:
# Same pipeline on an entertainment article: sample one (fixed seed), then
# summarise everything after its first (headline) line.
doc = df.loc[df["labels"] == "entertainment", "text"].sample(random_state=123)
summarize(doc.iloc[0].split("\n", 1)[1])

0.11: The Black Eyed Peas won awards for best R 'n' B video and
sexiest video, both for Hey Mama.
0.10: The ceremony was held at the Luna Park fairground in Sydney
Harbour and was hosted by the Osbourne family.
0.10: The VH1 First Music Award went to Cher honouring her
achievements within the music industry.
0.10: Goodrem, Green Day and the Black Eyed Peas took home two awards
each.
0.10: Other winners included Green Day, voted best group, and the
Black Eyed Peas.


In [29]:
# Headline (first line) of the sampled entertainment article.
doc.iloc[0].split("\n", 1)[0]

'Goodrem wins top female MTV prize'

In [30]:
# Full article text, re-flowed, for comparison with the generated summary.
print(wrap(doc.iloc[0]))

Goodrem wins top female MTV prize

Pop singer Delta Goodrem has
scooped one of the top individual prizes at the first Australian MTV
Music Awards.

The 21-year-old singer won the award for best female
artist, with Australian Idol runner-up Shannon Noll taking the title
of best male at the ceremony.  Goodrem, known in both Britain and
Australia for her role as Nina Tucker in TV soap Neighbours, also
performed a duet with boyfriend Brian McFadden.  Other winners
included Green Day, voted best group, and the Black Eyed Peas.
Goodrem, Green Day and the Black Eyed Peas took home two awards each.
As well as best female, Goodrem also took home the Pepsi Viewers
Choice Award, whilst Green Day bagged the prize for best rock video
for American Idiot.  The Black Eyed Peas won awards for best R 'n' B
video and sexiest video, both for Hey Mama.  Local singer and
songwriter Missy Higgins took the title of breakthrough artist of the
year, with Australian Idol winner Guy Sebastian taking the honours f