In [1]:
import numpy as np
import pandas as pd
import textwrap
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification
#!wget -nc https://lazyprogrammer.me/course_files/nlp/bbc_text_cls.csv

In [3]:
df = pd.read_csv('bbc_text_cls.csv')
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [4]:
doc = df[df.labels == 'business']['text'].sample(random_state=42)

In [5]:
def wrap(x):
  return textwrap.fill(x, replace_whitespace=False, fix_sentence_endings=True)

In [6]:
print(wrap(doc.iloc[0]))

Christmas sales worst since 1981

UK retail sales fell in December,
failing to meet expectations and making it by some counts the worst
Christmas since 1981.

Retail sales dropped by 1% on the month in
December, after a 0.6% rise in November, the Office for National
Statistics (ONS) said.  The ONS revised the annual 2004 rate of growth
down from the 5.9% estimated in November to 3.2%. A number of
retailers have already reported poor figures for December.  Clothing
retailers and non-specialist stores were the worst hit with only
internet retailers showing any significant growth, according to the
ONS.

The last time retailers endured a tougher Christmas was 23 years
previously, when sales plunged 1.7%.

The ONS echoed an earlier
caution from Bank of England governor Mervyn King not to read too much
into the poor December figures.  Some analysts put a positive gloss on
the figures, pointing out that the non-seasonally-adjusted figures
showed a performance comparable with 2003. The Novembe

In [7]:
sents = nltk.sent_tokenize(doc.iloc[0].split('\n',1)[1])

In [8]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'),norm='l1')

In [9]:
X = vectorizer.fit_transform(sents)

In [10]:
def get_sentence_score(tfidf_row):
  x = tfidf_row[tfidf_row != 0]
  return x.mean()

In [11]:
scores = np.zeros(len(sents))
for i in range(len(sents)):
  score = get_sentence_score(X[i,:])
  scores[i] = score

In [12]:
sort_idx = np.argsort(-scores)

In [13]:
print('generated summary:')
for i in sort_idx[:5]:
  print(wrap('%.2f: %s'% (scores[i],sents[i])))

generated summary:
0.14: A number of retailers have already reported poor figures for
December.
0.13: However, reports from some High Street retailers highlight the
weakness of the sector.
0.12: The ONS revised the annual 2004 rate of growth down from the
5.9% estimated in November to 3.2%.
0.10: "Our view is the Bank of England will keep its powder dry and
wait to see the big picture."
0.10: And a British Retail Consortium survey found that Christmas 2004
was the worst for 10 years.


In [14]:
doc.iloc[0].split('\n', 1)[0]

'Christmas sales worst since 1981'

In [16]:
def summarize(text):
  sents = nltk.sent_tokenize(text)

  X = vectorizer.fit_transform(sents)

  scores = np.zeros(len(sents))

  for i in range(len(sents)):
    score = get_sentence_score(X[i, :])
    scores[i] = score

  sort_idx = np.argsort(-scores)

  for i in sort_idx[:5]:
    print(wrap('%.2f: %s' % (scores[i], sents[i])))



In [17]:
doc =df[df.labels == 'entertainment']['text'].sample(random_state=42)
summarize(doc.iloc[0].split('\n', 1)[1])

0.33: Almost 17 years now.
0.33: I've done quite a few films."
0.33: A retrospective of his movies was shown.
0.33: "It's what I want to do for the rest of my life."
0.25: "What's really exciting, for me, is that this is what I really
love doing," he added.


  return self.astype(np.float_)._mul_scalar(1./other)


In [18]:
doc.iloc[0].split('\n', 1)[0]

'Career honour for actor DiCaprio'

In [19]:
print(wrap(doc.iloc[0]))

Career honour for actor DiCaprio

Actor Leonardo DiCaprio's
"exceptional career" has been honoured at the Santa Barbara
International Film Festival.

The star was presented with the award by
Martin Scorsese, who directed him in Oscar-nominated movie The
Aviator.  "It's a lifetime achievement award, which is completely and
utterly surreal, given I'm only 30 years old," DiCaprio said.  "But
what has it been?  Almost 17 years now.  I've done quite a few films."
A retrospective of his movies was shown.

"What's really exciting, for
me, is that this is what I really love doing," he added.  "It's what I
want to do for the rest of my life."  DiCaprio began his movie career
in horror film Critters 3, before moving onto roles in The Basketball
Diaries, Romeo and Juliet, Titanic and Gangs of New York.  The
achievement award was created to commemorate the California festival's
20th anniversary and coincided with DiCaprio's portrayal of
millionaire Howard Hughes in The Aviator.

Veteran actress Ja