In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import re,string

In [2]:
df = pd.read_csv('./static/dataset/bbc-text-filtered.csv')
print(df.shape, df['category'].nunique())
df.head()

(973, 3) 3


Unnamed: 0.1,Unnamed: 0,category,text
0,1,sport,tigers wary of farrell gamble leicester say ...
1,2,sport,yeading face newcastle in fa cup premiership s...
2,3,politics,howard hits back at mongrel jibe michael howar...
3,4,health,some 34% of doctors joining the health service...
4,5,health,"it found england is now short of 12,000 hospit..."


In [3]:
df['category'].value_counts()

sport       504
politics    415
health       54
Name: category, dtype: int64

In [4]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("stopwords")
from nltk.corpus import stopwords

sw = stopwords.words('english')
print(sw)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to /Users/arjun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
def removeStopWords(text):
   text = text.lower()
   text = re.sub('[^a-z ]+','', text)
   text = [
       word
       for word in word_tokenize(text)
       if not word in set(stopwords.words("english"))
   ]
   text = " ".join(text)
   return text

In [7]:
df["text"] = df["text"].apply(lambda text: removeStopWords(text))

In [8]:
df

Unnamed: 0.1,Unnamed: 0,category,text
0,1,sport,tigers wary farrell gamble leicester say rushe...
1,2,sport,yeading face newcastle fa cup premiership side...
2,3,politics,howard hits back mongrel jibe michael howard s...
3,4,health,doctors joining health service last year came ...
4,5,health,found england short hospital doctors nurses mi...
...,...,...,...
968,1004,politics,kilroy unveils immigration policy exchatshow h...
969,1005,politics,political squabbles snowball become commonplac...
970,1006,sport,souness delight euro progress boss graeme soun...
971,1007,health,people times likely get covid airborne viral p...


In [9]:
vec = TfidfVectorizer()
attributes= vec.fit_transform(df['text'])
print("Input features shape:",attributes.shape)
print(f"\nTake a look at the features extracted from the first news article:\n{attributes[0].toarray()}")

Input features shape: (973, 16846)

Take a look at the features extracted from the first news article:
[[0. 0. 0. ... 0. 0. 0.]]


In [10]:
from sklearn.cluster import KMeans
kmeans = KMeans(3,n_init=1,random_state=150)
kmeans.fit(attributes)
df['cluster'] = kmeans.labels_

In [11]:
df.head(20)

Unnamed: 0.1,Unnamed: 0,category,text,cluster
0,1,sport,tigers wary farrell gamble leicester say rushe...,0
1,2,sport,yeading face newcastle fa cup premiership side...,0
2,3,politics,howard hits back mongrel jibe michael howard s...,2
3,4,health,doctors joining health service last year came ...,1
4,5,health,found england short hospital doctors nurses mi...,1
5,6,politics,hague given pm ambition former conservative le...,2
6,7,sport,moya emotional davis cup win carlos moya descr...,0
7,8,politics,howard backs stem cell research michael howard...,1
8,9,sport,connors boost british tennis former world numb...,0
9,10,sport,bates seals takeover ken bates completed takeo...,0


In [12]:
for cat in df['category'].unique():
    mark = df['category'] == cat
    print(f"{cat}\n{df[mark]['cluster'].value_counts()}\n")

sport
0    398
1    106
Name: cluster, dtype: int64

politics
2    219
1    196
Name: cluster, dtype: int64

health
1    54
Name: cluster, dtype: int64



In [13]:
category = {}
for ctg in df['category'].unique():
    mark = df['category'] == ctg
    top = df[mark]['cluster'].value_counts().head(1)
    count = top.values[0]
    cluster = top.index[0]
    print(f"{cat}:\n Top cluster number: {cluster}, Number of samples: {count}")
    category[cluster] = ctg

print("\nMap cluster number to category:")
category

health:
 Top cluster number: 0, Number of samples: 398
health:
 Top cluster number: 2, Number of samples: 219
health:
 Top cluster number: 1, Number of samples: 54

Map cluster number to category:


{0: 'sport', 2: 'politics', 1: 'health'}

In [14]:
print("Categories of the data")
category

Categories of the data


{0: 'sport', 2: 'politics', 1: 'health'}

In [15]:
df['predicted'] = df['cluster'].map(category)
df.head(20)

Unnamed: 0.1,Unnamed: 0,category,text,cluster,predicted
0,1,sport,tigers wary farrell gamble leicester say rushe...,0,sport
1,2,sport,yeading face newcastle fa cup premiership side...,0,sport
2,3,politics,howard hits back mongrel jibe michael howard s...,2,politics
3,4,health,doctors joining health service last year came ...,1,health
4,5,health,found england short hospital doctors nurses mi...,1,health
5,6,politics,hague given pm ambition former conservative le...,2,politics
6,7,sport,moya emotional davis cup win carlos moya descr...,0,sport
7,8,politics,howard backs stem cell research michael howard...,1,health
8,9,sport,connors boost british tennis former world numb...,0,sport
9,10,sport,bates seals takeover ken bates completed takeo...,0,sport


In [16]:
nltk.download("punkt")
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer





[nltk_data] Downloading package punkt to /Users/arjun/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [22]:
portStemmer = PorterStemmer()

testDoc = ["hague  given up  his pm ambition former conservative leader william hague says he will not stand for the leadership again  having given up his ambition to be prime minister.  mr hague  43  told the daily telegraph he would now find a life dominated by politics too  boring  and unfulfilling. mr hague  who stepped down after his party s 2001 election defeat  does not rule out a return to the front bench. he also told the paper he hopes to remain mp for richmond  north yorks  and start a family with wife ffion. mr hague  who recently had published the biography of william pitt the younger  also said he wanted to continue writing books and speech-writing.  he told the newspaper:  i don t know whether i will ever go back on to the front  but don t rush me.  asked if he would stand for the leadership again  mr hague replied:  no. definitely not.  his determination to stay away from a central role will disappoint some senior conservative members  who say the party needs him. tim collins  the shadow education secretary  said last week it would be a  huge boost  to the party if mr hague returned to the front bench. mr hague became an mp at 27 and leader of the opposition at 36. he said:  i feel fortunate that  by the age of 40  i had crammed in an entire political career.  i had been in the cabinet and been leader of the party  so now i can branch out into other things...it is a very liberating feeling.  mr hague added that he may have misjudged his own ambition to be prime minister.  maybe i wasn t as driven by politics as i thought i was   he said.",
          "to the people who say im tired when someone asks how you latest figures show nearly one in 10 nursing and midwifery posts are vacant.  the royal college of nursing scotland says health boards failing to accommodate staff who want to change their working patterns rather than leave is making the problem worse.",
          "moya emotional after davis cup win carlos moya described spain s davis cup victory as the highlight of his career after he beat andy roddick to end the usa s challenge in seville.  moya made up for missing spain s 2000 victory through injury by beating roddick 6-2 7-6 (7-1) 7-6 (7-5) to give the hosts an unassailable 3-1 lead.  i have woken up so many nights dreaming of this day   said moya.  all my energy has been focused on today.  what i have lived today i do not think i will live again.  spain s only other davis cup title came two years ago in valencia  when they beat australia. and moya  nicknamed charly  admitted:  the davis cup is my dream and i was a bit nervous at the outset.  some people have said that i am obsessed but i think that it is better this way. it helps me reach my goals if i am obsessed.  it s really incredible - to get the winning point is really something.  spanish captain jordi arrese said:  charly played a great game. it was his opportunity and he hasn t let us down.  he had lost three times to roddick  and this was his day to beat him.  he had been waiting years to be in this position.  spain s victory was also remarkable for the performance of rafael nadal  who beat roddick in the opening singles.  aged 18 years and 185 days  the mallorcan became the youngest player to win the davis cup.  what a great way to finish the year   said nadal afterwards. us coach patrick mcenroe wants roddick and the rest of his team to play more tennis on clay and hone their skills on the surface.  i think it will help these guys even on slow hard courts to learn how to mix things up a little bit and to play a little bit smarter and tactically better.   obviously it s unrealistic to say that we re going to just start playing constantly on clay  with the schedule.  but certainly i think we can put the work in at the appropriate time and play a couple more events and play against these guys who are the best on this stuff   said mcenroe. roddick was left frustrated after losing both his singles on the slow clay of seville s olympic stadium.  it s just tough because i felt like i was in it the whole time against one of the top three clay-courters in the world   said the american.  i had my chances and just didn t convert them. the bottom line is they were just better than us this weekend.  they came out  took care of business and they beat us. it s as simple as that."]
filteredTestDocs = []
for doc in testDoc:
    tokens = word_tokenize(doc)
    tmp = ""
    for word in tokens:
        if word not in sw:
            tmp += portStemmer.stem(word) + " "
    filteredTestDocs.append(tmp)

print("Document\n"+testDoc[0])
print()
print("Filter:\n"+filteredTestDocs[0])
Y = vec.transform([filteredTestDocs[0]])
prediction = kmeans.predict(Y)
print()
print("Predicted Class:" +str(category[int(prediction)]))
print()
print("Document\n"+testDoc[1])
print()
print("Filter:\n"+filteredTestDocs[1])
Y = vec.transform([filteredTestDocs[1]])
prediction = kmeans.predict(Y)
print()
print("Predicted Class:" +str(category[int(prediction)]))
print()
print("Document\n"+testDoc[2])
print()
print("Filter:\n"+filteredTestDocs[2])
Y = vec.transform([filteredTestDocs[2]])
prediction = kmeans.predict(Y)
print()
print("Predicted Class:" +str(category[int(prediction)]))




Document
hague  given up  his pm ambition former conservative leader william hague says he will not stand for the leadership again  having given up his ambition to be prime minister.  mr hague  43  told the daily telegraph he would now find a life dominated by politics too  boring  and unfulfilling. mr hague  who stepped down after his party s 2001 election defeat  does not rule out a return to the front bench. he also told the paper he hopes to remain mp for richmond  north yorks  and start a family with wife ffion. mr hague  who recently had published the biography of william pitt the younger  also said he wanted to continue writing books and speech-writing.  he told the newspaper:  i don t know whether i will ever go back on to the front  but don t rush me.  asked if he would stand for the leadership again  mr hague replied:  no. definitely not.  his determination to stay away from a central role will disappoint some senior conservative members  who say the party needs him. tim coll