In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
# Show full cell contents when displaying DataFrames. None means "no limit";
# the former -1 sentinel is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [2]:
ls

abcnews-date-text.csv  abcnews-date-text.csv.zip  Text_Clustering_K_Means.ipynb


# Dataset

* News headlines published over a period of 15 years
* Source: ABC (Australian Broadcasting Corporation), http://www.abc.net.au/ (dataset prepared by Rohit Kulkarni)

In [3]:
# Read the headlines CSV, report its dimensions and preview the first rows.
csv_path = "abcnews-date-text.csv"
data = pd.read_csv(csv_path)
print('Shape : ', data.shape)
data.head()

Shape :  (1103665, 2)


Unnamed: 0,publish_date,headline_text
0,20030219,aba decides against community broadcasting licence
1,20030219,act fire witnesses must be aware of defamation
2,20030219,a g calls for infrastructure protection summit
3,20030219,air nz staff in aust strike for pay rise
4,20030219,air nz strike to affect australian travellers


# Remove duplicates

In [4]:
# Preview headlines that occur more than once, sorted so repeats sit together.
# `duplicated(keep=...)` controls which occurrences are flagged True:
#   keep=False   -> every occurrence of a repeated value
#   keep='first' -> every occurrence except the first
#   keep='last'  -> every occurrence except the last

repeated_mask = data['headline_text'].duplicated(keep=False)
data[repeated_mask].sort_values('headline_text').head(10)

Unnamed: 0,publish_date,headline_text
57973,20031129,10 killed in pakistan bus crash
116304,20040920,10 killed in pakistan bus crash
912357,20141023,110 with barry nicholls
673104,20120217,110 with barry nicholls
676569,20120302,110 with barry nicholls
748865,20121214,110 with barry nicholls
827317,20131017,110 with barry nicholls episode 15
898182,20140820,110 with barry nicholls episode 15
899506,20140826,110 with barry nicholls episode 16
827318,20131017,110 with barry nicholls episode 16


In [5]:
# Keep only the first occurrence of each headline. Rebinding `data` (instead
# of mutating it with inplace=True) makes the step an explicit assignment.
data = data.drop_duplicates(subset='headline_text', keep='first')

# Sanity check: no headline may appear more than once after this step.
assert data['headline_text'].is_unique, "duplicate headlines remain"

In [6]:
# Repeat the duplicate check; an empty frame means no repeated headlines remain.
(
    data.loc[data['headline_text'].duplicated(keep=False)]
    .sort_values('headline_text')
    .head(10)
)

Unnamed: 0,publish_date,headline_text


# NLP

# TF-IDF

In [7]:
# Punctuation marks added on top of scikit-learn's built-in English stop words.
punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]

# `stop_words` itself stays a frozenset, exactly as before.
stop_words = text.ENGLISH_STOP_WORDS.union(punc)

# TfidfVectorizer documents `stop_words` as 'english', a list, or None; recent
# scikit-learn releases validate this and reject a frozenset, so pass a list.
vectorizer = TfidfVectorizer(stop_words = list(stop_words))

# Fit the vocabulary and build the sparse TF-IDF matrix (one row per headline).
X = vectorizer.fit_transform(data['headline_text'].values)

X

<1076225x96397 sparse matrix of type '<class 'numpy.float64'>'
	with 5525887 stored elements in Compressed Sparse Row format>

In [8]:
# Compare the frame's shape with the TF-IDF matrix's shape
# (label typo "Datset" corrected to "Dataset").
print('Dataset : ', data.shape)
print('TF-IDF Matrix :', X.shape)

Datset :  (1076225, 2)
TF-IDF Matrix : (1076225, 96397)


In [9]:
# get_feature_names() was removed from recent scikit-learn releases; its
# replacement get_feature_names_out() returns a NumPy array, so convert it
# back to a plain list of str to keep the original list semantics and repr.
word_features = vectorizer.get_feature_names_out().tolist()
print('No. of word features : ', len(word_features))
print()
# Peek at an arbitrary 100-term slice of the vocabulary.
print(word_features[5000:5100])

No. of word features :  96397

['abyss', 'ac', 'aca', 'acacia', 'acacias', 'acadamy', 'academia', 'academic', 'academics', 'academies', 'academy', 'academys', 'acai', 'acapulco', 'acars', 'acason', 'acasuso', 'acb', 'acbf', 'acc', 'acca', 'accan', 'accc', 'acccc', 'acccs', 'acccused', 'acce', 'accedes', 'accelerant', 'accelerants', 'accelerate', 'accelerated', 'accelerates', 'accelerating', 'acceleration', 'accelerator', 'accen', 'accent', 'accents', 'accentuate', 'accentuates', 'accentuating', 'accenture', 'accept', 'acceptability', 'acceptable', 'acceptably', 'acceptance', 'acceptances', 'accepted', 'accepting', 'acceptor', 'acceptors', 'accepts', 'accerate', 'acces', 'access', 'accessary', 'accessed', 'accesses', 'accessibility', 'accessible', 'accessing', 'accessories', 'accessory', 'accesss', 'acci', 'accid', 'accide', 'acciden', 'accidenatlly', 'accidenbt', 'accident', 'accidental', 'accidentally', 'accidently', 'accidents', 'acciona', 'accis', 'acclaim', 'acclaimed', 'acclamatio

# Stemming and Tokenizing

In [10]:
# English Snowball stemmer used to reduce each token to its stem.
stemmer = SnowballStemmer(language='english')
# Tokens are maximal runs of ASCII letters and apostrophes.
tokenizer = RegexpTokenizer(pattern=r'[a-zA-Z\']+')

def tokenize(text):
    """Lower-case ``text``, split it with the module-level ``tokenizer`` and
    return a list holding the ``stemmer`` stem of every token, in order."""
    raw_tokens = tokenizer.tokenize(text.lower())
    return list(map(stemmer.stem, raw_tokens))

In [11]:
# TF-IDF again, this time tokenizing and stemming with `tokenize`.
# - stop_words is passed as a list: recent scikit-learn rejects a frozenset.
# - token_pattern=None: the default pattern is unused once a custom tokenizer
#   is supplied, and leaving it set only triggers a warning.
vectorizer2 = TfidfVectorizer(stop_words = list(stop_words), tokenizer = tokenize, token_pattern = None)
X2 = vectorizer2.fit_transform(data['headline_text'].values)
# get_feature_names() was removed from recent scikit-learn releases;
# .tolist() turns the array from get_feature_names_out() back into a list of str.
word_features2 = vectorizer2.get_feature_names_out().tolist()

print("No. of word features : ", len(word_features2))
print()
print(word_features2[:50])

No. of word features :  65232

["'a", "'i", "'s", "'t", 'aa', 'aaa', 'aaahhh', 'aac', 'aacc', 'aaco', 'aacta', 'aad', 'aadmi', 'aag', 'aagaard', 'aagard', 'aah', 'aalto', 'aam', 'aamer', 'aami', 'aamodt', 'aandahl', 'aant', 'aap', 'aapa', 'aapt', 'aar', 'aaradhna', 'aardman', 'aardvark', 'aargau', 'aaron', 'aaronpaul', 'aarwun', 'aat', 'ab', 'aba', 'abaaoud', 'ababa', 'aback', 'abadi', 'abadon', 'abal', 'abalon', 'abalonv', 'abama', 'abandon', 'abandond', 'abandong']


In [12]:
# max_features: if not None, build a vocabulary that only considers the top
# `max_features` terms ordered by term frequency across the corpus.
# - stop_words is passed as a list: recent scikit-learn rejects a frozenset.
# - token_pattern=None: the default pattern is unused once a custom tokenizer
#   is supplied, and leaving it set only triggers a warning.

vectorizer3 = TfidfVectorizer(stop_words = list(stop_words), tokenizer = tokenize,
                              token_pattern = None, max_features = 1000)
X3 = vectorizer3.fit_transform(data['headline_text'].values)
# get_feature_names() was removed from recent scikit-learn releases;
# .tolist() turns the array from get_feature_names_out() back into a list of str.
words = vectorizer3.get_feature_names_out().tolist()

print("No. of word features : ", len(words))
print()
print(words[:50])

No. of word features :  1000

['abbott', 'abc', 'aborigin', 'abus', 'access', 'accid', 'accus', 'act', 'action', 'ad', 'address', 'adelaid', 'admit', 'affect', 'afghan', 'afghanistan', 'afl', 'africa', 'age', 'agre', 'agreement', 'ahead', 'aid', 'aim', 'air', 'airport', 'al', 'alcohol', 'alert', 'alic', 'alleg', 'allow', 'alp', 'ambul', 'amid', 'andrew', 'anger', 'anim', 'announc', 'anoth', 'anti', 'anzac', 'appeal', 'appear', 'appoint', 'approv', 'area', 'arm', 'armi', 'arrest']
