In [2]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd

# Reading and understanding the data

In [3]:
# Read in the data
# The path is relative, so it resolves against the notebook's working directory.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column, which
# suggests the CSV was saved together with its index; index_col=0 would absorb it — confirm.
df = pd.read_csv('../Data/BBC-articles.csv')

In [4]:
df.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


## You can do this using the LSI/LSA and LDA algorithms, after vectorizing the text with TF-IDF in three different ways:

- TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)
- TF-IDF with a term-frequency filter that excludes the top 10% most frequent words and words that appear fewer than 5 times in the documents.
- TF-IDF limited to nouns, noun phrases, and named entity recognition only.

In [5]:
## Let's use nltk to extract the most significant words from each article, so they can also be used with TF-IDF, LSI and LDA

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

# stopwords from gensim
from gensim.parsing.preprocessing import STOPWORDS

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

In [6]:
def normal_clean(text, stopwords=None):
    """Tokenize ``text`` after removing punctuation, digits and stopwords.

    Args:
        text (str): Raw document text.
        stopwords: Optional container of words to drop (anything supporting
            ``in``). When omitted, gensim's ``STOPWORDS`` imported at the top
            of the notebook is used, which is the original behaviour.

    Returns:
        list[str]: The remaining whitespace-delimited tokens, in order.
        Case is left untouched, so stopword matching is case-sensitive.
    """
    if stopwords is None:
        stopwords = STOPWORDS
    # Punctuation is deleted rather than replaced by a space, so hyphenated
    # or apostrophised words are fused ("set-top" -> "settop").
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove every run of digits.
    text = re.sub(r'\d+', '', text)
    # split() without arguments already ignores leading, trailing and repeated
    # whitespace, so no separate strip() is needed.
    return [word for word in text.split() if word not in stopwords]

In [7]:
# Run every article through normal_clean and keep the resulting token lists
# in a new 'clean_text1' column
df['clean_text1'] = df['text'].map(normal_clean)

df.head(10)

Unnamed: 0,category,text,clean_text1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war..."


In [8]:
# TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Configure the TF-IDF vectorizer: English stop-word list, a document-frequency
# ceiling of 0.5, and a vocabulary capped at 2000 features
vectorizer_1 = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    max_features=2000,
    smooth_idf=True,
)

# Re-join each token list into a single string, then fit the vectorizer and
# build the document-term matrix in one step
X = vectorizer_1.fit_transform(df['clean_text1'].apply(' '.join))

In [10]:
# Store each document's dense TF-IDF row as one array per dataframe cell
df['tfidf_1'] = list(X.toarray())
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### TF-IDF with a term-frequency filter that excludes the top 10% most frequent words and words that appear fewer than 5 times in the documents.

In [11]:
# TF-IDF with term-frequency filtering: we will exclude the top 10% of terms
# with the highest term frequency across the entire corpus.
# Step 1: count how often every whitespace-delimited token occurs.
# NOTE: tokens are taken from the raw text, so punctuation stays attached
# ("profits." and "profits" are counted as two different words).

# Counter is a dict subclass, so the following cells that delete keys,
# call .get() and take len() of word_frequencies work unchanged.
word_frequencies = Counter(word for text in df['text'] for word in text.split())

# Ten most frequent tokens; ties keep first-seen order, exactly like a
# stable descending sort of the (word, count) pairs.
word_frequencies.most_common(10)

[('the', 52567),
 ('to', 24955),
 ('of', 19947),
 ('and', 18561),
 ('a', 18251),
 ('in', 17570),
 ('s', 9007),
 ('for', 8884),
 ('is', 8515),
 ('that', 8135)]

In [12]:
# Number of distinct words that make up 10% of the vocabulary, truncated to an int.
# This is 10% of the unique words counted, not 10% of all word occurrences.
top_10_percent = int(len(word_frequencies) * 0.1)

In [13]:
# Drop the top_10_percent most frequent words from word_frequencies.
# sorted() builds the full ranking before the loop starts, so removing keys
# inside the loop is safe.
# NOTE: this mutates word_frequencies in place; running the cell again removes
# a further top_10_percent words.
for word in sorted(word_frequencies, key=lambda w: word_frequencies[w], reverse=True)[:top_10_percent]:
    word_frequencies.pop(word)

# Ten most frequent words that survive the cut
sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)[:10]

[('profits.', 19),
 ('recovered', 19),
 ('strategist', 19),
 ('consortium', 19),
 ('avoided', 19),
 ('motor', 19),
 ('bank.', 19),
 ('hopeful', 19),
 ('christmas.', 19),
 ('attachment', 19)]

In [14]:
len(word_frequencies)

39394

In [15]:
# Discard every remaining word that occurs fewer than 5 times.
# Iterate over a snapshot of the keys so the dict can shrink during the loop.
for word in tuple(word_frequencies):
    if word_frequencies[word] >= 5:
        continue
    del word_frequencies[word]

len(word_frequencies)

8022

In [16]:
# Let's tokenize the text and keep only the words that are in the word_frequencies dictionary

def tokenize_text(text, vocabulary=None):
    """Split ``text`` on whitespace and keep only tokens found in a vocabulary.

    Args:
        text (str): Raw document text.
        vocabulary: Optional container of allowed tokens (anything supporting
            ``in``). When omitted, the notebook-level ``word_frequencies``
            dictionary is used, which is the original behaviour.

    Returns:
        list[str]: The retained tokens in their original order, duplicates
        included.
    """
    if vocabulary is None:
        # Looked up at call time, so the filtering applied to
        # word_frequencies in the earlier cells is honoured.
        vocabulary = word_frequencies
    return [token for token in text.split() if token in vocabulary]


In [17]:
# Build the clean_text2 column by passing every article through tokenize_text
df['clean_text2'] = df['text'].apply(tokenize_text)

In [18]:
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent..."


In [29]:
# Configure a second TF-IDF vectorizer with the same settings as the first one
vectorizer_2 = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    max_features=2000,
    smooth_idf=True,
)

# Re-join the frequency-filtered tokens into strings, then fit and transform
X = vectorizer_2.fit_transform(df['clean_text2'].apply(' '.join))

# Store each document's dense TF-IDF row as one array per dataframe cell
df['tfidf_2'] = list(X.toarray())

In [30]:
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2,tfidf_2
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2767406767597..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [38]:
# LSI with normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

from gensim import corpora, models
from gensim.models import CoherenceModel
import gensim

In [39]:
# Fit LSI on a gensim TF-IDF corpus built from the 'clean_text1' tokens.
# NOTE: the scikit-learn vectors stored in the 'tfidf_1' column are not used
# here; a separate gensim TfidfModel is fitted below.
# create a dictionary from the 'clean_text1' column
dictionary = corpora.Dictionary(df['clean_text1'])
# convert every document's token list to a bag-of-words vector
corpus = [dictionary.doc2bow(text) for text in df['clean_text1']]

# create the tfidf model
tfidf = models.TfidfModel(corpus) # step 1 -- initialize a model
corpus_tfidf = tfidf[corpus] # step 2 -- use the model to transform vectors

# create the LSI model (5 topics)
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5) # initialize an LSI transformation
corpus_lsi = lsi[corpus_tfidf] # create a double wrapper over the original corpus: bow->tfidf->fold-in-lsi
# show the five topics as (topic_id, weighted-terms string) pairs
lsi.print_topics(5) # NOTE(review): presumably the bow->tfidf->lsi transformations only run when corpus_lsi is iterated, not here — confirm

[(0,
  '0.172*"mr" + 0.144*"labour" + 0.125*"blair" + 0.120*"election" + 0.117*"brown" + 0.098*"party" + 0.098*"government" + 0.090*"people" + 0.089*"tax" + 0.076*"howard"'),
 (1,
  '0.269*"labour" + 0.227*"blair" + 0.218*"election" + 0.207*"brown" + 0.187*"mr" + 0.170*"party" + 0.144*"tax" + 0.129*"chancellor" + 0.123*"howard" + -0.118*"film"'),
 (2,
  '-0.145*"mobile" + 0.138*"england" + -0.116*"growth" + 0.106*"o" + 0.098*"wales" + 0.097*"best" + 0.095*"win" + -0.094*"economy" + -0.092*"broadband" + 0.092*"ireland"'),
 (3,
  '-0.288*"film" + -0.174*"best" + -0.148*"awards" + 0.137*"england" + -0.123*"award" + 0.120*"economy" + 0.119*"growth" + -0.116*"music" + 0.104*"yukos" + -0.097*"actor"'),
 (4,
  '-0.237*"film" + 0.167*"mobile" + -0.167*"best" + -0.134*"dollar" + -0.126*"economy" + -0.121*"awards" + -0.114*"growth" + -0.112*"yukos" + 0.112*"phone" + -0.105*"oil"')]

In [42]:
# Add the LSI topics to the dataframe
# Iterating corpus_lsi yields one list of (topic_id, weight) pairs per document.
df['lsi_1'] = [x for x in corpus_lsi]
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2,tfidf_2,clean_text3,tfidf_3,lsi_1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.17381940749806418), (1, -0.097430843999..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2767406767597...","[worldcom, boss, books, worldcom, boss, bernie...","[0.0, 0.0, 0.0, 0.04777469913386982, 0.0, 0.0,...","[(0, 0.09791059841518036), (1, 0.0166372555620..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, farrell, gamble, leicester, bid, andy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09746965699244897), (1, -0.061448447573..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[newcastle, fa, cup, premiership, side, newcas...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.05383926016484348), (1, -0.050320480016..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ocean, raids, box, office, ocean, crime, cape...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09163007198059406), (1, -0.079074039232..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[mongrel, jibe, michael, howard, claim, peter,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.2710264974800512), (1, 0.26200531828940..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[blair, poll, date, tony, blair, election, day...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.2141647407666234), (1, 0.20515532581798..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[henman, dubai, third, seed, tim, henman, sets...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.07130168878091067), (1, -0.062725925909..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[wilkinson, fit, edinburgh, england, captain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09549893061451255), (1, -0.105348418672..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[star, wars, children, star, wars, movie, chil...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.06145597297702567), (1, -0.045027374628..."


In [43]:
# Let's use LDA with normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

# create the LDA model
# random_state pins the model's random initialisation so the topics (and the
# 'lda_1' column) come out the same on every Restart & Run All.
lda = models.LdaModel(corpus_tfidf, id2word=dictionary, num_topics=5, random_state=42)
corpus_lda = lda[corpus_tfidf] # wrapper over the original corpus: bow->tfidf->lda

# print the topics; an explicit print is required because this is not the
# last expression of the cell, so its value would otherwise be discarded
print(*lda.print_topics(5), sep='\n')

# Add the LDA topics to the dataframe: one list of (topic_id, probability)
# pairs per document, produced while corpus_lda is iterated
df['lda_1'] = [x for x in corpus_lda]

df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2,tfidf_2,clean_text3,tfidf_3,lsi_1,lda_1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.17381940749806418), (1, -0.097430843999...","[(0, 0.01531582), (1, 0.17132235), (2, 0.01530..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2767406767597...","[worldcom, boss, books, worldcom, boss, bernie...","[0.0, 0.0, 0.0, 0.04777469913386982, 0.0, 0.0,...","[(0, 0.09791059841518036), (1, 0.0166372555620...","[(0, 0.02477319), (1, 0.35160497), (2, 0.24446..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, farrell, gamble, leicester, bid, andy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09746965699244897), (1, -0.061448447573...","[(0, 0.02625257), (1, 0.19509183), (2, 0.45252..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[newcastle, fa, cup, premiership, side, newcas...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.05383926016484348), (1, -0.050320480016...","[(0, 0.027747426), (1, 0.027089313), (2, 0.519..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ocean, raids, box, office, ocean, crime, cape...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09163007198059406), (1, -0.079074039232...","[(0, 0.020635178), (1, 0.5292641), (2, 0.40870..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[mongrel, jibe, michael, howard, claim, peter,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.2710264974800512), (1, 0.26200531828940...","[(0, 0.017675225), (1, 0.01776712), (2, 0.0180..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[blair, poll, date, tony, blair, election, day...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.2141647407666234), (1, 0.20515532581798...","[(0, 0.022995805), (1, 0.023091806), (2, 0.022..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[henman, dubai, third, seed, tim, henman, sets...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.07130168878091067), (1, -0.062725925909...","[(0, 0.8887813), (1, 0.027784547), (2, 0.02833..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[wilkinson, fit, edinburgh, england, captain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.09549893061451255), (1, -0.105348418672...","[(0, 0.02655201), (1, 0.024948364), (2, 0.3006..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[star, wars, children, star, wars, movie, chil...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[(0, 0.06145597297702567), (1, -0.045027374628...","[(0, 0.89501214), (1, 0.025576362), (2, 0.0275..."


### TF-IDF limited to nouns, noun phrases, and named entity recognition only.

In [32]:
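# Let's use spaCy to keep only noun and proper-noun tokens (selected by POS tag); noun phrases and named entities are not extracted separately here, and lemmas are read from the kept tokens later when the TF-IDF input is built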
import spacy
nlp = spacy.load('en_core_web_md')

def lemmatize_text(text, allowed_pos=('NOUN', 'PROPN'), pipeline=None):
    """Return the tokens of ``text`` whose coarse part-of-speech tag is allowed.

    Despite the name, no lemmatization happens here: the returned items are
    the token objects produced by the pipeline, and the caller reads
    ``token.lemma_`` from them afterwards.

    Args:
        text (str): Raw document text.
        allowed_pos: Iterable of ``token.pos_`` values to keep. Defaults to
            common nouns and proper nouns (the original list named 'NOUN'
            twice, which had no effect).
        pipeline: Optional callable that turns ``text`` into an iterable of
            tokens exposing ``pos_``. When omitted, the notebook-level ``nlp``
            model is used, which is the original behaviour.

    Returns:
        list: The matching tokens, in document order.
    """
    if pipeline is None:
        pipeline = nlp
    wanted = frozenset(allowed_pos)
    return [token for token in pipeline(text) if token.pos_ in wanted]

In [36]:
# Build the clean_text3 column by passing every article through lemmatize_text
df['clean_text3'] = df['text'].apply(lemmatize_text)

df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2,tfidf_2,clean_text3
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2767406767597...","[worldcom, boss, books, worldcom, boss, bernie..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, farrell, gamble, leicester, bid, andy..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[newcastle, fa, cup, premiership, side, newcas..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ocean, raids, box, office, ocean, crime, cape..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[mongrel, jibe, michael, howard, claim, peter,..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[blair, poll, date, tony, blair, election, day..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[henman, dubai, third, seed, tim, henman, sets..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[wilkinson, fit, edinburgh, england, captain, ..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[star, wars, children, star, wars, movie, chil..."


In [37]:
# Configure a third TF-IDF vectorizer with the same settings as the other two
vectorizer_3 = TfidfVectorizer(
    stop_words='english',
    max_df=0.5,
    max_features=2000,
    smooth_idf=True,
)

# Join the lemma of every kept token into one string per document, then fit
# and transform
X = vectorizer_3.fit_transform(
    df['clean_text3'].apply(lambda tokens: ' '.join(token.lemma_ for token in tokens))
)

# Store each document's dense TF-IDF row as one array per dataframe cell
df['tfidf_3'] = list(X.toarray())

df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2,tfidf_2,clean_text3,tfidf_3
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2767406767597...","[worldcom, boss, books, worldcom, boss, bernie...","[0.0, 0.0, 0.0, 0.04777469913386982, 0.0, 0.0,..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, farrell, gamble, leicester, bid, andy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[newcastle, fa, cup, premiership, side, newcas...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[ocean, raids, box, office, ocean, crime, cape...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[mongrel, jibe, michael, howard, claim, peter,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[blair, poll, date, tony, blair, election, day...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[henman, dubai, third, seed, tim, henman, sets...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[wilkinson, fit, edinburgh, england, captain, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[star, wars, children, star, wars, movie, chil...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
