In [61]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd

# Reading and understanding the data

In [62]:
# Load the articles CSV into a DataFrame
articles_csv = '../Data/BBC-articles.csv'
df = pd.read_csv(articles_csv)

In [63]:
# Preview the first ten rows of the loaded frame
df.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


## You can do this using the LSI/LSA and LDA algorithms, after vectorizing the text with TF-IDF in three different ways:

- TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)
- TF-IDF with a term-frequency filter that excludes the top 10% of the most frequent words and words that appear fewer than 5 times in the documents.
- TF-IDF limited to nouns, noun phrases, and named entities only.

In [64]:
# Let's use nltk to extract the most significant words from each article,
# so they can also be used with TF-IDF, LSI and LDA
import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK data packages by name
for resource in ('stopwords', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# stopwords from gensim
from gensim.parsing.preprocessing import STOPWORDS

[nltk_data] Downloading package stopwords to /home/nick/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nick/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/nick/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


### TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

In [65]:
def normal_clean(text):
    """Strip punctuation and digits from ``text`` and return its non-stopword tokens.

    Parameters
    ----------
    text : str
        Document text to clean.

    Returns
    -------
    list of str
        The whitespace-separated tokens of the cleaned text, in their original
        order, excluding every token found in gensim's ``STOPWORDS``.
    """
    # Punctuation characters are deleted, not replaced by a space, so the parts
    # of a hyphenated word are fused together ("set-top" -> "settop").
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = text.translate(punctuation_table)

    # Remove every run of digits.
    without_digits = re.sub(r'\d+', '', without_punctuation)

    # Trim the ends, split on whitespace, and keep only non-stopword tokens.
    kept_tokens = []
    for token in without_digits.strip().split():
        if token in STOPWORDS:
            continue
        kept_tokens.append(token)
    return kept_tokens

In [66]:
# Clean every article; each row of clean_text1 holds that article's list of tokens
df['clean_text1'] = df['text'].map(normal_clean)

df.head(10)

Unnamed: 0,category,text,clean_text1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war..."


In [67]:
# TF-IDF after normal cleaning of the text corpus (punctuation removal, stopword removal, etc.)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
# The vectorizer expects one string per document, so re-join each token list with spaces
documents_1 = df['clean_text1'].map(' '.join)

# TF-IDF vectorizer configured with the English stop-word list, a 2000-feature
# vocabulary cap, and a document-frequency ceiling of 0.5
vectorizer_1 = TfidfVectorizer(
    stop_words='english',
    max_features=2000,
    max_df=0.5,
    smooth_idf=True,
)

# Fit the vectorizer on the documents and transform them in one step
X = vectorizer_1.fit_transform(documents_1)

In [70]:
# Store each document's dense TF-IDF row as one cell of a new column
df['tfidf_1'] = list(X.toarray())
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


### TF-IDF with a term-frequency filter that excludes the top 10% of the most frequent words and words that appear fewer than 5 times in the documents.

In [82]:
# TF-IDF with term frequency filtering: we will be excluding the top 10% of terms
# with the highest term frequency across the entire corpus.

# Count every whitespace-separated token of the raw text. Tokens are counted
# exactly as they appear, so attached punctuation is part of the token
# ("bank." and "bank" are counted separately).
token_counts = Counter(
    token
    for document in df['text']
    for token in document.split()
)

# Later cells delete keys from this mapping in place, so hand them a plain dict;
# keys keep their first-seen insertion order.
word_frequencies = dict(token_counts)

# Ten most frequent tokens; equal counts are listed in first-seen order.
token_counts.most_common(10)

[('the', 52567),
 ('to', 24955),
 ('of', 19947),
 ('and', 18561),
 ('a', 18251),
 ('in', 17570),
 ('s', 9007),
 ('for', 8884),
 ('is', 8515),
 ('that', 8135)]

In [84]:
# Number of entries equal to 10% of the current size of word_frequencies (truncated by int()).
# NOTE(review): this reads the live size of word_frequencies, so running this cell again
# after entries have been deleted from the dict yields a smaller value.
top_10_percent = int(len(word_frequencies) * 0.1)

In [85]:
# Rank the vocabulary by descending count and delete the first `top_10_percent` entries.
# The dictionary is modified in place: running this cell a second time removes a
# further `top_10_percent` entries from what is left.
ranked_words = sorted(word_frequencies, key=word_frequencies.get, reverse=True)
for frequent_word in ranked_words[:top_10_percent]:
    word_frequencies.pop(frequent_word)

# Ten most frequent of the remaining entries
sorted(word_frequencies.items(), key=lambda item: item[1], reverse=True)[:10]

[('profits.', 19),
 ('recovered', 19),
 ('strategist', 19),
 ('consortium', 19),
 ('avoided', 19),
 ('motor', 19),
 ('bank.', 19),
 ('hopeful', 19),
 ('christmas.', 19),
 ('attachment', 19)]

In [86]:
# Number of distinct tokens currently held in word_frequencies
len(word_frequencies)

39394

In [91]:
# Collect the entries whose count is below 5 first, then delete them, so the
# dictionary is never resized while it is being iterated
rare_words = [word for word, count in word_frequencies.items() if count < 5]
for word in rare_words:
    del word_frequencies[word]

len(word_frequencies)

8022

In [92]:
# Let's tokenize the text and only keep words that are in the word_frequencies dictionary

def tokenize_text(text, vocabulary=None):
    """Split ``text`` on whitespace and keep only the tokens found in a vocabulary.

    Parameters
    ----------
    text : str
        Document text to tokenize.
    vocabulary : container of str, optional
        Collection of allowed tokens; anything supporting ``in`` works (dict,
        set, ...). When omitted, the module-level ``word_frequencies``
        dictionary is used, which is the original behaviour.

    Returns
    -------
    list of str
        The tokens of ``text`` that are in the vocabulary, in their original
        order and with repeats preserved.
    """
    # Resolve the default at call time so the function always sees the current
    # contents of the global dictionary.
    if vocabulary is None:
        vocabulary = word_frequencies

    return [token for token in text.split() if token in vocabulary]


In [93]:
# Build the clean_text2 column by running tokenize_text over every article
df['clean_text2'] = df['text'].apply(tokenize_text)

In [94]:
# Preview the first ten rows of the frame
df.head(10)

Unnamed: 0,category,text,clean_text1,tfidf_1,clean_text2
0,tech,tv future in the hands of viewers with home th...,"[tv, future, hands, viewers, home, theatre, sy...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.030...","[plasma, tvs, radically, rooms, set-top, tivo,..."
1,business,worldcom boss left books alone former worldc...,"[worldcom, boss, left, books, worldcom, boss, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[bernie, overseeing, $11bn, witness, myers, co..."
2,sport,tigers wary of farrell gamble leicester say ...,"[tigers, wary, farrell, gamble, leicester, rus...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[tigers, wary, gamble, rushed, tigers, wells, ..."
3,sport,yeading face newcastle in fa cup premiership s...,"[yeading, face, newcastle, fa, cup, premiershi...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[arguably, slough, exeter, trafford, holders, ..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,"[ocean, s, raids, box, office, ocean, s, crime...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.061...","[twelve, raids, twelve, clooney, brad, pitt, j..."
5,politics,howard hits back at mongrel jibe michael howar...,"[howard, hits, mongrel, jibe, michael, howard,...","[0.0, 0.032974084459105844, 0.0, 0.0, 0.0, 0.0...","[mongrel, jibe, hain, mongrel, rattled, opposi..."
6,politics,blair prepares to name poll date tony blair is...,"[blair, prepares, poll, date, tony, blair, lik...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[prepares, marr, resisted, recently., frantic,..."
7,sport,henman hopes ended in dubai third seed tim hen...,"[henman, hopes, ended, dubai, seed, tim, henma...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[quarter-final, ivan, croatian, halted, rain, ..."
8,sport,wilkinson fit to face edinburgh england captai...,"[wilkinson, fit, face, edinburgh, england, cap...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[long-awaited, 25-year-old, murrayfield, bench..."
9,entertainment,last star wars not for children the sixth an...,"[star, wars, children, sixth, final, star, war...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[suitable, film-maker, lucas, revenge, violent..."
