In [10]:
import pandas as pd
import string, re
import numpy as np

In [11]:
# Read in the data
df = pd.read_csv('../Data/BBC-articles.csv')

In [5]:
df.head(10)

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...


In [18]:
# lets use gensim to extract the most significant words from each article, lets use TF-IDF and LSID to extract
# the most significant words from each article
from gensim import corpora, models, similarities
from gensim.models import LsiModel
from gensim.models import TfidfModel

In [19]:
def clean_text(text):
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # remove numbers
    text = re.sub(r'\d+', '', text)
    # remove whitespace
    text = text.strip()
    # lowercase
    text = text.lower()
    return text

# clean the text
df['clean_text'] = df['text'].apply(clean_text)

In [20]:
df.head(10)

Unnamed: 0,category,text,clean_text
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...
5,politics,howard hits back at mongrel jibe michael howar...,howard hits back at mongrel jibe michael howar...
6,politics,blair prepares to name poll date tony blair is...,blair prepares to name poll date tony blair is...
7,sport,henman hopes ended in dubai third seed tim hen...,henman hopes ended in dubai third seed tim hen...
8,sport,wilkinson fit to face edinburgh england captai...,wilkinson fit to face edinburgh england captai...
9,entertainment,last star wars not for children the sixth an...,last star wars not for children the sixth an...


In [24]:
# lets use tf-idf to extract the most significant words from each article
# create a dictionary of words
dictionary = corpora.Dictionary(df['clean_text'].str.split())
# create a corpus of words
corpus = [dictionary.doc2bow(text.split()) for text in df['clean_text']]
# create a tf-idf model
tfidf = TfidfModel(corpus)
# create a corpus of tf-idf weights
corpus_tfidf = tfidf[corpus]
# create a tf-idf model
lsi = LsiModel(corpus_tfidf, id2word=dictionary)
# create a corpus of lsi weights
corpus_lsi = lsi[corpus_tfidf]


In [25]:
# lets create a function to extract the most significant words from each article
def get_significant_words(corpus, dictionary, num_words=10):
    # create a list of the most significant words
    significant_words = []
    # for each article
    for article in corpus:
        # sort the words by their weight
        article = sorted(article, key=lambda x: x[1], reverse=True)
        # extract the most significant words
        article = [dictionary[word[0]] for word in article[:num_words]]
        # add the words to the list
        significant_words.append(article)
    return significant_words

In [26]:
# extract the most significant words from each article
significant_words = get_significant_words(corpus_lsi, dictionary)

# get the most significant words for each article
df['significant_words'] = significant_words

In [27]:
df.head(10)

Unnamed: 0,category,text,clean_text,significant_words
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[a, broadband, according, about, brand, bigges..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[as, bbc, be, are, because, available, a, bigg..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[a, here, abiding, kids, increasing, bbc, mr, ..."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[digital, concern, diapers, end, communication..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[lcd, adam, months, biggest, a, of, its, model..."
5,politics,howard hits back at mongrel jibe michael howar...,howard hits back at mongrel jibe michael howar...,"[a, being, according, business, hours, how, al..."
6,politics,blair prepares to name poll date tony blair is...,blair prepares to name poll date tony blair is...,"[a, discuss, builtin, happens, everyone, inste..."
7,sport,henman hopes ended in dubai third seed tim hen...,henman hopes ended in dubai third seed tim hen...,"[europe, forget, a, everywhere, abiding, discu..."
8,sport,wilkinson fit to face edinburgh england captai...,wilkinson fit to face edinburgh england captai...,"[abiding, a, bbc, no, broadband, available, ge..."
9,entertainment,last star wars not for children the sixth an...,last star wars not for children the sixth an...,"[a, adam, means, jolna, abiding, leading, beca..."


In [28]:
# lets remove significant words that are common across all articles
# create a list of all significant words
significant_words = [word for article in df['significant_words'] for word in article]
# create a dataframe of the significant words
significant_words = pd.DataFrame(significant_words, columns=['significant_words'])

# get the count of each word
significant_words = significant_words.groupby('significant_words').size().reset_index(name='counts')

# get the most common words
significant_words = significant_words.sort_values('counts', ascending=False).head(10)

# get the most common words
significant_words = significant_words['significant_words'].tolist()

# remove the most common words
df['significant_words'] = df['significant_words'].apply(lambda x: [word for word in x if word not in significant_words])

In [30]:
df.head(10)

Unnamed: 0,category,text,clean_text,significant_words
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[broadband, brand, biggest, broadcast, and, co..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[as, bbc, be, because, available, biggest, dis..."
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[here, kids, increasing, bbc, mr, have, networ..."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[digital, concern, diapers, end, communication..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[lcd, months, biggest, of, its, models, is, li..."
5,politics,howard hits back at mongrel jibe michael howar...,howard hits back at mongrel jibe michael howar...,"[being, business, hours, how, already, home, n..."
6,politics,blair prepares to name poll date tony blair is...,blair prepares to name poll date tony blair is...,"[discuss, builtin, happens, everyone, instead,..."
7,sport,henman hopes ended in dubai third seed tim hen...,henman hopes ended in dubai third seed tim hen...,"[europe, forget, everywhere, discuss, future, ..."
8,sport,wilkinson fit to face edinburgh england captai...,wilkinson fit to face edinburgh england captai...,"[bbc, no, broadband, available, getting, jolna..."
9,entertainment,last star wars not for children the sixth an...,last star wars not for children the sixth an...,"[means, jolna, leading, because, being, do]"


In [31]:
# lets remove words shorter than 4 characters
df['significant_words'] = df['significant_words'].apply(lambda x: [word for word in x if len(word) > 3])

df.head(10)

Unnamed: 0,category,text,clean_text,significant_words
0,tech,tv future in the hands of viewers with home th...,tv future in the hands of viewers with home th...,"[broadband, brand, biggest, broadcast, comfort..."
1,business,worldcom boss left books alone former worldc...,worldcom boss left books alone former worldc...,"[because, available, biggest, display, being]"
2,sport,tigers wary of farrell gamble leicester say ...,tigers wary of farrell gamble leicester say ...,"[here, kids, increasing, have, network, availa..."
3,sport,yeading face newcastle in fa cup premiership s...,yeading face newcastle in fa cup premiership s...,"[digital, concern, diapers, communications, co..."
4,entertainment,ocean s twelve raids box office ocean s twelve...,ocean s twelve raids box office ocean s twelve...,"[months, biggest, models, likes]"
5,politics,howard hits back at mongrel jibe michael howar...,howard hits back at mongrel jibe michael howar...,"[being, business, hours, already, home, networ..."
6,politics,blair prepares to name poll date tony blair is...,blair prepares to name poll date tony blair is...,"[discuss, builtin, happens, everyone, instead,..."
7,sport,henman hopes ended in dubai third seed tim hen...,henman hopes ended in dubai third seed tim hen...,"[europe, forget, everywhere, discuss, future, ..."
8,sport,wilkinson fit to face edinburgh england captai...,wilkinson fit to face edinburgh england captai...,"[broadband, available, getting, jolna, built]"
9,entertainment,last star wars not for children the sixth an...,last star wars not for children the sixth an...,"[means, jolna, leading, because, being]"
