In [1]:
!pip install nltk BeautifulSoup4



In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/admin/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /Users/admin/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [3]:
import numpy as np
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import warnings

warnings.filterwarnings('ignore')

In [4]:
#load data
df = pd.DataFrame(columns=['content','class'])

In [5]:
from bs4 import UnicodeDammit

In [6]:
path = 'Data/bbc'
for directory in os.listdir(path):
    directory = os.path.join(path, directory)
    if os.path.isdir(directory):
        for filename in os.listdir(directory):
            filename = os.path.join(directory, filename)
            encoding = ''
            with open(filename, 'rb') as f:
                content = f.read()
                suggestion = UnicodeDammit(content)
                encoding = suggestion.original_encoding

            with open(filename, encoding=encoding) as f:
                content = f.read()
                current_df = pd.DataFrame({'content': [content], 'class': [os.path.basename(directory)]})
                df = df.append(current_df, ignore_index=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   content  2225 non-null   object
 1   class    2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [8]:
df.head()

Unnamed: 0,content,class
0,Musicians to tackle US red tape\n\nMusicians' ...,entertainment
1,"U2's desire to be number one\n\nU2, who have w...",entertainment
2,Rocker Doherty in on-stage fight\n\nRock singe...,entertainment
3,Snicket tops US box office chart\n\nThe film a...,entertainment
4,Ocean's Twelve raids box office\n\nOcean's Twe...,entertainment


In [9]:
df.isnull().any()

content    False
class      False
dtype: bool

In [10]:
df.duplicated().sum()

98

In [11]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

## Clean data

In [12]:
# remove HTTP link
df['content'] = df['content'].replace(
    r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*', '',
    regex=True)

In [13]:
# Remove end of line characters

df['content'] = df['content'].replace(r'[\r\n]+', ' ', regex=True)

In [14]:
# Remove numbers only keep letter
df['content'] = df['content'].replace('[\w]*\d+[\w]*', '', regex=True)

In [15]:
# remove puntuation
df['content'] = df['content'].replace('[^\w\s]', '', regex=True)
punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
for char in punctuation:
    df['content'] = df['content'].replace(char, '')

In [16]:
# Remove multiple spaces 
df['content'] = df['content'].replace('[\s]{2,}', ' ', regex=True)

In [17]:
#Some lines with end spaces
df['content'] = df['content'].replace('[\s]{1,}$', '', regex=True)

In [18]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Remove stop words
def remove_stopwords(text):
    text_split = text.split()
    text = [word for word in text_split if word not in stop_words]
    return ' '.join(text)

In [19]:
df['content'] = df['content'].apply(remove_stopwords)

In [20]:
# Word net lemmatizer to get the root of word
lemmatizer = WordNetLemmatizer()

In [21]:
def get_wordnet_pos(treebank_tag):
    """
    return WORDNET POS compliance to WORDENT lemmatization (a,n,r,v)
    """
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        # As default pos in lemmatization is Noun
        return wordnet.NOUN

In [22]:
def lemmatize_text(text):
    lemmatized = []
    post_tag_list = pos_tag(word_tokenize(text))
    for word, post_tag_val in post_tag_list:
        lemmatized.append(lemmatizer.lemmatize(word, get_wordnet_pos(post_tag_val)))
    text = ' '.join(x for x in lemmatized)
    return text

In [23]:
df['content'] = df['content'].apply(lemmatize_text)

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

In [25]:
vector = CountVectorizer(max_features=5000)

In [26]:
vector.fit(df['content'])

In [27]:
X = vector.transform(df['content'])

In [28]:
df_new = pd.DataFrame(X.toarray(), columns=vector.get_feature_names())

In [29]:
pd.options.display.float_format = '{:,.2f}'.format

In [30]:
df_new.head()

Unnamed: 0,aaas,abandon,abbas,abc,ability,able,abn,abolish,abortion,about,...,youve,yuan,yugansk,yuganskneftegas,yukos,yushchenko,zealand,zero,zone,zurich
0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Use DF-TDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Create the transform
vectorizer = TfidfVectorizer (ngram_range=(1, 2), min_df=0.02)

In [36]:
# Fit the data
vectorizer.fit (df ['content' ])

In [38]:
# transform the data
X = vectorizer.transform(df['content'])

In [42]:
df_new = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())

In [43]:
df_new.head()

Unnamed: 0,ability,able,absolutely,abuse,academy,accept,access,accord,according,account,...,wrong,year,year ago,year say,year the,yearold,yet,york,you,young
0,0.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.03,0.0,0.0,...,0.0,0.03,0.0,0.0,0.04,0.0,0.0,0.0,0.04,0.0
2,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0,0.0


In [None]:
# Create a list of words for each cluster
cluster_0 = df_new[df_new['cluster_'] == 0].drop('cluster_', axis=1).sum().sort_values(ascending=False)
cluster_1 = df_new[df_new['cluster_'] == 1].drop('cluster_', axis=1).sum().sort_values(ascending=False)
cluster_2 = df_new[df_new['cluster_'] == 2].drop('cluster_', axis=1).sum().sort_values(ascending=False)
cluster_3 = df_new[df_new['cluster_'] == 3].drop('cluster_', axis=1).sum().sort_values(ascending=False)
cluster_4 = df_new[df_new['cluster_'] == 4].drop('cluster_', axis=1).sum().sort_values(ascending=False)

In [None]:
# Create a word cloud for each cluster
wordcloud_0 = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(cluster_0)
wordcloud_1 = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(cluster_1)
wordcloud_2 = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(cluster_2)
wordcloud_3 = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(cluster_3)
wordcloud_4 = WordCloud(background_color='white', width=800, height=400).generate_from_frequencies(cluster_4)

In [None]:
# Plot the word cloud
plt.figure(figsize=(20, 10))
plt.subplot(231)
plt.imshow(wordcloud_0, interpolation='bilinear')
plt.axis('off')
plt.subplot(232)
plt.imshow(wordcloud_1, interpolation='bilinear')
plt.axis('off')
plt.subplot(233)
plt.imshow(wordcloud_2, interpolation='bilinear')
plt.axis('off')
plt.subplot(234)
plt.imshow(wordcloud_3, interpolation='bilinear')
plt.axis('off')
plt.subplot(235)
plt.imshow(wordcloud_4, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# Draw a dendrogram
Z = linkage(df_new, 'ward')
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('sample index')
plt.ylabel('distance')
dendrogram(Z, leaf_rotation=90., leaf_font_size=8.)
plt.show()