In [190]:
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import numpy as np
from sklearn.metrics import classification_report


In [171]:
df = pd.read_csv('bbc_data.csv')

In [172]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   data    2225 non-null   object
 1   labels  2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


In [173]:
df

Unnamed: 0,data,labels
0,Musicians to tackle US red tape Musicians gro...,entertainment
1,"U2s desire to be number one U2, who have won ...",entertainment
2,Rocker Doherty in on-stage fight Rock singer ...,entertainment
3,Snicket tops US box office chart The film ada...,entertainment
4,"Oceans Twelve raids box office Oceans Twelve,...",entertainment
...,...,...
2220,Warning over Windows Word files Writing a Mic...,tech
2221,Fast lifts rise into record books Two high-sp...,tech
2222,Nintendo adds media playing to DS Nintendo is...,tech
2223,Fast moving phone viruses appear Security fir...,tech


In [174]:
y = df['labels']
X = df['data']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Training Phase

Calculating the prior probability 

In [176]:
classes = y_train.unique()
y_train_df = pd.DataFrame(y_train)
counts = {}
for label in classes :
    counts[label] = len(y_train_df[y_train_df['labels'] == label])

print(counts)

{'politics': 337, 'tech': 321, 'business': 407, 'entertainment': 302, 'sport': 413}


In [177]:
class_probability = {}
total_nbr = sum(list(counts.values()))
for label in classes:
    class_probability[label] = counts[label] / total_nbr
print(class_probability)

{'politics': 0.18932584269662922, 'tech': 0.18033707865168538, 'business': 0.22865168539325842, 'entertainment': 0.1696629213483146, 'sport': 0.23202247191011235}


Calculating likelihoods given a class (Laplace add-1 included). Note that they are calculated on the training Dataset 

Regrouping text per class

In [178]:
text_per_class = {}
df_train = pd.concat([X_train,y_train],axis = 1)
for label in classes :
    text_per_class[label] = ''
for label in classes :
    text_class = df_train[df_train['labels']== label]
    texts = text_class['data']
    for text in texts:
        text_per_class[label] += str(text)




Remove punctuation and apply case folding 

In [179]:
for label in classes:
    
    text = text_per_class[label]
    text = re.sub(r'[^\w\s]',' ',text)
    text_per_class[label] = text.lower()
  

Tokens per class, frequences per class and probability per class.

In [180]:
# nltk.download('punkt')
tokens_per_class = {}
for label in classes :
    text = text_per_class[label]
    tokenized = word_tokenize(text)
    tokens_per_class[label] = tokenized


In [181]:
frequences_per_class = {label: {} for label in classes}

for label in classes:
    tokens = tokens_per_class[label]  
    frequences_per_class[label] = dict(Counter(tokens))  #
vocabulary = set()
for tokens in tokens_per_class.values():
    vocabulary.update(tokens)  

for label in classes:
    for token in vocabulary:
        if token in frequences_per_class[label]:
            frequences_per_class[label][token] += 1  
        else:
            frequences_per_class[label][token] = 1 



In [182]:
probability_per_class = {label: {} for label in classes}

total_frequencies_per_class = {label: sum(frequences_per_class[label].values()) for label in classes}

for label in classes:
    for token in frequences_per_class[label]:
        probability_per_class[label][token] = (np.log(
            frequences_per_class[label][token] / (total_frequencies_per_class[label] + len(vocabulary))
        ))



### Testing phase

Test set: Tokenization and unknown words removal

In [183]:
def filter_words_with_tokenizer(text):
    tokens = word_tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word in vocabulary]
    return filtered_tokens  

X_test = X_test.apply(filter_words_with_tokenizer)

print(X_test)

414     [china, now, top, trader, with, japan, china, ...
420     [bush, budget, seeks, deep, cutbacks, presiden...
1644    [mps, murder, sentence, concern, murder, sente...
416     [ge, sees, excellent, world, economy, us, behe...
1232    [rush, future, at, chester, uncertain, ian, fu...
                              ...                        
741     [japanese, banking, battle, at, an, end, japan...
205     [disputed, nirvana, box, set, on, sale, a, box...
1102    [isinbayeva, claims, new, world, best, pole, v...
668     [delta, cuts, fares, in, survival, plan, delta...
479     [water, firm, suez, in, argentina, row, a, con...
Name: data, Length: 445, dtype: object


In [184]:
df_test = pd.concat([X_test,y_test],axis = 1)

In [185]:
print(df_test)

                                                   data         labels
414   [china, now, top, trader, with, japan, china, ...       business
420   [bush, budget, seeks, deep, cutbacks, presiden...       business
1644  [mps, murder, sentence, concern, murder, sente...       politics
416   [ge, sees, excellent, world, economy, us, behe...       business
1232  [rush, future, at, chester, uncertain, ian, fu...          sport
...                                                 ...            ...
741   [japanese, banking, battle, at, an, end, japan...       business
205   [disputed, nirvana, box, set, on, sale, a, box...  entertainment
1102  [isinbayeva, claims, new, world, best, pole, v...          sport
668   [delta, cuts, fares, in, survival, plan, delta...       business
479   [water, firm, suez, in, argentina, row, a, con...       business

[445 rows x 2 columns]


In [186]:
predicted_class = {'predicted_class': []}

for tokens in X_test:
    probaclass = {label: 0 for label in classes}
    
    for label in classes:
        s = 0
        for token in tokens:
            s += probability_per_class[label].get(token, 0)
        probaclass[label] = s  
    
    predicted_label = max(probaclass, key=probaclass.get)
    predicted_class['predicted_class'].append(predicted_label)
predicted_class = pd.DataFrame(predicted_class)
print(predicted_class)


    predicted_class
0          business
1          business
2          politics
3          business
4             sport
..              ...
440        business
441   entertainment
442           sport
443        business
444        business

[445 rows x 1 columns]


In [187]:
predicted_class_df = pd.DataFrame(predicted_class)

predicted_class_df.reset_index(drop=True, inplace=True)
df_test.reset_index(drop=True, inplace=True)

final_test_set = pd.concat([predicted_class_df, df_test], axis=1)

print(final_test_set)


    predicted_class                                               data  \
0          business  [china, now, top, trader, with, japan, china, ...   
1          business  [bush, budget, seeks, deep, cutbacks, presiden...   
2          politics  [mps, murder, sentence, concern, murder, sente...   
3          business  [ge, sees, excellent, world, economy, us, behe...   
4             sport  [rush, future, at, chester, uncertain, ian, fu...   
..              ...                                                ...   
440        business  [japanese, banking, battle, at, an, end, japan...   
441   entertainment  [disputed, nirvana, box, set, on, sale, a, box...   
442           sport  [isinbayeva, claims, new, world, best, pole, v...   
443        business  [delta, cuts, fares, in, survival, plan, delta...   
444        business  [water, firm, suez, in, argentina, row, a, con...   

            labels  
0         business  
1         business  
2         politics  
3         business  
4     

### Evaluation

In [189]:
print(classification_report(final_test_set['labels'], final_test_set['predicted_class']))


               precision    recall  f1-score   support

     business       0.98      0.94      0.96       103
entertainment       1.00      0.95      0.98        84
     politics       0.93      0.99      0.96        80
        sport       1.00      0.99      0.99        98
         tech       0.95      1.00      0.98        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

