In [1]:
import nltk
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import math
import random
import pandas as pd
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [2]:
data = pd.read_csv('BBC News Train.csv')
print(data.head())

   ArticleId                                               Text  Category
0       1833  worldcom ex-boss launches defence lawyers defe...  business
1        154  german business confidence slides german busin...  business
2       1101  bbc poll indicates economic gloom citizens in ...  business
3       1976  lifestyle  governs mobile choice  faster  bett...      tech
4        917  enron bosses in $168m payout eighteen former e...  business


In [3]:
data.shape

(1490, 3)

In [4]:
def lowercase(string):
    return string.lower()
def remove_punc(string):
    txt = re.sub(r'[^\w\s]', '', string)
    return txt
cachedStopWords = stopwords.words("english")
def remove_stopwords(data):
    words = word_tokenize(str(data))
    res = ' '.join([word for word in words if word not in cachedStopWords])
    return np.char.strip(res)
def lemmatization(data):
    lemmatizer = WordNetLemmatizer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + lemmatizer.lemmatize(w)
    return np.char.strip(new_text)
def remove_punc(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [11]:
def preprocessing(d):
    no_punc = []
    d_lower=d.lower()
    lemmatizer = WordNetLemmatizer()
    nltk_tokens = nltk.word_tokenize(d_lower)
    nltk_tokens = remove_punc(nltk_tokens)
    
    stop_words_removed = []
    for w in nltk_tokens:
        if w not in cachedStopWords:
            stop_words_removed.append(w)
    new_words = []
    for x in stop_words_removed:
        if(x.isalnum() and x!=" "):
            new_words.append(x)
    new_text = []
    for w in new_words:
        new_text.append(lemmatizer.lemmatize(w))
    return new_text

In [5]:
for i in range(len(data)):
    text = data['Text'][i]
    text = lowercase(text)
    text = remove_punc(text)
    text = remove_stopwords(text)
    text = lemmatization(text)
    data['Text'][i] = text

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Text'][i] = text


In [7]:
print(data.head())

   ArticleId                                               Text  Category
0       1833  worldcom ex bos launch defence lawyer defendin...  business
1        154  german business confidence slide german busine...  business
2       1101  bbc poll indicates economic gloom citizen majo...  business
3       1976  lifestyle governs mobile choice faster better ...      tech
4        917  enron boss 168m payout eighteen former enron d...  business


In [10]:
business_class = []
entertainment_class = []
politics_class = []
sport_class = []
tech_class  =[]

for i in range(len(data)):
  if(data['Category'][i]=="business"):
    business_class.append(data['Text'][i])
  if(data['Category'][i]==("entertainment")):
    entertainment_class.append(data['Text'][i])
  if(data['Category'][i]==("politics")):
    politics_class.append(data['Text'][i])
  if(data['Category'][i]==("sport")):
    sport_class.append(data['Text'][i])
  if(data['Category'][i]==("tech")):
    tech_class.append(data['Text'][i])


In [12]:
bcl_pre = []
ecl_pre = []
pcl_pre = []
scl_pre = []
tcl_pre = []
for f in business_class:
  bcl_pre.append(preprocessing(str(f)))
for f in entertainment_class:
  ecl_pre.append(preprocessing(str(f)))
for f in politics_class:
  pcl_pre.append(preprocessing(str(f)))
for f in sport_class:
  scl_pre.append(preprocessing(str(f)))
for f in tech_class:
  tcl_pre.append(preprocessing(str(f)))

In [13]:
all_data = []
all_data.append(bcl_pre)
all_data.append(ecl_pre)
all_data.append(pcl_pre)
all_data.append(scl_pre)
all_data.append(tcl_pre)

In [14]:
def get_tf_icf_list(train_test_lists):
    tf_list_of_dict = []
    for c in range(5):
        tf_list_of_dict.append({})
    cf_dict = {}
    icf_dict = {}
    for c in range(5):
        for doc_terms in train_test_lists[c][0]:
            for term in doc_terms:
                if(term in tf_list_of_dict[c]):
                    tf_list_of_dict[c][term]+=1
                else:
                    tf_list_of_dict[c][term]=1
    
    for tf_dict_class_c in tf_list_of_dict:
        for term in tf_dict_class_c:
            if(term in cf_dict):
                cf_dict[term]+=1
            else:
                cf_dict[term]=1
    
    for term in cf_dict:
        icf_dict[term] = math.log2(5/cf_dict[term])
    
    tf_icf_list_of_dict = []
    for c in range(5):
        tf_icf_list_of_dict.append({})
    for c in range(5):
        for term in tf_list_of_dict[c]:
            tf_icf_list_of_dict[c][term] = tf_list_of_dict[c][term]*icf_dict[term]
    
    return tf_list_of_dict, icf_dict, tf_icf_list_of_dict

In [15]:
def Q3(train_test_lists, num_of_classes):
    # tf-icf
    tf_list_of_dict, icf_dict, tf_icf_list_of_dict = get_tf_icf_list(train_test_lists)
    list_of_vocab_class_c = []
    for c in range(num_of_classes):
        vocab_class_c = {}
        for tup in tf_icf_list_of_dict[c].items():
            vocab_class_c[tup[0]]=tup[1]
        list_of_vocab_class_c.append(vocab_class_c)

    # global vocabulary
    global_vocab = {}
    for c in range(num_of_classes):
        global_vocab = global_vocab | list_of_vocab_class_c[c].keys()

    # calculate priors
    num_docs_class_c = []
    total_docs = 0
    for c in range(num_of_classes):
        total_docs+=len(train_test_lists[c][0])
        num_docs_class_c.append(len(train_test_lists[c][0]))
    prior_class_c = [x/total_docs for x in num_docs_class_c]
    
    num_terms_class_c_vocab_c = []
    for c in range(num_of_classes):
        summation=0
        for term in global_vocab:
            if(term in tf_icf_list_of_dict[c].keys()):
                summation+=tf_list_of_dict[c][term]
        num_terms_class_c_vocab_c.append(summation)
    
    # testing
    confusion_matrix = np.zeros((num_of_classes,num_of_classes))
    total_predictions = 0
    for c in range(num_of_classes):
        for test_doc_tokens in train_test_lists[c][1]:
            posterior_class_c = []
            for cc in range(num_of_classes):
                posterior_class_c.append(prior_class_c[cc])
            for term in test_doc_tokens:
                for class_c in range(num_of_classes):
                    if(term not in global_vocab):
                        continue
                    temp = 1.0
                    if(term in list_of_vocab_class_c[class_c]):
                        temp = (1 + tf_list_of_dict[class_c][term])/(len(global_vocab) + num_terms_class_c_vocab_c[class_c])
                    else:
                        temp = 1/(len(global_vocab) + num_terms_class_c_vocab_c[class_c])
                    posterior_class_c[class_c] = posterior_class_c[class_c]*temp
            predicted_class = posterior_class_c.index(max(posterior_class_c))
            total_predictions+=1
            confusion_matrix[c][predicted_class]+=1
    print(confusion_matrix)
    
    return 100*np.trace(confusion_matrix)/total_predictions,confusion_matrix

In [16]:
def train_test_split_function(train_frac, list_of_files_class_c):
    random.shuffle(list_of_files_class_c)
    train_size = int(train_frac*len(list_of_files_class_c))
    train_list_class_c = list_of_files_class_c[:train_size]
    test_list_class_c = list_of_files_class_c[train_size:]
    return train_list_class_c, test_list_class_c

In [17]:
train_fractions = [0.7, 0.6, 0.8]
for train_frac in train_fractions:
        train_test_lists = []
        for class_num in range(5):
            train_list_class_c, test_list_class_c = train_test_split_function(train_frac, all_data[class_num])
            train_test_lists.append([train_list_class_c, test_list_class_c])
        print()
        accuracy,cm = Q3(train_test_lists, 5)
        print('Accuracy for train split fraction ',train_frac, ' is ',accuracy,'%')
        precision = sum([cm[i][i] / sum(cm[:,i]) if sum(cm[:,i]) != 0 else 0 for i in range(len(cm))]) / len(cm)
        print('Precision for train split fraction ',train_frac, ' is ',precision*100,'%')
        recall = sum([cm[i][i] / sum(cm[i,:]) if sum(cm[i,:]) != 0 else 0 for i in range(len(cm))]) / len(cm)
        print('Recall for train split fraction ',train_frac, ' is ',recall*100,'%')
        f1_score = 2 * precision * recall / (precision + recall)
        print('F1_score for train split fraction ',train_frac, ' is ',f1_score*100,'%')


[[101.   0.   0.   0.   0.]
 [ 81.   1.   0.   0.   0.]
 [ 79.   0.   4.   0.   0.]
 [ 89.   0.   0.  15.   0.]
 [ 79.   0.   0.   0.   0.]]
Accuracy for train split fraction  0.7  is  26.948775055679288 %
Precision for train split fraction  0.7  is  64.70862470862471 %
Recall for train split fraction  0.7  is  24.09237324532652 %
F1_score for train split fraction  0.7  is  35.11186528512605 %

[[135.   0.   0.   0.   0.]
 [106.   4.   0.   0.   0.]
 [107.   0.   3.   0.   0.]
 [120.   0.   0.  19.   0.]
 [105.   0.   0.   0.   0.]]
Accuracy for train split fraction  0.6  is  26.87813021702838 %
Precision for train split fraction  0.6  is  64.71204188481676 %
Recall for train split fraction  0.6  is  24.006540222367562 %
F1_score for train split fraction  0.6  is  35.02112408655337 %

[[68.  0.  0.  0.  0.]
 [54.  1.  0.  0.  0.]
 [51.  0.  4.  0.  0.]
 [61.  0.  0.  9.  0.]
 [53.  0.  0.  0.  0.]]
Accuracy for train split fraction  0.8  is  27.242524916943523 %
Precision for train sp

In [20]:
from sklearn.model_selection import train_test_split
training_data, testing_data = train_test_split(data, test_size=0.3)
X_train, X_test, y_train, y_test = train_test_split(data, data['Category'], test_size=0.3, stratify=data['Category'])

In [21]:
X_train_new = []
for i in X_train['Text']:
  i = str(i)
  X_train_new.append(i)
X_test_new = []
for i in X_test['Text']:
  i = str(i)
  X_test_new.append(i)

In [22]:
y_train_new = []
for i in y_train:
  if i == 'business':
    y_train_new.append(0)
  if i == 'entertainment':
    y_train_new.append(1)
  if i == 'politics':
    y_train_new.append(2)
  if i == 'sport':
    y_train_new.append(3)
  if i == 'tech':
    y_train_new.append(4)

y_test_new = []
for i in y_test:
  if i == 'business':
    y_test_new.append(0)
  if i == 'entertainment':
    y_test_new.append(1)
  if i == 'politics':
    y_test_new.append(2)
  if i == 'sport':
    y_test_new.append(3)
  if i == 'tech':
    y_test_new.append(4)
  

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

class NGramClassifier:
    def __init__(self, ngram_range=(1, 2)):
        self.clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=ngram_range)),
            ('clf', MultinomialNB())
        ])
        
    def train(self, X_train, y_train):
        self.clf.fit(X_train, y_train)
    
    def predict(self, X_test):
        return self.clf.predict(X_test)

In [24]:
clf = NGramClassifier()
clf.train(X_train_new, y_train_new)
y_pred = clf.predict(X_test_new)
print(y_pred)

[2 1 1 2 4 3 0 2 1 4 4 2 3 1 1 4 3 2 4 3 2 2 1 4 2 0 0 3 1 4 3 2 4 3 2 0 0
 2 0 4 1 4 3 1 1 1 3 1 1 3 1 3 0 2 0 4 2 2 3 3 4 3 2 2 0 2 4 0 1 0 1 4 3 2
 2 3 1 1 3 2 4 0 3 0 0 1 3 2 4 1 3 3 0 2 4 0 1 4 0 3 3 0 0 0 3 2 1 4 0 0 0
 3 1 4 2 4 1 2 1 3 4 3 3 3 0 4 3 1 1 3 0 2 2 3 1 1 2 0 2 1 2 0 0 2 3 4 0 1
 3 1 0 0 3 1 0 2 0 4 0 1 2 3 2 4 3 0 3 4 3 2 1 1 1 0 3 2 2 0 2 0 4 4 4 2 2
 0 1 3 0 0 2 1 0 0 4 2 3 0 4 4 4 1 1 3 2 0 1 0 3 2 2 4 3 4 0 3 3 3 1 2 4 1
 3 2 3 2 0 4 3 3 4 0 4 1 2 3 3 4 4 4 4 1 2 1 1 4 1 3 4 0 4 0 1 4 2 0 2 1 0
 3 0 3 3 4 3 4 0 1 4 2 4 0 0 0 2 4 1 1 2 0 3 0 3 2 0 3 0 2 1 4 2 3 0 4 2 0
 3 2 0 2 3 3 0 2 3 1 0 3 3 1 0 4 2 3 1 2 1 3 4 3 1 3 4 0 3 0 2 0 2 1 1 0 3
 2 3 3 3 1 1 1 0 4 3 3 2 2 0 3 0 1 0 2 0 3 2 4 1 3 4 4 0 4 4 0 3 4 0 2 2 3
 3 2 1 0 4 2 4 0 0 3 1 4 2 0 4 3 3 3 2 3 4 0 0 3 0 3 0 4 1 3 4 3 3 4 1 4 3
 3 3 3 0 1 0 0 4 2 4 0 0 0 4 4 2 1 3 0 1 4 4 2 3 1 4 3 0 0 0 1 1 1 4 0 0 2
 2 4 1]


In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# assuming y_true is an array of true labels and y_pred is an array of predicted labels
accuracy = accuracy_score(y_test_new, y_pred)
precision = precision_score(y_test_new, y_pred,average='micro')
recall = recall_score(y_test_new, y_pred,average='micro')
f1 = f1_score(y_test_new, y_pred,average='micro')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.9821029082774049
Precision: 0.9821029082774049
Recall: 0.9821029082774049
F1 score: 0.9821029082774049
