In [None]:
import operator
import os
import random
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import sklearn
import sklearn.svm
from google.colab import drive
from nltk.corpus import stopwords
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import train_test_split
# Fetch the NLTK resources this notebook relies on (each call is a network download).
nltk.download('stopwords')  # English stop-word list, read later via nltk.corpus.stopwords.words('english')
nltk.download('punkt')  # presumably the data behind sent_tokenize / word_tokenize -- confirm for the installed NLTK version
nltk.download('wordnet')  # presumably the WordNet data required by nltk.stem.WordNetLemmatizer -- confirm

In [None]:
# Mount Google Drive so the dataset folder is reachable from this runtime.
drive.mount('/content/drive')

# Category labels; the loading cell also uses each label as a sub-directory name.
category = [
    'business',
    'entertainment',
    'politics',
    'sport',
    'tech',
]

# Location of the dataset on the mounted drive.
dataset_path = '/content/drive/MyDrive/bbc/'

In [None]:
# Build the dataset directory from the absolute mount point passed to drive.mount
# ('/content/drive') instead of os.getcwd(): the previous form only resolved
# correctly while the working directory happened to be '/content'.
dataset_path = os.path.join('/content/drive', 'MyDrive', 'bbc')

# List the entries of the dataset folder as a quick sanity check of the mount.
list_FilesName = os.listdir(dataset_path)
print(list_FilesName)

In [4]:
# Read every article into memory as [text, category-label] rows.
root_data = []
for c in category:
    category_dir = os.path.join(dataset_path, c)
    # os.listdir returns entries in arbitrary order; sorting fixes the row order so
    # that the seeded train/dev/test split is reproducible across machines and runs.
    for file_name in sorted(os.listdir(category_dir)):
        # `with` guarantees the handle is closed even if read() raises.
        # The encoding is explicit so decoding does not depend on the runtime locale;
        # undecodable bytes are replaced rather than aborting the whole load.
        with open(os.path.join(category_dir, file_name), "r",
                  encoding="utf-8", errors="replace") as article_file:
            root_data.append([article_file.read(), c])

# One row per article: column "content" holds the text, "category" the label.
All_data = pd.DataFrame(root_data, columns=["content", "category"])

# Features (article text) and targets (category label) as separate Series.
x_data = All_data.iloc[:, 0]
y_data = All_data.iloc[:, -1]

In [5]:
# Step 1: keep 80% of the corpus for training and set 20% aside as a held-out pool.
x_train, x_holdout, y_train, y_holdout = train_test_split(
    x_data, y_data, test_size=0.2, random_state=1
)

# Step 2: cut the held-out pool in half -- one half is the dev set, the other the test set.
x_dev, x_test, y_dev, y_test = train_test_split(
    x_holdout, y_holdout, test_size=0.5, random_state=1
)

In [6]:
# Punctuation-like tokens that are filtered out together with the English stop words.
extra_stop_tokens = [
    ".", ",", "-", "``", "/", "(", ")", "{", "}", "@", "|",
    ";", "\n", "#", "+", "_", "''", ":", "%", "$", "&", "'",
]

# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the top of the
# notebook) to a plain set, which is why the corpus is reached through nltk.corpus here.
stopwords = set(nltk.corpus.stopwords.words('english')).union(extra_stop_tokens)

# Shared lemmatizer instance used by the feature-extraction cells below.
lemmatizer = nltk.stem.WordNetLemmatizer()

In [7]:
# Count, over the training articles, how often each token occurs in the first sentence
# returned by sent_tokenize. Tokens are lemmatized, then lower-cased, and anything found
# in `stopwords` is ignored.
title_words_fre = {}
for article in x_train:
    sentences = nltk.tokenize.sent_tokenize(article)
    if not sentences:
        # No sentence was produced (e.g. an empty article): skip it rather than
        # failing with IndexError on sentences[0].
        continue
    for token in nltk.tokenize.word_tokenize(sentences[0]):
        word = lemmatizer.lemmatize(token).lower()
        if word in stopwords:
            continue
        title_words_fre[word] = title_words_fre.get(word, 0) + 1

# (word, count) pairs, most frequent first; sorted() is stable, so words with equal
# counts stay in first-seen order.
sorted_list = sorted(title_words_fre.items(), key=operator.itemgetter(1), reverse=True)

In [None]:
# Use the dev set to find an appropriate vocabulary size: start with the 20 most
# frequent words of `sorted_list` and add 20 more per round until the linear SVM reaches
# the target accuracy on the dev set (or the candidate word list is exhausted).

def _lemmatized_token_counts(article):
    """Return a Counter of the lemmatized, lower-cased tokens of one article."""
    return Counter(
        lemmatizer.lemmatize(token).lower()
        for sent in nltk.tokenize.sent_tokenize(article)
        for token in nltk.tokenize.word_tokenize(sent)
    )


def _count_matrix(token_counts, vocab):
    """Return an array with one row per article and one column per vocabulary word.

    Each cell holds how many times that word occurs in that article; a Counter
    yields 0 for words the article does not contain.
    """
    matrix = np.zeros((len(token_counts), len(vocab)))
    for row, counts in enumerate(token_counts):
        for col, word in enumerate(vocab):
            matrix[row, col] = counts[word]
    return matrix


# Tokenizing and lemmatizing does not depend on the vocabulary size, so do it once
# up front instead of repeating it for the whole corpus on every round.
train_counts = [_lemmatized_token_counts(x_train[index]) for index in x_train.index]
train_labels = np.asarray([y_train[index] for index in x_train.index])
dev_counts = [_lemmatized_token_counts(x_dev[index]) for index in x_dev.index]
dev_labels = np.asarray([y_dev[index] for index in x_dev.index])

target_accuracy = 0.9
size_step = 20

cur_size = 20
vocabulary = []
accuracy = 0
svm_clf_category = sklearn.svm.SVC(kernel="linear", gamma="auto")
while accuracy < target_accuracy:
    # generate vocabulary from the `cur_size` most frequent words
    vocabulary = [word for word, _ in sorted_list[:cur_size]]

    # fit on the training set, score on the dev set
    svm_clf_category.fit(_count_matrix(train_counts, vocabulary), train_labels)
    prediction = svm_clf_category.predict(_count_matrix(dev_counts, vocabulary))
    accuracy = accuracy_score(dev_labels, prediction)
    print("cur_size: " + str(cur_size) + ", accuracy: " + str(accuracy))

    # Once every candidate word is already in the vocabulary, a larger cur_size cannot
    # change the features any more; without this stop the loop would spin forever
    # whenever the target accuracy is out of reach.
    vocabulary_exhausted = cur_size >= len(sorted_list)

    # increase current size for better accuracy
    cur_size += size_step

    if vocabulary_exhausted and accuracy < target_accuracy:
        print("vocabulary exhausted at " + str(len(vocabulary))
              + " words without reaching accuracy " + str(target_accuracy))
        break

In [None]:
# Final evaluation on the test set, using the `vocabulary` and the fitted
# `svm_clf_category` produced by the dev-set search.
x = []
y = []
for index, article in x_test.items():
    # Lemmatized, lower-cased tokens of the whole article.
    words = [
        lemmatizer.lemmatize(token).lower()
        for sent in nltk.tokenize.sent_tokenize(article)
        for token in nltk.tokenize.word_tokenize(sent)
    ]
    # One count per vocabulary word; list.count gives 0 for absent words.
    vector = np.zeros(len(vocabulary))
    for i, word in enumerate(vocabulary):
        vector[i] = words.count(word)
    x.append(vector)
    y.append(y_test[index])

prediction = svm_clf_category.predict(np.asarray(x))
y = np.asarray(y)

# Macro-averaged precision, recall and F1 (in that order), followed by plain accuracy.
for macro_metric in (precision_score, recall_score, f1_score):
    print(macro_metric(y, prediction, average='macro'))
print(accuracy_score(y, prediction))