In [7]:
#importing all the necessary library functions
import numpy as np
import nltk
import sklearn
import operator
import requests
import re
import unicodedata
import pandas as pd
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [1]:
#do not run this cell if you are not using the dataset from google drive
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
#reading the files names present in the folder
#please give the correct path of each folder
#geting all the filenames from the appropriate folder
import glob
import os
os.chdir(r'drive/My Drive/AML/ex2/business')
Business_file_names = glob.glob('*.txt')#getting the file names of all files present in the business folder
#once we use the "chdir" the current directory is changed to it (business folder in our case as we used it first in the above cell)
#so we use ".." to go back once 

In [3]:
os.chdir(r'../entertainment')#getting the file names of all files present in the entertainment folder
entertainment_file_names = glob.glob('*.txt')
os.chdir(r'../politics')#getting the file names of all files present in the politics folder
politics_file_names = glob.glob('*.txt')
os.chdir(r'../sport')#getting the file names of all files present in the sport folder
sports_file_names = glob.glob('*.txt')
os.chdir(r'../tech')#getting the file names of all files present in the tech folder
tech_file_names = glob.glob('*.txt')


In [4]:
#creating variables to store the respective text data present in the folder
#if you read the files from any other source then make sure the name_txt variables below contain all the content of all txt files in a folder
#e.g: the variable business_txt should contain all the content present in all txt files from business folder
business_txt=""
entertainment_txt=""
politics_txt=""
sport_txt=""
tech_txt=""

In [5]:
#reading text from all the files in business folder.
for name in Business_file_names:
  f=open("/content/drive/My Drive/AML/ex2/business/"+name,"r")#give the path for business folder
  business_txt=business_txt+"\n"+f.read()
  f.close()
print("completed reading from business folder")
#reading text from all the files in entertainment folder.
for name in entertainment_file_names:
  f=open("/content/drive/My Drive/AML/ex2/entertainment/"+name,"r")#give the path for entertainment folder
  entertainment_txt=entertainment_txt+"\n"+f.read()
  f.close()
print("completed reading from entertainment folder")
#reading text from all the files in politics folder.
for name in politics_file_names:
  f=open("/content/drive/My Drive/AML/ex2/politics/"+name,"r")#give the path for politics folder
  politics_txt=politics_txt+"\n"+f.read()
  f.close()
print("completed reading from politics folder")
#reading text from all the files in sport folder.
for name in sports_file_names:
  f=open("/content/drive/My Drive/AML/ex2/sport/"+name,"r")#give the path for politics folder
  try: #getting an exception due "£" symbol so using try catch
    sport_txt=sport_txt+"\n"+f.read()
  except UnicodeDecodeError:
    pass
  f.close()
print("completed reading from sport folder")
#reading text from all the files in tech folder.
for name in tech_file_names:
  f=open("/content/drive/My Drive/AML/ex2/tech/"+name,"r")#give the path for tech folder
  tech_txt=tech_txt+"\n"+f.read()
  f.close()
print("completed reading from tech folder")

completed reading from business folder
completed reading from entertainment folder
completed reading from politics folder
completed reading from sport folder
completed reading from tech folder


In [8]:
#stopwords are used to remove irrelevant words from the list
# First, we get the stopwords list from nltk
stopwords=set(nltk.corpus.stopwords.words('english'))
# We can add more words to the stopword list, like punctuation marks
stopwords.add(".")
stopwords.add(",")
stopwords.add("--")
stopwords.add("``")
stopwords.add(" ")
stopwords.add("'.'")

In [9]:
#creating a function to tokenize the words
#reference from "https://towardsdatascience.com/from-dataframe-to-n-grams-e34e29df3460"
def tokenwords(text):
  wnl = nltk.stem.WordNetLemmatizer()
  words = re.sub(r'[^\w\s]', '', text).split()
  return [wnl.lemmatize(word) for word in words if word not in stopwords]#we are removing all the stopwords before tokenizing


In [10]:
words = tokenwords(business_txt)#tokenizing the words from business_txt
business_words = pd.Series(nltk.ngrams(words, 1))# adding unigrams to business words
business_words = business_words.append(pd.Series(nltk.ngrams(words, 2)))#adding bigrams to business words

In [11]:
#tokenizing and adding unigrams and bigrams for entertainment_txt
words = tokenwords(entertainment_txt)
entertainment_words = pd.Series(nltk.ngrams(words, 1))
entertainment_words = entertainment_words.append(pd.Series(nltk.ngrams(words, 2)))

In [12]:
#tokenizing and adding unigrams and bigrams for politics_txt
words = tokenwords(politics_txt)
politics_words = pd.Series(nltk.ngrams(words, 1))
politics_words = politics_words.append(pd.Series(nltk.ngrams(words, 2)))

In [13]:
#tokenizing and adding unigrams and bigrams for sport_txt
words = tokenwords(sport_txt)
sport_words = pd.Series(nltk.ngrams(words, 1))
sport_words = sport_words.append(pd.Series(nltk.ngrams(words, 2)))

In [14]:
#tokenizing and adding unigrams and bigrams for tech_txt
words = tokenwords(tech_txt)
tech_words = pd.Series(nltk.ngrams(words, 1))
tech_words = tech_words.append(pd.Series(nltk.ngrams(words, 2)))

In [16]:
#creating a list that only holds nouns, pronouns, adverbs, preposition, adjective, conjuction, possive endings, interjection, foreign word(pos)
business_words_pos=[]
entertainment_words_pos=[]
politics_words_pos=[]
sport_words_pos=[]
tech_words_pos=[]

In [17]:
#reference from https://www.guru99.com/pos-tagging-chunking-nltk.html
#{NN, NNS} = noun, {NNP,NNPS}= proper noun, {VB,VBG,VBD,VBN,VBP,VBZ} = verb, {RB,RBR,RBS}=adverb ,PRP = pronoun PRP$= Possive pronoun, POS = possive ending, IN=preposition
#{JJ, JJR, JJS} = adjective, CC = conjunction, FW= foreign word, UH=interjection, 
for word in business_words:
  if nltk.pos_tag(word)[0][1] == 'NN' or 'NNS' or 'VB' or 'VBG' or 'VBD' or 'VBN' or 'VBP' or 'VBZ' or 'RB' or 'RBR' or 'RBS' or 'PRP' or 'PRP$' or 'POS' or 'JJ' or 'JJR' or 'JJS' or 'CC' or 'FW' or 'UH':
    business_words_pos.append(word)

In [18]:
for word in entertainment_words:
  if nltk.pos_tag(word)[0][1] == 'NN' or 'NNS' or 'VB' or 'VBG' or 'VBD' or 'VBN' or 'VBP' or 'VBZ' or 'RB' or 'RBR' or 'RBS' or 'PRP' or 'PRP$' or 'POS' or 'JJ' or 'JJR' or 'JJS' or 'CC' or 'FW' or 'UH':
    entertainment_words_pos.append(word)
for word in politics_words:
  if nltk.pos_tag(word)[0][1] == 'NN' or 'NNS' or 'VB' or 'VBG' or 'VBD' or 'VBN' or 'VBP' or 'VBZ' or 'RB' or 'RBR' or 'RBS' or 'PRP' or 'PRP$' or 'POS' or 'JJ' or 'JJR' or 'JJS' or 'CC' or 'FW' or 'UH':
    politics_words_pos.append(word)
for word in sport_words:
  if nltk.pos_tag(word)[0][1] == 'NN' or 'NNS' or 'VB' or 'VBG' or 'VBD' or 'VBN' or 'VBP' or 'VBZ' or 'RB' or 'RBR' or 'RBS' or 'PRP' or 'PRP$' or 'POS' or 'JJ' or 'JJR' or 'JJS' or 'CC' or 'FW' or 'UH':
    sport_words_pos.append(word)
for word in tech_words:
  if nltk.pos_tag(word)[0][1] == 'NN' or 'NNS' or 'VB' or 'VBG' or 'VBD' or 'VBN' or 'VBP' or 'VBZ' or 'RB' or 'RBR' or 'RBS' or 'PRP' or 'PRP$' or 'POS' or 'JJ' or 'JJR' or 'JJS' or 'CC' or 'FW' or 'UH':
    tech_words_pos.append(word)

In [19]:
#cleaning the tokens
#now our dataset contains values like this "('word',)" and we need to be like "word"
temp=[]
a=[]
b=[]
c=[]
for word in business_words_pos:
  temp.append(str(word).replace("', '"," "))
for word in temp:
  a.append(str(word).replace("'",""))
for word in a:
  b.append(str(word).replace("(",""))
for word in b:
  c.append(str(word).replace(")",""))
business_words_pos=[]
for word in c:
  business_words_pos.append(str(word).replace(",",""))
#--------------------------------------------------------------------------------------------
temp=[]
a=[]
b=[]
c=[]
for word in entertainment_words_pos:
  temp.append(str(word).replace("', '"," "))
for word in temp:
  a.append(str(word).replace("'",""))
for word in a:
  b.append(str(word).replace("(",""))
for word in b:
  c.append(str(word).replace(")",""))
entertainment_words_pos=[]
for word in c:
  entertainment_words_pos.append(str(word).replace(",",""))
#-----------------------------------------------------------------
temp=[]
a=[]
b=[]
c=[]
for word in politics_words_pos:
  temp.append(str(word).replace("', '"," "))
for word in temp:
  a.append(str(word).replace("'",""))
for word in a:
  b.append(str(word).replace("(",""))
for word in b:
  c.append(str(word).replace(")",""))
politics_words_pos=[]
for word in c:
  politics_words_pos.append(str(word).replace(",",""))
#---------------------------------------------------------------
temp=[]
a=[]
b=[]
c=[]
for word in sport_words_pos:
  temp.append(str(word).replace("', '"," "))
for word in temp:
  a.append(str(word).replace("'",""))
for word in a:
  b.append(str(word).replace("(",""))
for word in b:
  c.append(str(word).replace(")",""))
sport_words_pos=[]
for word in c:
  sport_words_pos.append(str(word).replace(",",""))
#----------------------------------------------------------------
temp=[]
a=[]
b=[]
c=[]
for word in tech_words_pos:
  temp.append(str(word).replace("', '"," "))
for word in temp:
  a.append(str(word).replace("'",""))
for word in a:
  b.append(str(word).replace("(",""))
for word in b:
  c.append(str(word).replace(")",""))
tech_words_pos=[]
for word in c:
  tech_words_pos.append(str(word).replace(",",""))

In [20]:
# Now we create a frequency dictionary with all words in the dataset
dict_word_frequency={}

for word in business_words_pos:
  #if word in stopwords: continue #since we already removed the stopwords we do not need this
  if word not in dict_word_frequency: dict_word_frequency[word]=1
  else: dict_word_frequency[word]+=1

for word in entertainment_words_pos:
  #if word in stopwords: continue
  if word not in dict_word_frequency: dict_word_frequency[word]=1
  else: dict_word_frequency[word]+=1

for word in politics_words_pos:
  #if word in stopwords: continue
  if word not in dict_word_frequency: dict_word_frequency[word]=1
  else: dict_word_frequency[word]+=1

for word in sport_words_pos:
  #if word in stopwords: continue
  if word not in dict_word_frequency: dict_word_frequency[word]=1
  else: dict_word_frequency[word]+=1

for word in tech_words_pos:
  #if word in stopwords: continue
  if word not in dict_word_frequency: dict_word_frequency[word]=1
  else: dict_word_frequency[word]+=1

In [21]:
# Now we create a sorted frequency list with the top 2000 words, using the function "sorted". Let's see the 15 most frequent words
#only using 2000 words to make the code run faster. give more words if in a high spec pc
sorted_list = sorted(dict_word_frequency.items(), key=operator.itemgetter(1), reverse=True)[:2000] #no.of words
i=0
for word,frequency in sorted_list[:15]:
  i+=1
  print (str(i)+". "+str(word)+" - "+str(frequency))
print(len(sorted_list))
# Finally, we create our vocabulary based on the sorted frequency list 
vocabulary=[]
for word,frequency in sorted_list:
  vocabulary.append(word)

1. The - 8012
2. said - 7243
3. I - 3251
4. Mr - 3004
5. year - 2763
6. would - 2571
7. also - 2102
8. people - 1923
9. But - 1787
10. It - 1639
11. US - 1568
12. He - 1556
13. one - 1535
14. new - 1510
15. could - 1504
2000


In [22]:
#function to transform sentences to vectors
def get_vector_text(list_vocab,string):
  vector_text=np.zeros(len(list_vocab))
  list_tokens_string=tokenwords(string)
  for i, word in enumerate(list_vocab):
    if word in list_tokens_string:
      vector_text[i]=list_tokens_string.count(word)
  return vector_text

In [23]:
business_txt=business_txt.split("\n")
entertainment_txt=entertainment_txt.split("\n")
politics_txt=politics_txt.split("\n")
sport_txt=sport_txt.split("\n")
tech_txt=tech_txt.split("\n")

In [24]:
#creating x and y training sets, x contains the vectors and y contains the labels
#the vocabulary is made using the features "frequency", "n-grams(unigram and bigram)" and Parts of speech(pos)
X_train=[]
Y_train=[]
for business_words in business_txt:
  business_review=get_vector_text(vocabulary,business_words)
  X_train.append(business_review)
  Y_train.append(0)

for entertainment_words in entertainment_txt:
  entertainment_review=get_vector_text(vocabulary,entertainment_words)
  X_train.append(entertainment_review)
  Y_train.append(1)

for politics_words in politics_txt:
  politics_review=get_vector_text(vocabulary,politics_words)
  X_train.append(politics_review)
  Y_train.append(2)

for sport_words in sport_txt:
  sport_review=get_vector_text(vocabulary,sport_words)
  X_train.append(sport_review)
  Y_train.append(3)

for tech_words in tech_txt:
  tech_review=get_vector_text(vocabulary,tech_words)
  X_train.append(tech_review)
  Y_train.append(4)

In [25]:
#importing chi2 and selecting 100 best features
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

In [43]:
x_train_array = np.asarray(X_train)
Y = np.asarray(Y_train)

In [37]:
#only selecting 100 best features to make the code run faster, the result achieved might be lower than usual
# to get a better result increase the value of k below
analysis=SelectKBest(chi2, k=100).fit(x_train_array,Y)
X = analysis.transform(x_train_array)

In [38]:
#using test train split to split the data into 80% training set and 20%testing set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y, train_size = 0.8, random_state = 1)

In [39]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [40]:
#using svc classfire and crossvalidation
#might take a long time depending on the pc specs
#took about 3hrs 30 min in google colab for 2000 features
from sklearn.model_selection import cross_val_score
from sklearn import svm
clf = svm.SVC(kernel='linear', C=1, gamma='scale')
clf.fit(X,Y)
scores = cross_val_score(clf, X, Y, cv=5)


In [41]:
pre = clf.predict(x_test)
print( confusion_matrix(y_test, pre))


[[ 385    7   29  662   18]
 [  21  286   12  511    4]
 [  30    7  385  631    8]
 [   5   10   10 1101   10]
 [  26   17   10  586  379]]


In [42]:
print(classification_report(y_test, pre))

              precision    recall  f1-score   support

           0       0.82      0.35      0.49      1101
           1       0.87      0.34      0.49       834
           2       0.86      0.36      0.51      1061
           3       0.32      0.97      0.48      1136
           4       0.90      0.37      0.53      1018

    accuracy                           0.49      5150
   macro avg       0.76      0.48      0.50      5150
weighted avg       0.74      0.49      0.50      5150

