In [112]:
#Data preprocessing libraries
import pandas as pd  #Data manipulation
import string  #Remove punctuation & characters
import nltk  #Natural language processing 

from nltk.corpus import stopwords  #Stop word removal
from nltk.tokenize import word_tokenize  #Tokenizition
#from nltk.stem import PorterStemmer  #Stemming
from nltk.stem import WordNetLemmatizer  #Import WordNetLemmatizer
from nltk.corpus import wordnet  #Import WordNet

#Feature extractions libraries
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Models libraries
from sklearn.model_selection import train_test_split #For data splitting
#Model Evaluation Function
from sklearn.metrics import accuracy_score, classification_report  #Import metrics
from sklearn.svm import SVC  #SVM Model
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Model -  MultinomialNB 
from sklearn.ensemble import RandomForestClassifier #Random Forest Model

In [114]:
#Read excel file
file_path = r"C:\Users\HUAWEI\Downloads\bbc-text 2227 - Copy.csv"
data = pd.read_csv(file_path, delimiter=';')
text = data['text']  #get text column
category = data['category']  #get category column

In [120]:
#Normalization
#Remove unwanted characters
text = text.str.replace(f'[{string.punctuation}]', '', regex=True)
#convert text column into lower case 
text = text.str.lower()

In [122]:
#Tokenization
def tokenize_text(text):
    return nltk.word_tokenize(text)

#Apply the tokenization function to the text column
text['tokens'] = text.apply(tokenize_text) #Store the tokens in a new column
tokens = text['tokens']
tokens

0       [tv, future, in, the, hands, of, viewers, with...
1       [worldcom, boss, left, books, alone, former, w...
2       [tigers, wary, of, farrell, gamble, leicester,...
3       [yeading, face, newcastle, in, fa, cup, premie...
4       [ocean, s, twelve, raids, box, office, ocean, ...
                              ...                        
2220    [cars, pull, down, us, retail, figures, us, re...
2221    [kilroy, unveils, immigration, policy, exchats...
2222    [rem, announce, new, glasgow, concert, us, ban...
2223    [how, political, squabbles, snowball, it, s, b...
2224    [souness, delight, at, euro, progress, boss, g...
Name: text, Length: 2225, dtype: object

In [123]:
#Define stop words for English
stop_words = set(stopwords.words('english'))

#Stop word removal
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]

#Apply the function to the tokens column
tokens = tokens.apply(remove_stop_words)
tokens

0       [tv, future, hands, viewers, home, theatre, sy...
1       [worldcom, boss, left, books, alone, former, w...
2       [tigers, wary, farrell, gamble, leicester, say...
3       [yeading, face, newcastle, fa, cup, premiershi...
4       [ocean, twelve, raids, box, office, ocean, twe...
                              ...                        
2220    [cars, pull, us, retail, figures, us, retail, ...
2221    [kilroy, unveils, immigration, policy, exchats...
2222    [rem, announce, new, glasgow, concert, us, ban...
2223    [political, squabbles, snowball, become, commo...
2224    [souness, delight, euro, progress, boss, graem...
Name: text, Length: 2225, dtype: object

In [124]:
#Initialize the WordNet lemmatizer
wn = WordNetLemmatizer()

#Function to map Part-of-speech (POS) tags to WordNet tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ  #Adjective
    elif treebank_tag.startswith('V'):
        return wordnet.VERB  #Verb
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN  #Noun
    elif treebank_tag.startswith('R'):
        return wordnet.ADV  #Adverb
    else:
        return wordnet.NOUN  #Default to noun

#Define a function for lemmatizing words
def lemmatizing(tokens):  #Accept a list of tokens
    pos_tag = nltk.pos_tag(tokens)  #Get POS tags for the tokens
    return [wn.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag]  #Return lemmatized words


data['lemmatized'] = tokens.apply(lemmatizing)  #Store the lemmatized tokens in a new column

print(data['lemmatized'])

0       [tv, future, hand, viewer, home, theatre, syst...
1       [worldcom, bos, leave, book, alone, former, wo...
2       [tiger, wary, farrell, gamble, leicester, say,...
3       [yeading, face, newcastle, fa, cup, premiershi...
4       [ocean, twelve, raid, box, office, ocean, twel...
                              ...                        
2220    [car, pull, u, retail, figure, u, retail, sale...
2221    [kilroy, unveils, immigration, policy, exchats...
2222    [rem, announce, new, glasgow, concert, u, band...
2223    [political, squabble, snowball, become, common...
2224    [souness, delight, euro, progress, bos, graeme...
Name: lemmatized, Length: 2225, dtype: object


In [125]:
data[['text', 'lemmatized']].sample(5)

Unnamed: 0,text,lemmatized
2148,standard life concern at lse bid standard life...,"[standard, life, concern, lse, bid, standard, ..."
1750,connors rallying cry for british tennis do y...,"[connors, rally, cry, british, tennis, heart, ..."
1649,us trade gap ballooned in october the us trade...,"[u, trade, gap, balloon, october, u, trade, de..."
1783,radcliffe will compete in london paula radclif...,"[radcliffe, compete, london, paula, radcliffe,..."
230,tv show unites angolan families angolan famili...,"[tv, show, unite, angolan, family, angolan, fa..."


In [126]:
#End of 1st step (data cleaning)

In [128]:
#Try sparate Extractors - Algorithms
#Convert lemmatized lists to strings for vectorization
data['lemmatized_text'] = data['lemmatized'].apply(lambda x: ' '.join(x))  #Join list into a string

#Bag of Words (BoW)
bow_vectorizer = CountVectorizer(min_df=5)
bow_matrix = bow_vectorizer.fit_transform(data['lemmatized_text'])

#Convert to DataFrame for better visualization
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())
print("Bag of Words (BoW) Matrix:")
print(bow_df)

#TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=5)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['lemmatized_text'])

#Convert to DataFrame for better visualization
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(tfidf_df)

Bag of Words (BoW) Matrix:
      00  000  000m  000strong  01  02  03  04  05  06  ...  zach  zealand  \
0      0    1     0          0   0   0   0   0   0   0  ...     0        0   
1      0    1     0          0   0   0   0   0   0   0  ...     0        0   
2      0    0     0          0   0   0   0   0   0   0  ...     0        0   
3      0    0     0          0   0   0   0   0   0   0  ...     0        0   
4      0    0     0          0   0   0   0   0   0   0  ...     0        0   
...   ..  ...   ...        ...  ..  ..  ..  ..  ..  ..  ...   ...      ...   
2220   0    0     0          0   1   0   4   0   1   2  ...     0        0   
2221   0    2     0          0   0   0   0   0   0   0  ...     0        0   
2222   0    1     0          0   0   0   0   0   0   0  ...     0        0   
2223   0    1     0          0   0   0   0   0   0   0  ...     0        0   
2224   0    0     0          0   0   0   0   0   0   0  ...     0        0   

      zeppelin  zero  zhang  zimbabw

In [130]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, category, test_size=0.2, random_state=42)  #Split data

def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)  #Make predictions
    accuracy = accuracy_score(y_test, predictions)  #Calculate accuracy
    report = classification_report(y_test, predictions)  #Generate classification report
    return accuracy, report



svm_model = SVC(kernel='linear', class_weight='balanced')  #Initialize SVM model with linear kernel and class weights
svm_model.fit(X_train, y_train)  #Train the model
svm_accuracy, svm_report = evaluate_model(svm_model, X_test, y_test)  #Evaluate SVM
print("SVM Accuracy:", svm_accuracy)  #Print accuracy
print("SVM Classification Report:\n", svm_report)  #Print classification report


nb_model = MultinomialNB()  #Initialize Naive Bayes model
nb_model.fit(X_train, y_train)  #Train the model
nb_accuracy, nb_report = evaluate_model(nb_model, X_test, y_test)  #Evaluate Naive Bayes
print("Naive Bayes Accuracy:", nb_accuracy)  #Print accuracy
print("Naive Bayes Classification Report:\n", nb_report)  #Print classification report


rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)  #Initialize Random Forest model with class weights
rf_model.fit(X_train, y_train)  #Train the model
rf_accuracy, rf_report = evaluate_model(rf_model, X_test, y_test)  #Evaluate Random Forest
print("Random Forest Accuracy:", rf_accuracy)  #Print accuracy
print("Random Forest Classification Report:\n", rf_report)  #Print classification report

SVM Accuracy: 0.9730337078651685
SVM Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.92      0.95       101
entertainment       0.98      0.99      0.98        81
     politics       0.94      0.99      0.96        83
        sport       0.98      1.00      0.99        98
         tech       0.99      0.98      0.98        82

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

Naive Bayes Accuracy: 0.9662921348314607
Naive Bayes Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.96      0.96       101
entertainment       1.00      0.90      0.95        81
     politics       0.93      0.98      0.95        83
        sport       0.99      1.00      0.99        98
         tech       0.95      0.99      0.97        82

     accuracy                   

In [138]:
print(X_test)

  (0, 384)	0.04036462024073686
  (0, 394)	0.0460197312091258
  (0, 455)	0.04383451775992278
  (0, 508)	0.04046503264223666
  (0, 531)	0.20438654638565862
  (0, 546)	0.11285119313176119
  (0, 566)	0.04735248198400006
  (0, 567)	0.04453603901686298
  (0, 591)	0.06103113050462963
  (0, 611)	0.045167879097887866
  (0, 654)	0.08328681587016164
  (0, 693)	0.033235075513500745
  (0, 737)	0.05941532057558684
  (0, 747)	0.051632859067424194
  (0, 759)	0.030552163886798992
  (0, 799)	0.035626820036627004
  (0, 839)	0.0662013169543016
  (0, 850)	0.03100236381646142
  (0, 857)	0.043743657983674834
  (0, 904)	0.05941532057558684
  (0, 919)	0.06374091770856664
  (0, 964)	0.027148671204709436
  (0, 970)	0.05335485038347907
  (0, 985)	0.02759917584260959
  (0, 1005)	0.07124355326599958
  :	:
  (444, 5805)	0.05397950988658272
  (444, 5863)	0.0484986753706583
  (444, 5873)	0.05167779110528446
  (444, 5953)	0.04876553384010303
  (444, 6015)	0.04043633033444995
  (444, 6235)	0.0571837599471461
  (444, 635

In [140]:
print(y_test)

414          politics
420          business
1644    entertainment
416              tech
1232            sport
            ...      
741          business
205          business
1102         business
668          business
479          business
Name: category, Length: 445, dtype: object
