In [116]:
#Data preprocessing libraries
import pandas as pd  #Data manipulation
import string  #Remove punctuation & characters
import nltk  #Natural language processing 
import pickle  #For loading saved models and vectorizers
import re


from nltk.corpus import stopwords  #Stop word removal
from nltk.tokenize import word_tokenize  #Tokenizition
#from nltk.stem import PorterStemmer  #Stemming
from nltk.stem import WordNetLemmatizer  #Import WordNetLemmatizer
from nltk.corpus import wordnet  #Import WordNet

#Feature extractions libraries
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

#Models libraries
from sklearn.model_selection import train_test_split #For data splitting
#Model Evaluation Function
from sklearn.metrics import accuracy_score, classification_report  #Import metrics
from sklearn.svm import SVC  #SVM Model
from sklearn.naive_bayes import MultinomialNB #Naive Bayes Model -  MultinomialNB 
from sklearn.ensemble import RandomForestClassifier #Random Forest Model

In [117]:
#Read excel file
file_path = r"C:\Users\HUAWEI\Downloads\bbc-text 2227 - Copy.csv"
data = pd.read_csv(file_path, delimiter=';')
text = data['text']  #get text column
category = data['category']  #get category column

In [118]:
#Data pre-processing
#Normalization
#Remove unwanted characters
text = text.str.replace(f'[{string.punctuation}]', '', regex=True)
#convert text column into lower case 
text = text.str.lower()

#Tokenization
def tokenize_text(text):
    return nltk.word_tokenize(text)

#Apply the tokenization function to the text column
text['tokens'] = text.apply(tokenize_text) #Store the tokens in a new column
tokens = text['tokens']

#Define stop words for English
stop_words = set(stopwords.words('english'))

#Stop word removal
def remove_stop_words(tokens):
    return [word for word in tokens if word not in stop_words]

#Apply the function to the tokens column
tokens = tokens.apply(remove_stop_words)
tokens

#Initialize the WordNet lemmatizer
wn = WordNetLemmatizer()

#Function to map Part-of-speech (POS) tags to WordNet tags
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ  #Adjective
    elif treebank_tag.startswith('V'):
        return wordnet.VERB  #Verb
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN  #Noun
    elif treebank_tag.startswith('R'):
        return wordnet.ADV  #Adverb
    else:
        return wordnet.NOUN  #Default to noun

#Define a function for lemmatizing words
def lemmatizing(tokens):  #Accept a list of tokens
    pos_tag = nltk.pos_tag(tokens)  #Get POS tags for the tokens
    return [wn.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tag]  #Return lemmatized words


data['lemmatized'] = tokens.apply(lemmatizing)  #Store the lemmatized tokens in a new column

print(data['lemmatized'])

0       [tv, future, hand, viewer, home, theatre, syst...
1       [worldcom, bos, leave, book, alone, former, wo...
2       [tiger, wary, farrell, gamble, leicester, say,...
3       [yeading, face, newcastle, fa, cup, premiershi...
4       [ocean, twelve, raid, box, office, ocean, twel...
                              ...                        
2220    [car, pull, u, retail, figure, u, retail, sale...
2221    [kilroy, unveils, immigration, policy, exchats...
2222    [rem, announce, new, glasgow, concert, u, band...
2223    [political, squabble, snowball, become, common...
2224    [souness, delight, euro, progress, bos, graeme...
Name: lemmatized, Length: 2225, dtype: object


In [119]:
data[['text', 'lemmatized']].sample(5)

Unnamed: 0,text,lemmatized
917,russia gets investment blessing soaring oil sa...,"[russia, get, investment, blessing, soar, oil,..."
1150,intel unveils laser breakthrough intel has sai...,"[intel, unveils, laser, breakthrough, intel, s..."
395,holmes is hit by hamstring injury kelly holmes...,"[holmes, hit, hamstring, injury, kelly, holmes..."
412,blair labour s longest-serving pm tony blair h...,"[blair, labour, longestserving, pm, tony, blai..."
1450,boateng to step down at election paul boateng ...,"[boateng, step, election, paul, boateng, chief..."


In [120]:
#End of 1st step (data cleaning)

In [121]:
#Try sparate Extractors - Algorithms
#Convert lemmatized lists to strings for vectorization
data['lemmatized_text'] = data['lemmatized'].apply(lambda x: ' '.join(x)) #To convert the texts in the “lemmatized” column into single texts by joining the words using spaces.

#Bag of Words (BoW)
bow_vectorizer = CountVectorizer(min_df=5) #to ignore words that appear in less than 5 titles.
bow_matrix = bow_vectorizer.fit_transform(data['lemmatized_text']) #bow_matrix - contains the number of occurrences of each word.
#Fit_transform - will be applied to the transformed texts to form a matrix.

#Convert to DataFrame to display the data more clearly
bow_df = pd.DataFrame(bow_matrix.toarray(), columns=bow_vectorizer.get_feature_names_out())
print("Bag of Words (BoW) Matrix:")
print(bow_df)

#TF-IDF
tfidf_vectorizer = TfidfVectorizer(min_df=5)#Ignore rare words
tfidf_matrix = tfidf_vectorizer.fit_transform(data['lemmatized_text'])#tfidf_matrix-Represents the weight of each word based on its frequency.
#Fit_transform-To convert texts to a matrix.

#Convert to DataFrame to display the data more clearly
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
print("\nTF-IDF Matrix:")
print(tfidf_df)

Bag of Words (BoW) Matrix:
      00  000  000m  000strong  01  02  03  04  05  06  ...  zach  zealand  \
0      0    1     0          0   0   0   0   0   0   0  ...     0        0   
1      0    1     0          0   0   0   0   0   0   0  ...     0        0   
2      0    0     0          0   0   0   0   0   0   0  ...     0        0   
3      0    0     0          0   0   0   0   0   0   0  ...     0        0   
4      0    0     0          0   0   0   0   0   0   0  ...     0        0   
...   ..  ...   ...        ...  ..  ..  ..  ..  ..  ..  ...   ...      ...   
2220   0    0     0          0   1   0   4   0   1   2  ...     0        0   
2221   0    2     0          0   0   0   0   0   0   0  ...     0        0   
2222   0    1     0          0   0   0   0   0   0   0  ...     0        0   
2223   0    1     0          0   0   0   0   0   0   0  ...     0        0   
2224   0    0     0          0   0   0   0   0   0   0  ...     0        0   

      zeppelin  zero  zhang  zimbabw

In [122]:
#Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, category, test_size=0.2, random_state=42)  #Split data

#Function to evaluate model performance
def evaluate_model(model, X_test, y_test):
    predictions = model.predict(X_test)  #Make predictions
    accuracy = accuracy_score(y_test, predictions)  #Calculate accuracy
    report = classification_report(y_test, predictions)  #Generate classification report
    return accuracy, report



svm_model = SVC(kernel='linear', class_weight='balanced')  #Initialize SVM model with linear kernel and class weights
svm_model.fit(X_train, y_train)  #Train the model
svm_accuracy, svm_report = evaluate_model(svm_model, X_test, y_test)  #Evaluate SVM
print("SVM Accuracy:", svm_accuracy)  #Print accuracy
print("SVM Classification Report:\n", svm_report)  #Print classification report


nb_model = MultinomialNB()  #Initialize Naive Bayes model
nb_model.fit(X_train, y_train)  #Train the model
nb_accuracy, nb_report = evaluate_model(nb_model, X_test, y_test)  #Evaluate Naive Bayes
print("Naive Bayes Accuracy:", nb_accuracy)  #Print accuracy
print("Naive Bayes Classification Report:\n", nb_report)  #Print classification report


rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)  #Initialize Random Forest model with class weights
rf_model.fit(X_train, y_train)  #Train the model
rf_accuracy, rf_report = evaluate_model(rf_model, X_test, y_test)  #Evaluate Random Forest
print("Random Forest Accuracy:", rf_accuracy)  #Print accuracy
print("Random Forest Classification Report:\n", rf_report)  #Print classification report

SVM Accuracy: 0.9730337078651685
SVM Classification Report:
                precision    recall  f1-score   support

     business       0.98      0.92      0.95       101
entertainment       0.98      0.99      0.98        81
     politics       0.94      0.99      0.96        83
        sport       0.98      1.00      0.99        98
         tech       0.99      0.98      0.98        82

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

Naive Bayes Accuracy: 0.9662921348314607
Naive Bayes Classification Report:
                precision    recall  f1-score   support

     business       0.96      0.96      0.96       101
entertainment       1.00      0.90      0.95        81
     politics       0.93      0.98      0.95        83
        sport       0.99      1.00      0.99        98
         tech       0.95      0.99      0.97        82

     accuracy                   

In [123]:
import pickle

# Load the trained model and TF-IDF vectorizer
with open("model_bbc.pickle", "rb") as model_file:
    lg_model = pickle.load(model_file)

with open("tfidf_file.pickle", "rb") as tfidf_file:
    tfidf_vectorizer = pickle.load(tfidf_file)


def predict_category(text):
    cleaned_text = preprocess_data(text)
    tfidf_data = tfidf_vectorizer.transform([cleaned_text])
    prediction = lg_model.predict(tfidf_data)
    
    category_map = {0: 'politics', 1: 'sport', 2: 'tech', 3: 'entertainment', 4: 'business'}
    return category_map[prediction[0]]

new_text = "video games "
predicted_category = predict_category(new_text)
print(f"The predicted category is: {predicted_category}")

The predicted category is: tech


In [137]:
#Test the model 
#Load the trained model and TF-IDF vectorizer from pickle files
with open("model_bbc.pickle", "rb") as model_file:
    lg_model = pickle.load(model_file)  #Load the logistic regression model

with open("tfidf_file.pickle", "rb") as tfidf_file:
    tfidf_vectorizer = pickle.load(tfidf_file)  #Load the TF-IDF vectorizer

#Function to predict the category of the input text
def predict_category(text):
    #Preprocess the input text
    cleaned_text = preprocess_data(text)
    #Convert the cleaned text to TF-IDF features
    tfidf_data = tfidf_vectorizer.transform([cleaned_text])
    #Make the prediction using the loaded model
    prediction = lg_model.predict(tfidf_data)
    
    #Map numerical predictions to category names
    category_map = {0: 'politics', 1: 'sport', 2: 'tech', 3: 'entertainment', 4: 'business'}
    #Return the predicted category
    return category_map[prediction[0]]

In [141]:
#Example usage of the prediction function
new_text = "how political squabbles snowball it s become commonplace to argue that blair and brown are like squabbling school kids and that they (and their supporters) need to grow up and stop bickering.  but this analysis in fact gets it wrong. it s not just children who fight - adults do too. and there are solid reasons why even a trivial argument between mature protagonists can be hard to stop once its got going. the key feature of an endless feud is that everyone can agree they d be better off if it ended - but everyone wants to have the last word.  each participant genuinely wants the row to stop  but thinks it worth prolonging the argument just a tiny bit to ensure their view is heard. their successive attempts to end the argument with their last word ensure the argument goes on and on and on. (in the case of mr blair and mr brown  successive books are published  ensuring the issues never die.) now this isn t because the participants are stupid - it s actually each individual behaving entirely rationally  given the incentives facing them. indeed  there s even a piece of economic theory that explains all this. nothing as obscure as  post-neo-classical endogenous growth theory  which the chancellor himself once quoted - but a ubiquitous piece of game theory which all respectable policy wonks are familiar with.  it s often referred to as the  prisoner s dilemma   based on a parable much told in economics degree courses... about a sheriff and two prisoners. the story goes that two prisoners are jointly charged with a heinous crime  and are locked up in separate cells. but the sheriff desperately needs a confession from at least one of them  to provide enough evidence to convict them of the crime. without a confession  the prisoners will get a minimal sentence on some trumped up charge.  clearly the prisoners  best strategy is to keep their mouths shut  and take the short sentence  but the clever sheriff has an idea to induce them to talk. he tells each prisoner separately  that if they confess - and they are the only one to confess - they ll be let off their crime. and he tells them that if they don t confess - and they are the only one not to confess - they ll get life. now  if you are prisoner confronted with this choice  your best bet is to confess. if your partner doesn t confess  you ll get off completely. and if your partner does confess  you d better confess to ensure you don t get life. the result is of course  both prisoners confess  so the sheriff does not have to let either one off. both prisoners  individual logic was to behave that way  even though both would have been better if they had somehow agreed to shut up. don t worry if you don t entirely follow it - you can to look it up on google  where there are 283 000 entries on it.  the prisoners  dilemma and all its ramifications have truly captured economists in the last couple of decades. it is a parable used to describe any situation where there is an obvious sensible choice to be taken collectively  but where the only rational choice individually is to behave selfishly.  a cold war arms race for example - a classic case where both russia and america would be better off with just a few arms  rather than a lot of arms. but as long as each wants just a few more arms than the other  an arms race ensues with the results that the individually logical decision to buy more arms  results in arms levels that are too high. what economics tells us is that once you re in a prisoners  dilemma - unless you are repeating the experience many times over - it s hard to escape the perverse logic of it. it s no good just exhorting people to stop buying arms  or to stop arguing when all their incentives encourage them to carry on. somehow  the incentives have to change.  in the case of the labour party  if you believe the rift between blair and brown camps is as bad as the reports suggest  solomon s wisdom needs to be deployed to solve the problem. every parent knows there are ingenious solutions to arguments  solutions which affect the incentives of the participants. an example  is the famous rule that  one divides  the other chooses  as a way of allocating a piece of cake to be sliced up between greedy children. in the case of an apparently endless argument  if you want it to come to an end  you have to ensure the person who has the last word is one who loses rather than the one who wins the row. the cost of prolonging the row by even one more briefing  or one more book for that matter  has to exceed the benefit of having the last word  and getting your point in. if the rest of the party can enforce that  they ll have the protagonists retreating pretty quickly."
 
#Input text for prediction
predicted_category = predict_category(new_text)  #Get the predicted category
#Output the predicted category
print(f"The predicted category is: {predicted_category}")

The predicted category is: politics
