In [38]:
import pandas as pd

In [39]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Please download the dataset from the zip folder and upload it to your drive and change the path location below.


In [40]:
df = pd.read_csv("/content/drive/MyDrive/Project Files/bbc-text.csv")  # Put your path location of the dataset here
df

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...
...,...,...
2220,business,cars pull down us retail figures us retail sal...
2221,politics,kilroy unveils immigration policy ex-chatshow ...
2222,entertainment,rem announce new glasgow concert us band rem h...
2223,politics,how political squabbles snowball it s become c...


In [41]:
# Checking the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   category  2225 non-null   object
 1   text      2225 non-null   object
dtypes: object(2)
memory usage: 34.9+ KB


## Preprocessing:

In [42]:
# Separating the target variable:
articles = df['text']
y = df['category']

In [43]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
nltk.download('punkt_tab')
def preprocess_text(text):

  text = text.lower() #Lowercased
  tokens = word_tokenize(text) #Tokenized
  tokens = [word for word in tokens if word.isalpha() and word not in stop_words] # Removing Stopwords and punctuations
  tokens = [lemmatizer.lemmatize(word) for word in tokens]

  return (tokens)

df['processed_text'] = df['text'].apply(preprocess_text)
df['processed_text']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unnamed: 0,processed_text
0,"[tv, future, hand, viewer, home, theatre, syst..."
1,"[worldcom, bos, left, book, alone, former, wor..."
2,"[tiger, wary, farrell, gamble, leicester, say,..."
3,"[yeading, face, newcastle, fa, cup, premiershi..."
4,"[ocean, twelve, raid, box, office, ocean, twel..."
...,...
2220,"[car, pull, u, retail, figure, u, retail, sale..."
2221,"[kilroy, unveils, immigration, policy, host, r..."
2222,"[rem, announce, new, glasgow, concert, u, band..."
2223,"[political, squabble, snowball, become, common..."


# Vectorizing the text:

We will first build a model using the bag of words vectorizer and set and check it's accuracy on the dev set. We will then use another vectorizer method to see if the accuracy could be further improved by changing the vectorizers.

### Bag of Words Vectorizer:

In [45]:
from gensim.models import Word2Vec

# Define parameters and train the model in one step
w2v_model = Word2Vec(sentences = df['processed_text'],  # Tokenized sentences
                    vector_size=150,           # Sets the dimensionality of each vectors. Higher dimensionality stores more information.
                    window=1,                  # Context window size (The model will consider 5 words on either side of the target word)
                    min_count=2,               # Minimum frequency for a word to be included
                    workers=4,                 # Parallel processing with 4 cores
                    epochs=20)                  # Number of Iterations



In [46]:
# Bag of Words Vectorization:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer with a maximum of 1000 features
vectorizer = CountVectorizer(max_features=1000)

# Fit and transform the articles
X_bow = vectorizer.fit_transform(articles)

In [47]:
# Bag of Words for bigrams
from sklearn.feature_extraction.text import CountVectorizer

ngram_vectorizer = CountVectorizer(ngram_range=(2, 2), max_features=1000)
ngram_bow_vectors = ngram_vectorizer.fit_transform(articles)

In [48]:
# Combine BoW, N-grams, and Word2Vec features into a single feature matrix

from scipy.sparse import hstack
word2vec_features = np.array([np.mean([w2v_model.wv[word] for word in words if word in w2v_model.wv]
                                      or [np.zeros(w2v_model.vector_size)], axis=0)
                              for words in df['processed_text']])

# Combine features using hstack for sparse matrices
X_combined = hstack([X_bow, ngram_bow_vectors , word2vec_features])

In [49]:
from sklearn.model_selection import train_test_split

# First split: Train + Dev vs Test
X_temp, X_test, y_temp, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

# Second split: Train vs Dev
X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

In [50]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the logistic regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# Training the model on the training data
logreg_model.fit(X_train, y_train)


# Make predictions on the development set
y_dev_pred = logreg_model.predict(X_dev)

# Evaluate the model on the development set
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Development Set Accuracy: {dev_accuracy}")

# Get a classification report for Dev set:
print("Classification Report (Development Set):")
print(classification_report(y_dev, y_dev_pred))

Development Set Accuracy: 0.950561797752809
Classification Report (Development Set):
               precision    recall  f1-score   support

     business       0.92      0.97      0.94       102
entertainment       0.95      0.94      0.94        77
     politics       0.97      0.89      0.93        83
        sport       0.98      1.00      0.99       102
         tech       0.94      0.94      0.94        81

     accuracy                           0.95       445
    macro avg       0.95      0.95      0.95       445
 weighted avg       0.95      0.95      0.95       445



We will now try building a model with TF-IDF vectorizer in place of Bag of Words and see if the accuracy improves.

## TFIDF Vectorization: (Using TF-IDF to try to improve accuracy)

In [None]:
# TFIDF Vectorization:

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(articles)

In [None]:
# Ngrams Vectorization:
ngram_vectorizer = TfidfVectorizer(ngram_range=(2, 2), max_features=1000) # bigrams
ngram_vectors  = vectorizer.fit_transform(articles)


In [None]:
from scipy.sparse import hstack
word2vec_features = np.array([np.mean([w2v_model.wv[word] for word in words if word in w2v_model.wv]
                                      or [np.zeros(w2v_model.vector_size)], axis=0)
                              for words in df['processed_text']])

# Combine features using hstack for sparse matrices (TF-IDF and N-grams are sparse)
X_combined = hstack([X_tfidf, ngram_vectors, word2vec_features])

In [None]:
from sklearn.model_selection import train_test_split

# First split: Train + Dev vs Test
X_temp, X_test, y_temp, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42, stratify=y)

# Second split: Train vs Dev
X_train, X_dev, y_train, y_dev = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp)

print("Training set size:", X_train.shape)
print("Development set size:", X_dev.shape)
print("Test set size:", X_test.shape)


Training set size: (1335, 2150)
Development set size: (445, 2150)
Test set size: (445, 2150)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the logistic regression model
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# Training the model
logreg_model.fit(X_train, y_train)


In [None]:
# Make predictions on the development set
y_dev_pred = logreg_model.predict(X_dev)

# Evaluate the model on the development set
dev_accuracy = accuracy_score(y_dev, y_dev_pred)
print(f"Development Set Accuracy: {dev_accuracy}")

# Get a classification report for Dev set:
print("Classification Report (Development Set):")
print(classification_report(y_dev, y_dev_pred))


Development Set Accuracy: 0.9685393258426966
Classification Report (Development Set):
               precision    recall  f1-score   support

     business       0.93      0.98      0.96       102
entertainment       0.96      0.96      0.96        77
     politics       0.99      0.94      0.96        83
        sport       1.00      0.99      1.00       102
         tech       0.96      0.96      0.96        81

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



The accuracy Improved from 0.95 to 0.965 while Using TF-IDF as a vectorizer in place of Bag of Words.

Now that we have finalized the better vectorizer on the dev set, we continue with TF-IDF for the test set prediction.

In [None]:
# Make predictions on the test set
y_test_pred = logreg_model.predict(X_test)# Evaluate the model on the test set

test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test Set Accuracy: {test_accuracy}")


# Get a classification report for Test set:
print("Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred))


Test Set Accuracy: 0.9820224719101124
Classification Report (Test Set):
               precision    recall  f1-score   support

     business       0.98      0.96      0.97       102
entertainment       0.96      1.00      0.98        77
     politics       0.98      0.98      0.98        84
        sport       1.00      1.00      1.00       102
         tech       0.99      0.97      0.98        80

     accuracy                           0.98       445
    macro avg       0.98      0.98      0.98       445
 weighted avg       0.98      0.98      0.98       445



## Trying SVM Classifier to further improve accuracy:

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt

# Initialize the SVM classifier
svm_model = SVC(kernel='linear', random_state=42)

# Train the model on the training data
svm_model.fit(X_train, y_train)


In [None]:
# Make predictions on the development set
y_dev_pred_svm = svm_model.predict(X_dev)

# Make predictions on the test set
y_test_pred_svm = svm_model.predict(X_test)


In [None]:
# Evaluate the model on the development set
dev_accuracy_svm = accuracy_score(y_dev, y_dev_pred_svm)
print(f"SVM Development Set Accuracy: {dev_accuracy_svm}")

# Evaluate the model on the test set
test_accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
print(f"SVM Test Set Accuracy: {test_accuracy_svm}")

# Get a classification report
print("SVM Classification Report (Development Set):")
print(classification_report(y_dev, y_dev_pred_svm))


SVM Development Set Accuracy: 0.9685393258426966
SVM Test Set Accuracy: 0.9685393258426966
SVM Classification Report (Development Set):
               precision    recall  f1-score   support

     business       0.94      0.96      0.95       102
entertainment       0.96      0.96      0.96        77
     politics       0.96      0.94      0.95        83
        sport       1.00      1.00      1.00       102
         tech       0.98      0.98      0.98        81

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445



## Using Random Forest Classifier : (For Model Comparison)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


# Initialize the Random Forest classifier
rf_model = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_model.fit(X_train, y_train)

# Make predictions on the development set
y_dev_pred_rf = rf_model.predict(X_dev)

# Make predictions on the test set
y_test_pred_rf = rf_model.predict(X_test)

# Evaluate the model on the development set
dev_accuracy_rf = accuracy_score(y_dev, y_dev_pred_rf)
print(f"Random Forest Development Set Accuracy: {dev_accuracy_rf}")

# Evaluate the model on the test set
test_accuracy_rf = accuracy_score(y_test, y_test_pred_rf)
print(f"Random Forest Test Set Accuracy: {test_accuracy_rf}")

# Get a classification report for the test set
print("Random Forest Classification Report (Test Set):")
print(classification_report(y_test, y_test_pred_rf))


Random Forest Development Set Accuracy: 0.9640449438202248
Random Forest Test Set Accuracy: 0.9685393258426966
Random Forest Classification Report (Test Set):
               precision    recall  f1-score   support

     business       0.93      0.96      0.95       102
entertainment       0.97      0.97      0.97        77
     politics       0.96      0.93      0.95        84
        sport       1.00      1.00      1.00       102
         tech       0.97      0.97      0.97        80

     accuracy                           0.97       445
    macro avg       0.97      0.97      0.97       445
 weighted avg       0.97      0.97      0.97       445

