In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
df_train= pd.read_csv("/content/drive/MyDrive/learn-ai-bbc/BBC News Train.csv")
df_train.head()

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [None]:
train_x= df_train.drop("Category",axis=1)
# df= pd.concat([train_x, df_test],axis=0)
# df.shape

In [None]:
train_x.head()

Unnamed: 0,ArticleId,Text
0,1833,worldcom ex-boss launches defence lawyers defe...
1,154,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...
3,1976,lifestyle governs mobile choice faster bett...
4,917,enron bosses in $168m payout eighteen former e...


#Preprocessing the data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Preprocessing the text data by removing stop words, punctuation and stemming
stop_words = set(stopwords.words('english'))
# Initialize stemmer
stemmer = PorterStemmer()
#lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Remove URLs and email addresses
    #text = re.sub(r'\S+@\S+', '', text)
    #text = re.sub(r'http\S+', '', text)

    # Remove numbers
    #text = re.sub(r'\d+', '', text)

    # Remove punctuation
    #text =  re.sub(r'[^\w\s]', '', text)
    
    # Convert to lowercase
    text = text.lower()

    # Remove whitespace
    text=  re.sub(' +', ' ', text)

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Perform stemming
    #stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    #lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    return ' '.join(stemmed_tokens)


#TC-ICF Calculation

In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import csr_matrix


In [None]:
# Preprocess the text column
preprocessed_text = [preprocess_text(text) for text in train_x['Text']]

# Define the count vectorizer
count_vectorizer = CountVectorizer()

# Generate the term-document matrix
term_doc_matrix = count_vectorizer.fit_transform(preprocessed_text)

# Define the TF-IDF transformer
tfidf_transformer = TfidfTransformer()

# Generate the TF-IDF matrix
tfidf_matrix = tfidf_transformer.fit_transform(term_doc_matrix).toarray()

# Compute the ICF values
num_docs = len(train_x)
icf_values = np.log(num_docs / np.count_nonzero(tfidf_matrix, axis=0))

# Convert tfidf_matrix and icf_values to sparse matrices
tfidf_matrix = csr_matrix(tfidf_matrix)
icf_values = csr_matrix(icf_values)

# Compute the TF-ICF matrix
tf_icf_matrix = tfidf_matrix.multiply(icf_values)


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tf_icf_matrix,df_train['Category'],test_size=0.3, random_state=42)

# Print the shapes of the training and testing sets
print('Training set shape:', X_train.shape)
print('Testing set shape:', X_test.shape)

Training set shape: (1043, 17570)
Testing set shape: (447, 17570)


In [None]:

# y_train= df_train['Category']
# y_test = label_test['Category']

print(y_train.shape, y_test.shape)

(1043,) (447,)


#Training naive bayes with tc-icf

In [None]:
# Train the Naive Bayes classifier
clf = MultinomialNB().fit(X_train, y_train)

#Testing and getting predictions

In [None]:
# Test the classifier
accuracy = clf.score(X_test, y_test)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


Accuracy: 97.76%


In [None]:
# Get predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy and precision
from sklearn.metrics import accuracy_score, precision_score,recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='macro')
# Print results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 97.76%
Precision: 97.78%
Recall: 97.80%


In [None]:
# Calculate the frequency of each category in the training set
category_freq = y_train.value_counts(normalize=True)

# Print the frequency of each category
print('Category frequency:\n', category_freq)

Category frequency:
 sport            0.234899
business         0.218600
entertainment    0.186002
tech             0.180249
politics         0.180249
Name: Category, dtype: float64


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Loop over the unique categories in the training set
for category in y_train.unique():
    # Get the indices of the training samples that belong to this category
    category_indices = y_train[y_train == category].index
    # Calculate the average TF-IDF value for each feature in documents belonging to this category
    avg_tficf = tf_icf_matrix[category_indices].mean(axis=0)
    # Convert the average TF-IDF value to a 1D array
    avg_tficf = np.squeeze(np.asarray(avg_tficf))
    # Print the feature names and their corresponding average TF-IDF values
    feature_names = tfidf_transformer.get_feature_names_out()
    for feature_idx in np.argsort(avg_tficf)[-10:]:
        feature_name = feature_names[feature_idx]
        feature_tficf = avg_tficf[feature_idx]
        print(f'{category}: {feature_name} - {feature_tficf:.4f}')

entertainment: x2577 - 0.0813
entertainment: x11201 - 0.0825
entertainment: x14923 - 0.0852
entertainment: x1493 - 0.0859
entertainment: x10875 - 0.0868
entertainment: x11521 - 0.0907
entertainment: x1288 - 0.0927
entertainment: x2269 - 0.0967
entertainment: x2143 - 0.1140
entertainment: x6387 - 0.1577
tech: x15578 - 0.0818
tech: x9916 - 0.0826
tech: x4119 - 0.0850
tech: x14604 - 0.0881
tech: x10450 - 0.0907
tech: x6837 - 0.0939
tech: x2746 - 0.0942
tech: x3055 - 0.0978
tech: x12082 - 0.1271
tech: x10629 - 0.1637
politics: x5971 - 0.0813
politics: x8068 - 0.0826
politics: x10806 - 0.0833
politics: x3092 - 0.0912
politics: x11802 - 0.0979
politics: x15925 - 0.1045
politics: x9730 - 0.1052
politics: x9243 - 0.1192
politics: x2699 - 0.1322
politics: x5674 - 0.1405
business: x7108 - 0.0603
business: x7145 - 0.0612
business: x8331 - 0.0616
business: x5313 - 0.0620
business: x4847 - 0.0629
business: x7375 - 0.0640
business: x5608 - 0.0667
business: x11387 - 0.0740
business: x2286 - 0.0770
bu

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tf_icf_matrix,df_train['Category'],test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
clf = MultinomialNB().fit(X_train, y_train)
# Get predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy and precision
from sklearn.metrics import accuracy_score, precision_score,recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='macro')
# Print results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 96.98%
Precision: 97.02%
Recall: 97.09%


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tf_icf_matrix,df_train['Category'],test_size=0.4, random_state=42)

# Train the Naive Bayes classifier
clf = MultinomialNB().fit(X_train, y_train)
# Get predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy and precision
from sklearn.metrics import accuracy_score, precision_score,recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='macro')
# Print results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 97.32%
Precision: 97.38%
Recall: 97.30%


In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tf_icf_matrix,df_train['Category'],test_size=0.5, random_state=42)

# Train the Naive Bayes classifier
clf = MultinomialNB().fit(X_train, y_train)
# Get predictions on test set
y_pred = clf.predict(X_test)

# Calculate accuracy and precision
from sklearn.metrics import accuracy_score, precision_score,recall_score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='macro')
# Print results
print("Accuracy: %.2f%%" % (accuracy * 100.0))
print("Precision: %.2f%%" % (precision * 100.0))
print("Recall: %.2f%%" % (recall * 100.0))

Accuracy: 98.26%
Precision: 98.28%
Recall: 98.23%


#Trying with n-gram tfidf vectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Define the number of n-grams to use
ngram_range = (1, 2)

# Create a TfidfVectorizer with the TF-IDF weighting scheme and n-gram range
vectorizer = TfidfVectorizer(ngram_range=ngram_range)

In [None]:
# Preprocess the text column
train_x['preprocessed_text'] = [preprocess_text(text) for text in train_x['Text']]

from sklearn.model_selection import train_test_split
# Split the dataset into training and test sets
train_data, test_data, train_labels, test_labels = train_test_split(train_x['preprocessed_text'], df_train['Category'], test_size=0.3, random_state=42)
# train_labels= df_train['Category']
# test_labels= label_test['Category']
# Print the shapes of the resulting data subsets
print('Training data shape: {}'.format(train_data.shape))
print('Training labels shape: {}'.format(train_labels.shape))
print('Test data shape: {}'.format(test_data.shape))
print('Test labels shape: {}'.format(test_labels.shape))

Training data shape: (1043,)
Training labels shape: (1043,)
Test data shape: (447,)
Test labels shape: (447,)


In [None]:
# Create a Multinomial Naive Bayes classifier
clf = MultinomialNB()
# Create a pipeline that combines the vectorizer and classifier
pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('classifier', clf)
])

# Train the classifier on the training data
pipeline.fit(train_data, train_labels)

In [None]:
# Evaluate the classifier on the test data
accuracy = pipeline.score(test_data, test_labels)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

Accuracy: 96.64%
