# Preprocessing Phase

In [None]:
import nltk

# Fetch NLTK resources. The code visible in this notebook only reads the English
# stopword corpus (remove_stopwords) and WordNet (lemmatize).
# NOTE(review): 'all' presumably pulls every NLTK package; confirm whether a
# narrower download would be enough for the rest of the notebook.
nltk.download('all')

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st
import re
from nltk import PorterStemmer, WordNetLemmatizer

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ./input recursively and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('./input'):
    for file_name in file_names:
        file_path = os.path.join(root_dir, file_name)
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the data. Here it is already in .csv format.
train_data = pd.read_csv('dataset/BBC News Train.csv')
test_data = pd.read_csv('dataset/BBC News Test.csv')

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# both files contribute their own row labels, so labels repeat and label-based
# row selection on `data` becomes ambiguous.
# NOTE(review): if the test CSV has no 'Category' column, its rows end up with
# NaN labels after the concat; confirm before using `data` for supervised steps.
data = pd.concat([train_data, test_data], ignore_index=True)

# Persist the combined raw data, then preview the first ten rows.
data.to_csv('dataset/data.csv', index=False)
data.head(10)

In [None]:
# (rows, columns) of the combined train + test frame built above.
data.shape

# Text cleaning and processing steps
* Remove punctuation
* Convert text to lower-case tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords by matching tokens against the NLTK English stopword list
* Apply lemmatization
* Convert words to feature vectors

In [None]:
# Translation table that deletes every character listed in string.punctuation.
_PUNCT_TABLE = str.maketrans('', '', st.punctuation)


def remove_punct(text):
    """Return ``text`` with all ``string.punctuation`` characters removed.

    Characters are deleted, not replaced by a space, so ``"prime-minister"``
    becomes ``"primeminister"``. Only the ASCII punctuation in
    ``string.punctuation`` is affected.

    Args:
        text: The string to clean.

    Returns:
        A new string without punctuation characters.
    """
    # One C-level pass instead of a Python-level membership test per character.
    return text.translate(_PUNCT_TABLE)

In [None]:
# Strip punctuation from every article body, then preview the new column.
data['removed_punc'] = data['Text'].apply(remove_punct)
data.head()

In [None]:
def tokenize(text):
    """Split ``text`` on whitespace and return the lower-cased tokens.

    Leading, trailing and repeated whitespace never produce empty tokens.
    Splitting could instead be done on special characters, tabs or any other
    delimiter by which the text should be separated into tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-case tokens; empty when ``text`` is empty or contains
        only whitespace.
    """
    # str.split() with no argument splits on runs of whitespace and drops the
    # empty strings that a regex split leaves at the start and end of the text.
    return text.lower().split()

In [None]:
# Tokenize the punctuation-free text of every article, then preview the result.
data['tokens'] = data['removed_punc'].apply(tokenize)
data.head()

In [None]:
def remove_small_words(text, min_length=4):
    """Drop tokens shorter than ``min_length`` characters.

    With the default of 4, tokens of length 3 or less are removed, matching
    the "length less than or equal to 3" rule listed in the processing steps.

    Args:
        text: Iterable of string tokens.
        min_length: Smallest token length that is kept.

    Returns:
        A list of the tokens whose length is at least ``min_length``, in their
        original order.
    """
    return [token for token in text if len(token) >= min_length]

In [None]:
# Keep only the longer tokens of every article, then preview the new column.
data['larger_tokens'] = data['tokens'].apply(remove_small_words)
data.head()

In [None]:
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Load the NLTK English stopword list once and return it as a frozenset."""
    return frozenset(nltk.corpus.stopwords.words('english'))


def remove_stopwords(text):
    """Remove English stopwords from a list of tokens.

    The NLTK corpus list is used for the match. A customized, user-defined
    list could be used instead to limit the matches in the input text.

    Args:
        text: Iterable of string tokens; matching is exact, so tokens are
            expected to be lower-cased already.

    Returns:
        A list of the tokens that are not in the stopword list, in order.
    """
    # The corpus is read once and cached as a set. Calling
    # stopwords.words('english') inside the comprehension would rebuild the
    # list and scan it linearly for every single token.
    stop_words = _english_stopwords()
    return [word for word in text if word not in stop_words]

In [None]:
# Filter stopwords out of every article's token list, then preview the result.
data['clean_tokens'] = data['larger_tokens'].apply(remove_stopwords)
data.head()

In [None]:
def lemmatize(text):
    """Lemmatize every token in ``text`` with NLTK's WordNet lemmatizer.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with the
    lemmatizer's default part-of-speech setting.

    Args:
        text: Iterable of string tokens.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, text))

In [None]:
# Lemmatize the cleaned tokens of every article, then preview the new column.
data['lemma_words'] = data['clean_tokens'].apply(lemmatize)
data.head()

In [None]:
def return_sentences(tokens):
    """Join tokens into one space-separated string.

    The result is the clean text that is later fed to the vectorizer.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        The tokens joined by single spaces; ``""`` for an empty iterable.
    """
    # join() consumes any iterable directly, so no intermediate list copy is needed.
    return " ".join(tokens)

In [None]:
# Rebuild one clean text string per article from its lemmas, then preview it.
data['clean_text'] = data['lemma_words'].apply(return_sentences)
data.head()


# Model and Evaluation Phase

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
from sklearn.utils import resample

def balance_data(data, category_col):
    """Undersample every category down to the size of the smallest one.

    Rows whose category is missing (NaN) are ignored. They are not counted by
    ``value_counts`` and would otherwise form an empty group that cannot be
    sampled without replacement.

    Args:
        data: DataFrame holding the samples.
        category_col: Name of the column with the class label.

    Returns:
        A DataFrame with ``min_category_count`` rows per category, sampled
        without replacement using ``random_state=42``. Categories appear in
        order of first occurrence and rows keep their original index labels.
    """
    labels = data[category_col]
    min_category_count = labels.value_counts().min()

    balanced_parts = [
        resample(
            data[labels == category],
            replace=False,
            n_samples=min_category_count,
            random_state=42,
        )
        # dropna(): unique() would otherwise yield NaN as a "category", and
        # `labels == NaN` selects no rows at all.
        for category in labels.dropna().unique()
    ]

    return pd.concat(balanced_parts)


In [None]:
# Balance the classes, then hold out 20% of the balanced rows for testing.
balanced_data = balance_data(data, 'Category')
X_train, X_test, y_train, y_test = train_test_split(
    balanced_data['clean_text'],
    balanced_data['Category'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Per-category sample counts of both splits, printed one block per split.
y_train_counts = y_train.value_counts()
y_test_counts = y_test.value_counts()

print("Category counts in the training set:", y_train_counts, sep="\n")
print("\nCategory counts in the testing set:", y_test_counts, sep="\n")



In [None]:
# Alternative (disabled): plain term-count features.
#vectorizer = CountVectorizer()
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

# OR

# Active choice: TF-IDF over unigrams and bigrams (ngram_range=(1, 2)), with
# document-frequency cut-offs max_df=0.9 and min_df=5.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5)
# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Build one mean TF-IDF vector (centroid) per category from the training rows.
categories = y_train.unique()
category_avg_vecs = {}

# Rows of X_train_vec are in the same order as y_train, so rows must be picked
# by position. y_train.index holds the original DataFrame labels, which do not
# correspond to positions in the matrix after balancing and shuffling.
y_train_labels = y_train.to_numpy()

for category in categories:
    row_positions = np.flatnonzero(y_train_labels == category)
    category_vectors = X_train_vec[row_positions, :]
    # Store each centroid as a plain 2-D ndarray of shape (1, n_features).
    category_avg_vec = np.asarray(category_vectors.mean(axis=0)).reshape(1, -1)
    category_avg_vecs[category] = category_avg_vec


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def predict_category(text, vectorizer, category_avg_vecs):
    """Predict the category whose average vector is most similar to ``text``.

    Args:
        text: A single cleaned document string.
        vectorizer: Fitted vectorizer used to transform ``text``.
        category_avg_vecs: Mapping of category -> average feature vector.

    Returns:
        The category with the highest cosine similarity to ``text``. On ties
        the first category in the mapping's iteration order wins. ``None`` is
        returned when the mapping is empty.
    """
    text_vec = vectorizer.transform([text])
    max_similarity = -1.0
    predicted_category = None

    for category, avg_vec in category_avg_vecs.items():
        # The stored average may be an np.matrix or a 1-D array; coerce it to
        # a (1, n_features) ndarray before comparing.
        centroid = np.asarray(avg_vec).reshape(1, -1)
        # cosine_similarity returns a 1x1 array; take the scalar so the
        # comparison below is between plain floats.
        similarity = float(cosine_similarity(text_vec, centroid)[0, 0])
        if similarity > max_similarity:
            max_similarity = similarity
            predicted_category = category

    return predicted_category



In [None]:
# Classify every held-out document by its most similar category average.
y_pred = [predict_category(text, vectorizer, category_avg_vecs) for text in X_test]

# Compute the summary numbers first, then print the three reports.
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", cm)


In [None]:
def train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test):
    """Fit ``classifier``, print its test-set diagnostics and return a summary.

    Prints the classifier's class name, accuracy, classification report and
    confusion matrix for the test split.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        X_train_vec: Training feature matrix.
        y_train: Training labels.
        X_test_vec: Test feature matrix.
        y_test: Test labels.

    Returns:
        A dict with ``accuracy`` plus the macro-averaged ``precision``,
        ``recall`` and ``f1_score`` taken from the classification report.
    """
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    accuracy = accuracy_score(y_test, y_pred)
    macro_avg = classification_report(y_test, y_pred, output_dict=True)["macro avg"]

    print(classifier.__class__.__name__)
    print("Accuracy:", accuracy)
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("\n")

    return {
        "accuracy": accuracy,
        "precision": macro_avg["precision"],
        "recall": macro_avg["recall"],
        "f1_score": macro_avg["f1-score"],
    }

In [None]:
# Candidate models, all with their default hyper-parameters.
classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    LinearSVC(),
    RandomForestClassifier(),
    KNeighborsClassifier(),
]

# Train and evaluate each model, keyed by its class name.
performance_metrics = {}
for classifier in classifiers:
    model_name = type(classifier).__name__
    performance_metrics[model_name] = train_and_evaluate(
        classifier, X_train_vec, y_train, X_test_vec, y_test
    )


In [None]:
# One block per model: its name, then each metric rounded to four decimals.
for classifier, metrics in performance_metrics.items():
    report_lines = [f"{classifier}:"]
    report_lines.extend(f"  {metric}: {value:.4f}" for metric, value in metrics.items())
    report_lines.append("\n")
    print("\n".join(report_lines))


In [None]:
from sklearn.model_selection import cross_val_score

def evaluate_with_cross_val(classifier, X, y, n_splits=5):
    """Return the mean cross-validation score of ``classifier`` on ``X``, ``y``.

    Args:
        classifier: Estimator passed to ``cross_val_score``.
        X: Feature data.
        y: Target labels.
        n_splits: Value forwarded as ``cv`` to ``cross_val_score``.

    Returns:
        The average of the per-fold scores.
    """
    fold_scores = cross_val_score(classifier, X, y, cv=n_splits)
    return fold_scores.mean()

from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Cross-validate on labelled rows only; rows without a 'Category' cannot be scored.
labelled_data = data.dropna(subset=['Category'])
y = labelled_data['Category']

# X_vec is kept for any later cell that reads it. It is built from an unfitted
# *clone* so the shared `vectorizer` (fitted on X_train and used for X_test_vec
# and the cosine-similarity predictions) is not silently refitted here.
X_vec = clone(vectorizer).fit_transform(labelled_data['clean_text'])

for classifier in classifiers:
    # Vectorizer + model in one pipeline: the vocabulary and IDF weights are
    # learned inside each training fold instead of on the full data set, so
    # the validation folds do not leak into the features.
    cv_model = make_pipeline(clone(vectorizer), classifier)
    mean_score = evaluate_with_cross_val(cv_model, labelled_data['clean_text'], y)
    print(f"{classifier.__class__.__name__}: {mean_score:.4f}")


In [None]:
import seaborn as sns
from sklearn.metrics import confusion_matrix


def plot_cm(y_true, y_pred, class_names):
    """Draw a row-normalised confusion-matrix heatmap.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        class_names: Labels to show. They fix the row/column order of the
            matrix, so the tick labels always match the cells. Missing (NaN)
            entries are ignored.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    # Local import: pyplot is used below but is not imported by the visible
    # notebook cells.
    import matplotlib.pyplot as plt

    labels = [name for name in class_names if not pd.isna(name)]
    # Passing labels ties the matrix order to the tick labels; by default the
    # matrix uses sorted label order regardless of the order of class_names.
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Normalise each row by its number of true samples; rows without any true
    # sample stay at 0 instead of producing NaN from a division by zero.
    row_totals = cm.sum(axis=1, keepdims=True)
    cm_normalized = np.divide(
        cm,
        row_totals,
        out=np.zeros(cm.shape, dtype=float),
        where=row_totals != 0,
    )

    plt.figure(figsize=(8, 8))
    sns.heatmap(cm_normalized, annot=True, cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title('Confusion Matrix')
    plt.show()

# Refit each model on the training split and plot its confusion matrix.
for classifier in classifiers:
    classifier.fit(X_train_vec, y_train)
    y_pred = classifier.predict(X_test_vec)

    # Tick labels must follow the sorted set of labels present in y_test and
    # y_pred, which is the order confusion_matrix uses by default.
    # data['Category'].unique() is in first-appearance order and may contain
    # NaN, so it can mislabel the axes.
    class_names = sorted(set(y_test) | set(y_pred))

    print(f"Confusion Matrix for {classifier.__class__.__name__}:")
    plot_cm(y_test, y_pred, class_names)




# Cosine similarity with keyword profiles

In [None]:
def test_cosine_similarity_with_profiles(text, vectorizer, category_profiles):
    """Predict the category whose keyword profile is most similar to ``text``.

    Each profile's keywords are joined into one pseudo-document, vectorized
    with ``vectorizer`` and compared to the vectorized ``text`` by cosine
    similarity.

    Args:
        text: A single cleaned document string.
        vectorizer: Fitted vectorizer used for both the text and the profiles.
        category_profiles: Mapping of category -> list of keyword strings.

    Returns:
        The category with the highest similarity. On ties the first category
        in the mapping's iteration order wins. ``None`` is returned when the
        mapping is empty.
    """
    text_vec = vectorizer.transform([text])
    max_similarity = -1.0
    predicted_category = None

    for category, profile_keywords in category_profiles.items():
        profile_vec = vectorizer.transform([' '.join(profile_keywords)])
        # cosine_similarity returns a 1x1 array; take the scalar so the
        # comparison below is between plain floats.
        similarity = float(cosine_similarity(text_vec, profile_vec)[0, 0])
        if similarity > max_similarity:
            max_similarity = similarity
            predicted_category = category

    return predicted_category


In [None]:
# Hand-written keyword profile per category. Each list is joined into one
# pseudo-document and vectorized when profiles are compared to a text.
# NOTE(review): the predicted value is the dict key, and it is compared with
# y_test when scoring. Confirm the keys match the dataset's 'Category' labels
# exactly (spelling and case), and that 'Sports-Politics' is an actual label.
category_profiles = {
    'Sports': ['football', 'soccer', 'basketball', 'tennis', 'cricket', 'golf', 'rugby', 'athletics', 'swimming', 'baseball', 'hockey', 'olympics'],
    'Politics': ['election', 'government', 'policy', 'legislation', 'parliament', 'president', 'prime-minister', 'congress', 'senate', 'vote', 'party', 'diplomacy'],
    'Business': ['finance', 'economy', 'stock', 'market', 'investment', 'trade', 'banking', 'corporation', 'revenue', 'profit', 'loss', 'growth', 'startup'],
    'Entertainment': ['movie', 'music', 'television', 'celebrity', 'actor', 'actress', 'singer', 'festival', 'award', 'concert', 'theater', 'art', 'culture'],
    'Tech': ['technology', 'software', 'hardware', 'internet', 'computer', 'smartphone', 'artificial-intelligence', 'robotics', 'data', 'security', 'innovation', 'research'],
    # Mixed profile combining sports and politics keywords.
    'Sports-Politics': ['football', 'soccer', 'basketball', 'election', 'government', 'policy', 'legislation', 'vote', 'party'],
}

In [None]:
# Test the performance using the keyword profiles
# Predict a category for every held-out document from the keyword profiles.
y_pred_profiles = [
    test_cosine_similarity_with_profiles(text, vectorizer, category_profiles)
    for text in X_test
]

# Compute the summary numbers first, then print the three reports.
accuracy_profiles = accuracy_score(y_test, y_pred_profiles)
cm_profiles = confusion_matrix(y_test, y_pred_profiles)

print(f"Accuracy with keyword profiles: {accuracy_profiles:.4f}")
print("Classification Report with keyword profiles:\n", classification_report(y_test, y_pred_profiles))
print("Confusion Matrix with keyword profiles:\n", cm_profiles)