# Pre-processing
### Cleaning
We start by using a regex to strip leftover HTML character-entity codes (e.g. `#39;`) from the text.

### Lowercase
We then lowercase every word to ensure uniformity and prevent the model from treating words differently based on their capitalization.

### Special Characters / Punctuation
For this particular problem, special characters and punctuation marks don't contribute much to the meaning of the text and can be removed (this might not hold for a sentiment analysis problem, as opposed to topic classification).

### Tokenization
We then split the text into individual words/tokens, as it helps capture semantics more effectively.

### Stopwords
Stopwords are common words that occur frequently in language, but, in a topic classification problem, carry virtually no useful information. Removing them reduces the noise in the data.

### Stemming
We reduce words to their base or root form, as it reduces the size of the vocabulary and the dimensionality of the feature space.

In [None]:
import pandas as pd
import re
import nltk
from nltk import PorterStemmer
from nltk.corpus import stopwords
# Fetch the NLTK resources used further down: the stop-word lists read via
# stopwords.words('english') and the 'punkt' tokenizer data.
# NOTE(review): presumably 'punkt' is what nltk.word_tokenize loads; newer
# NLTK releases may look for 'punkt_tab' instead - confirm against the
# installed version.
nltk.download('stopwords')
nltk.download('punkt')

# Single module-level Porter stemmer instance, used by stem_tokens().
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a new list holding the stem of every token, in order.

    Each token is passed through the module-level ``stemmer``; the input
    sequence itself is left untouched.
    """
    stemmed = []
    for word in tokens:
        stemmed.append(stemmer.stem(word))
    return stemmed

# Ordered (pattern, placeholder) rules. Order matters: the more specific
# shapes (percentages, full dates) must be rewritten before the generic
# four-digit "year" and plain-number rules get a chance to consume their
# digits.
_NUMBER_TOKEN_RULES = (
    # 12% / 3.5%
    (re.compile(r'\b\d+(?:\.\d+)?%'), 'percentagetoken'),
    # Dates in mm/dd/yyyy and dd/mm/yyyy formats
    (re.compile(r'\b\d{1,2}/\d{1,2}/\d{4}\b'), 'datetoken'),
    # Any stand-alone four-digit number is treated as a year
    (re.compile(r'\b\d{4}\b'), 'yeartoken'),
    # A number followed by a (lowercase) duration unit
    (re.compile(
        r'\b\d+\s*(?:hours?|mins?|minutes?|secs?|seconds?|days?|weeks?'
        r'|months?|years?|decades?)\b'
    ), 'timetoken'),
    # Remaining numbers. Thousands groups ("1,000") and a decimal part
    # ("3.5") belong to the same number, so they are consumed as one match
    # instead of yielding "numbertoken.numbertoken", which would fuse into a
    # single junk word once punctuation is stripped.
    (re.compile(r'\b\d+(?:,\d{3})*(?:\.\d+)?\b'), 'numbertoken'),
)


def tokenize_numbers(text):
    """Replace numeric expressions in ``text`` with placeholder words.

    Percentages, slash-separated dates, four-digit years, durations and any
    remaining numbers are rewritten to ``percentagetoken``, ``datetoken``,
    ``yeartoken``, ``timetoken`` and ``numbertoken`` respectively, so that
    different concrete values collapse onto one vocabulary entry each.

    Args:
        text: The string to rewrite.

    Returns:
        The rewritten string; text without digits is returned unchanged.
    """
    for pattern, placeholder in _NUMBER_TOKEN_RULES:
        text = pattern.sub(placeholder, text)
    return text

# Publisher / news-agency names that are dropped as extra stop words when
# process_data() is called with ``type >= 2``.
_SOURCE_STOP_TERMS = ['AP', 'Reuters', 'space.com', 'techweb', 'MacCentral',
                      'PC World', 'USATODAY.COM', 'CNN']


def _normalize_stop_terms(terms):
    """Split raw stop terms into single-token words and multi-token phrases.

    Every term gets the same lowercasing and punctuation stripping that the
    text receives, so that e.g. 'space.com' becomes 'spacecom' and can
    actually match a token. Returns ``(words, phrases)`` where ``words`` is a
    set of strings and ``phrases`` is a list of token lists.
    """
    words = set()
    phrases = []
    for term in terms:
        parts = re.sub(r'[^\w\s]', '', term.lower()).split()
        if len(parts) == 1:
            words.add(parts[0])
        elif parts:
            phrases.append(parts)
    return words, phrases


def _drop_phrases(tokens, phrases):
    """Return ``tokens`` without any consecutive run that equals a phrase."""
    kept = []
    position = 0
    while position < len(tokens):
        matched = 0
        for phrase in phrases:
            if tokens[position:position + len(phrase)] == phrase:
                matched = len(phrase)
                break
        if matched:
            position += matched
        else:
            kept.append(tokens[position])
            position += 1
    return kept


def process_data(data, type):
    """Clean the 'text' column of ``data`` and derive a stemmed 'tokens' column.

    Steps: strip ``#<digits>;`` entity residue, lowercase, (for ``type >= 2``)
    replace numeric expressions with placeholder words, remove punctuation,
    tokenize, remove stop words and stem.

    Args:
        data: DataFrame with a string 'text' column. It is modified in place:
            'text' is overwritten with the cleaned text and 'tokens' is added.
        type: Processing level. Values ``>= 2`` additionally enable number
            placeholders and the removal of publisher names. (The name
            shadows the builtin but is kept for caller compatibility.)

    Returns:
        The same DataFrame object that was passed in.

    Side effects:
        Prints the resulting 'tokens' column.
    """
    extended = type >= 2

    # Cleaning. This has to happen before number placeholders are inserted:
    # otherwise '#39;' is first turned into '#numbertoken;', no longer matches
    # this pattern and leaves a spurious 'numbertoken' in the text.
    data['text'] = data['text'].str.replace(r'#\d+;', '', regex=True)

    # Lowercase
    data['text'] = data['text'].str.lower()

    # Number placeholders. Done on lowercased text so that the lowercase unit
    # names of the duration pattern also match e.g. '10 Days', and before
    # punctuation removal because the patterns rely on '%', '/' and '.'.
    if extended:
        data['text'] = data['text'].apply(tokenize_numbers)

    # Special Characters / Punctuation
    data['text'] = data['text'].str.replace(r'[^\w\s]', '', regex=True)

    # Tokenization
    data['tokens'] = data['text'].apply(nltk.word_tokenize)

    # Stopwords
    stop_words = set(stopwords.words('english'))
    stop_phrases = []
    if extended:
        # The raw names are mixed-case and contain punctuation/spaces, while
        # tokens are lowercase single words; normalise them so they can match.
        source_words, stop_phrases = _normalize_stop_terms(_SOURCE_STOP_TERMS)
        stop_words.update(source_words)

    def _filter(tokens):
        if stop_phrases:
            tokens = _drop_phrases(tokens, stop_phrases)
        return [word for word in tokens if word.lower() not in stop_words]

    data['tokens'] = data['tokens'].apply(_filter)

    # Stemming
    data['tokens'] = data['tokens'].apply(stem_tokens)

    print(data['tokens'])

    return data

In [117]:
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from pycaret.classification import *
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix



def _fit_and_report(name, model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training matrix and print its validation metrics."""
    print('----------------------------------------------------------------')
    model.fit(X=X_train, y=y_train)
    y_pred = model.predict(X_val)
    print(name + " Accuracy:", accuracy_score(y_val, y_pred))
    print(name + " Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))
    print(name + " Classification Report:")
    print(classification_report(y_val, y_pred))


def train_data(training_data, test_data, models=None):
    """Vectorize the 'text' columns, then fit and evaluate each classifier.

    A bag-of-words representation (unigrams and bigrams, at most 5000
    features, English stop words removed) is fitted on the training text only
    and then applied to the evaluation text. Every model is fitted on the
    training matrix and its accuracy, confusion matrix and classification
    report on the evaluation set are printed.

    NOTE(review): only the 'text' column is read here; a 'tokens' column, if
    present, is ignored. If the stemmed / stop-word-filtered tokens from
    pre-processing are meant to feed the models, they would have to be joined
    and passed to the vectorizer instead - confirm intent.

    Args:
        training_data: DataFrame with 'text' and 'label' columns used for
            fitting.
        test_data: DataFrame with 'text' and 'label' columns used for
            evaluation.
        models: Optional list of ``(display_name, estimator)`` pairs to
            evaluate, in order. Defaults to Naive Bayes and Logistic
            Regression. Slower estimators can be enabled by passing them
            explicitly, e.g.
            ``("Decision Tree", DecisionTreeClassifier(random_state=123))`` or
            ``("Random Forest", RandomForestClassifier(random_state=123))``.

    Returns:
        None. All results are printed.
    """
    X_train = training_data['text']
    y_train = training_data['label']
    X_val = test_data['text']
    y_val = test_data['label']

    if models is None:
        models = [
            ("Naive Bayes", MultinomialNB()),
            ("Logistic Regression",
             LogisticRegression(solver='lbfgs', max_iter=1000)),
        ]

    vectorizer = CountVectorizer(analyzer='word', max_features=5000, lowercase=True, stop_words='english', ngram_range=(1, 2))

    # Learn the vocabulary from the training text only, then reuse it for the
    # evaluation text so both matrices share the same feature columns.
    X_train = vectorizer.fit_transform(X_train)
    X_val = vectorizer.transform(X_val)

    for index, (name, model) in enumerate(models):
        if index:
            # Blank line between consecutive model reports.
            print()
        _fit_and_report(name, model, X_train, y_train, X_val, y_val)
        

In [None]:
# Load the training and test sets from CSV files in the working directory.
training_data = pd.read_csv('training_data.csv')
# Alternative training file (disabled); presumably a smaller sample - verify.
#training_data = pd.read_csv('training_data_xs.csv')
test_data = pd.read_csv('test_data.csv')

# Run both sets through the same pre-processing at level 2, which also
# enables the number-placeholder and extra stop-word steps in process_data.
training_data = process_data(training_data, 2)
test_data = process_data(test_data, 2)

In [None]:
# Fit the classifiers on the training set and print their metrics on the test set.
train_data(training_data, test_data)