# Pre-processing
### Cleaning
We start by using a regex to strip leftover HTML character-entity fragments (such as `#39;`) from the text.

### Lowercase
We then lowercase every word to ensure uniformity and prevent the model from treating words differently based on their capitalization.

### Special Characters / Punctuation
For this particular problem, special characters and punctuation marks don't contribute much to the meaning of the text and can be removed. (The decision could be different if this were a sentiment analysis problem rather than a topic classification problem.)

### Tokenization
We then split the text into individual words/tokens, as this helps capture semantics more effectively.

### Stopwords
Stopwords are common words that occur frequently in language, but, in a topic classification problem, carry virtually no useful information. Removing them reduces the noise in the data.

### Stemming
We reduce words to their base or root form, as this reduces the size of the vocabulary and the dimensionality of the feature space.

In [None]:
import pandas as pd

def tokenize_numbers(text):
    """Replace numeric expressions in ``text`` with placeholder tokens.

    The following substitutions are made, in this order:

    * dates written as ``m/d/yyyy`` or ``mm/dd/yyyy``  -> ``datetoken``
    * percentages such as ``92%`` or ``3.5 %``         -> ``percentage``
    * durations such as ``92 minutes`` or ``2 hours``  -> ``timetoken``
    * stand-alone four-digit numbers                   -> ``tokenyear``
    * any remaining stand-alone integer                -> ``tokennumber``

    The order matters: the more specific patterns must run before the
    generic year/number patterns, otherwise the digits they rely on have
    already been replaced (e.g. the ``2024`` inside ``03/15/2024``).

    Args:
        text: A single string. The time-unit words are matched in lower
            case only, so lower-case the text first if that matters.

    Returns:
        A new string with the numeric expressions replaced.
    """
    # Imported locally so the function does not depend on a top-level
    # ``import re`` being present elsewhere in the file.
    import re

    # Define regular expressions for different types of numbers
    date_pattern = r'\b\d{1,2}/\d{1,2}/\d{4}\b'  # Matches dates in mm/dd/yyyy format
    # No trailing \b: '%' is not a word character, so a \b after it would
    # only match when a letter or digit follows immediately ("92%x").
    percentage_pattern = r'\b\d+(?:\.\d+)?\s*%'
    time_pattern = r'\b\d+\s*(?:hours?|mins?|minutes?|secs?|seconds?)\b'
    year_pattern = r'\b\d{4}\b'
    number_pattern = r'\b\d+\b'

    # Tokenize numbers based on patterns, most specific first
    substitutions = (
        (date_pattern, 'datetoken'),
        (percentage_pattern, 'percentage'),
        (time_pattern, 'timetoken'),
        (year_pattern, 'tokenyear'),
        (number_pattern, 'tokennumber'),
    )
    for pattern, placeholder in substitutions:
        text = re.sub(pattern, placeholder, text)

    return text

# Example usage: run the number tokenizer on a sentence that mixes a year,
# a percentage, a duration and a date, then show the result.
text = (
    "I was born in 1982 and got 92% in my exam. "
    "It took me 92 minutes on 03/15/2024 to finish the race."
)
tokenized_text = tokenize_numbers(text)
print(tokenized_text)

# NOTE(review): the frame is called ``test_data`` but is read from
# 'training_data.csv' - confirm this is the intended file.
test_data = pd.read_csv('training_data.csv', delimiter=',', quotechar='"')

# Cleaning: drop fragments of the form "#<digits>;" (presumably what is left
# of HTML character entities - verify against the raw data).
test_data['text'] = test_data['text'].str.replace(r'#\d+;', '', regex=True)

# Lowercase
test_data['text'] = test_data['text'].str.lower()

# Tokenize numbers based on patterns.
# This has to happen before punctuation is stripped (the percentage and date
# patterns need '%' and '/') and before tokenization (so that 'tokens' is
# built from the text that contains the placeholders). tokenize_numbers
# works on a single string, so it is applied row by row rather than being
# handed the whole Series.
test_data['text'] = test_data['text'].apply(tokenize_numbers)

# Special Characters / Punctuation
test_data['text'] = test_data['text'].str.replace(r'[^\w\s]', '', regex=True)

# Tokenization
import nltk
test_data['tokens'] = test_data['text'].apply(nltk.word_tokenize)

# Stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
test_data['tokens'] = test_data['tokens'].apply(lambda x: [word for word in x if word.lower() not in stop_words])

# Stemming
from nltk import PorterStemmer
stemmer = PorterStemmer()

def stem_tokens(tokens):
    """Return a new list with each token passed through the module-level ``stemmer``."""
    return list(map(stemmer.stem, tokens))

# Stem every row's token list and store the result back in the same column.
stemmed_tokens = test_data['tokens'].apply(stem_tokens)
test_data['tokens'] = stemmed_tokens

print(test_data['tokens'])

1. Should we remove numbers?
2. Negation words


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

# Text and labels for the training frame (news_data) and the validation
# frame (test_data). Both frames must already exist when this cell runs.
X_train, y_train = news_data['text'], news_data['label']
X_val, y_val = test_data['text'], test_data['label']

# Bag-of-words features: word unigrams and bigrams, English stop words
# removed, vocabulary capped at 5000 terms.
vectorizer = CountVectorizer(
    analyzer='word',
    max_features=5000,
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
)

# Learn the vocabulary from the training text only and encode it. From here
# on X_train is the feature matrix, no longer the raw text.
X_train = vectorizer.fit_transform(X_train)
# Encode the validation text with the vocabulary learned above.
X_test = vectorizer.transform(X_val)


In [None]:
# Show the term -> column-index mapping learned by the fitted vectorizer.
learned_vocabulary = vectorizer.vocabulary_
print(learned_vocabulary)

In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
# One unfitted estimator per candidate algorithm. The two tree-based models
# get a fixed random_state so repeated runs are reproducible.
model_nb, model_dt, model_rf, model_lr = (
    MultinomialNB(),
    DecisionTreeClassifier(random_state=123),
    RandomForestClassifier(random_state=123),
    LogisticRegression(solver='lbfgs', max_iter=1000),
)

In [5]:
# Train every candidate on the same features and labels. fit() trains the
# estimator in place and returns that same object, so the model_* names
# keep pointing at the fitted estimators without being rebound.
for _estimator in (model_nb, model_dt, model_rf, model_lr):
    _estimator.fit(X=X_train, y=y_train)

In [None]:
# Predict on X_test, the vectorized validation features produced by
# vectorizer.transform(X_val). X_val itself still holds the raw text, which
# the models cannot consume because they were fitted on the vectorizer's
# output.
y_pred_nb = model_nb.predict(X_test)
y_pred_dt = model_dt.predict(X_test)
y_pred_rf = model_rf.predict(X_test)
y_pred_lr = model_lr.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
# Report accuracy and the confusion matrix for each model on the validation
# labels, with a blank line between consecutive reports.
model_predictions = (
    ("Naive Bayes", y_pred_nb),
    ("Decision Tree", y_pred_dt),
    ("Random Forest", y_pred_rf),
    ("Logistic Regression", y_pred_lr),
)
for position, (model_label, predictions) in enumerate(model_predictions):
    if position:
        print()
    print(model_label, accuracy_score(y_val, predictions))
    print(confusion_matrix(y_val, predictions))