In [9]:
import os
import numpy as np
import nltk
import re
import pickle
from nltk.corpus import stopwords
from sklearn.metrics import f1_score, accuracy_score, precision_score
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split


In [10]:
data_dir = "C:/text-classification/data"
folders = os.listdir(data_dir)
data = []
labels = []
for i, folder in enumerate(folders):
    folder_path = os.path.join(data_dir, folder)
    for file in os.listdir(folder_path):
        with open(os.path.join(folder_path, file), "r") as f:
            text = f.read()
        data.append(text)
        labels.append(folder)

a. For text preprocessing, I have used
* Tokenization: I have used the nltk.word_tokenize() function from the NLTK library to tokenize the text into words.
* Stopwords removal: I have used the stopwords.words('english') function from the NLTK library to remove stopwords from the text.
* Short words removal: I have removed words with less than 3 characters from the text.
* Lemmatization: I have used the WordNetLemmatizer() function from the NLTK library to lemmatize the words in the text.

In [11]:
# preprocessing
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # tokenization into unigrams and bigrams
    tokens = nltk.word_tokenize(text)
    # remove stop words and short words
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]
    # lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

In [12]:
preprocessed_data = []
for text in data:
    preprocessed_text = preprocess(text)
    preprocessed_data.append(preprocessed_text)

b. For text vectorization, I have used the TfidfVectorizer from the scikit-learn library. This vectorizer converts text data into numerical feature vectors using the term frequency-inverse document frequency (TF-IDF) approach. I have also used the ngram_range parameter to generate unigrams. This step is important for text classification models, as machine learning algorithms require numerical data as input. 

In [13]:
# Extract features using TF-IDF vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X = vectorizer.fit_transform(preprocessed_data)

In [14]:
# generate new features
document_length = np.array([len(text) for text in preprocessed_data]).reshape(-1, 1)
num_numbers = np.array([len(re.findall(r'\d+', text)) for text in preprocessed_data]).reshape(-1, 1)
num_uppercase_words = np.array([sum(1 for c in text if c.isupper()) for text in preprocessed_data]).reshape(-1, 1)

# feature matrix
X = np.hstack((X.toarray(), document_length, num_numbers, num_uppercase_words))

In [15]:
X.shape

(11413, 28353)

c. 
The machine learning algorithm used was a support vector machine (SVM) with a linear kernel and Random Forest Classifier

In [16]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.15, random_state=42)

# Train multiple machine learning models
Random_Forest_Classifier = RandomForestClassifier()
svm_Classifier = SVC()

In [17]:
Random_Forest_Classifier.fit(X_train, y_train)

In [18]:
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(Random_Forest_Classifier, f)

In [19]:
# predict
random_forest_y_predicted = Random_Forest_Classifier.predict(X_test)

In [20]:
random_forest_f1score = f1_score(y_test, random_forest_y_predicted, average='micro')
random_forest_f1score

0.6442757009345794

In [21]:
random_forest_accuracy = accuracy_score(y_test, random_forest_y_predicted)
random_forest_precision = precision_score(y_test, random_forest_y_predicted, average='micro')
random_forest_f1score = f1_score(y_test, random_forest_y_predicted, average='micro')

print("Random_Forest_Accuracy:", random_forest_accuracy)
print("Random_Forest_Precision:", random_forest_precision)
print("Random_Forest_F1Score:", random_forest_f1score)

Random_Forest_Accuracy: 0.6442757009345794
Random_Forest_Precision: 0.6442757009345794
Random_Forest_F1Score: 0.6442757009345794


In [22]:
svm_Classifier.fit(X_train, y_train)

In [23]:
with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_Classifier, f)

In [24]:
svm_y_predicted = svm_Classifier.predict(X_test)

In [25]:
SVM_f1score = f1_score(y_test,svm_y_predicted, average="micro")
SVM_f1score

0.3317757009345794

In [26]:
svm_accuracy = accuracy_score(y_test, svm_y_predicted)
svm_precision = precision_score(y_test, svm_y_predicted, average='weighted')
SVM_f1score = f1_score(y_test,svm_y_predicted, average="micro")

print("SVM_Accuracy:", svm_accuracy)
print("SVM_Precision:", svm_precision)
print("SVM_F1Score:", SVM_f1score)

SVM_Accuracy: 0.3317757009345794
SVM_Precision: 0.24672457273531334
SVM_F1Score: 0.3317757009345794


  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
# multinomial Naive-Bayes classifier
naive_bayes_classifier = MultinomialNB()

# train
naive_bayes_classifier.fit(X_train, y_train)

In [28]:
with open('naive_bayes_model.pkl', 'wb') as f:
    pickle.dump(naive_bayes_classifier, f)

In [29]:
# predict
naive_bayes_y_predicted = naive_bayes_classifier.predict(X_test)

In [30]:
naive_bayes_f1score = f1_score(y_test, naive_bayes_y_predicted, average='micro')
naive_bayes_f1score

0.4024532710280374

In [31]:
naive_bayes_accuracy = accuracy_score(y_test, naive_bayes_y_predicted)
naive_bayes_precision = precision_score(y_test, naive_bayes_y_predicted, average='weighted')
naive_bayes_f1score = f1_score(y_test, naive_bayes_y_predicted, average='micro')

print("Naive_Bayes_Accuracy:", naive_bayes_accuracy)
print("Naive_Bayes_Precision:", naive_bayes_precision)
print("Naive_Bayes_F1Score:", naive_bayes_f1score)

Naive_Bayes_Accuracy: 0.4024532710280374
Naive_Bayes_Precision: 0.28334849573395643
Naive_Bayes_F1Score: 0.4024532710280374


  _warn_prf(average, modifier, msg_start, len(result))


e. Models Evaluation
* The three evaluation metrics used in this project are accuracy score, precision score, and F1 score.
* Random Forest Classifier is better

d . Results:

In [32]:
random_forest_sample_prediction = Random_Forest_Classifier.predict(X_test[0].reshape(1,-1))
svm_sample_prediction = svm_Classifier.predict(X_test[0].reshape(1,-1))
naive_bayes_sample_prediction = naive_bayes_classifier.predict(X_test[0].reshape(1,-1))

In [33]:
random_forest_sample_prediction

array(['gnp'], dtype='<U15')

In [34]:
svm_sample_prediction

array(['earn'], dtype='<U15')

In [35]:
naive_bayes_sample_prediction 

array(['earn'], dtype='<U15')

In [36]:
# actual label
y_test[0]

'gnp'

* Actual class is "gnp"
* Random Forest Predicted class is "gnp"
* SVM Predicted class is "Earn"
* Naive Bayes Predicted class is "Earn"

f. Features that could enhance the performance
* Part-of-speech tags: This feature involves labeling each word in the text with its corresponding part-of-speech tag (e.g., noun, verb, adjective, etc.). This information could be useful for tasks such as named entity recognition, sentiment analysis, and text classification, as the grammatical structure of the text can often provide important clues about its meaning.
* Named entity recognition: This feature involves identifying and classifying specific entities (such as people, organizations, and locations) mentioned in the text. This information can be useful for a wide range of classification tasks, including sentiment analysis, topic modeling, and text classification.