## Text Classification

In [1]:
import zipfile
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

## <span style="color:blue;">Task 1: Preprocess the text to remove any stop words or punctuations.</span>

In [2]:
# unzip the labeled and unlabeled datasets
with zipfile.ZipFile('labeled_dataset.zip', 'r') as labeled_zip:
    labeled_zip.extractall('labeled_dataset')

with zipfile.ZipFile('unlabeled_dataset.zip', 'r') as unlabeled_zip:
    unlabeled_zip.extractall('unlabeled_dataset')

# paths
labeled_dataset_path = 'labeled_dataset'
unlabeled_dataset_path = 'unlabeled_dataset'


In [24]:
def remove_stop_punc(txt):
    # Tokenizing the text
    tokens = nltk.word_tokenize(txt)
    # removing the stopwords and punctuations
    tokens = [word.lower() for word in tokens if word.isalnum() and word.lower() not in stopwords.words('english')]
    # now the cleaned tokens must be join together as a string
    clean_txt = ' '.join(tokens)
    return clean_txt


labeled_data = []
labels = []

for file in os.listdir(labeled_dataset_path):
    if os.path.isfile(os.path.join(labeled_dataset_path, file)):
        with open(os.path.join(labeled_dataset_path, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                if line[0:3]!= '###':
                    sent = line[5:]
                    lab = line[:4]
                    labeled_data.append(remove_stop_punc(sent.strip()))
                    labels.append(lab)


In [25]:
labeled_data

['although internet level topology extensively studied past years little known details taxonomy',
 'node represent wide variety organizations e g large isp small private business university vastly different network characteristics external connectivity patterns network growth tendencies properties hardly neglect working veracious internet representations simulation environments',
 'paper introduce radically new approach based machine learning techniques map ases internet natural taxonomy',
 'successfully classify number number percent ases expected accuracy number number percent',
 'release community level topology dataset augmented number taxonomy information number set attributes used classify ases',
 'believe dataset serve invaluable addition understanding structure evolution internet',
 'rapid expansion internet last two decades produced large scale system thousands diverse independently managed networks collectively provide global connectivity across wide spectrum geopolitical env

In [26]:
labels

['MISC',
 'MISC',
 'AIMX',
 'OWNX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'AIMX',
 'OWNX',
 'OWNX',
 'AIMX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'AIMX',
 'OWNX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'AIMX',
 'AIMX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'BASE',
 'OWNX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'AIMX',
 'OWNX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'AIMX',
 'OWNX',
 'BASE',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'OWNX',
 'AIMX',
 'OWNX',
 'OWNX',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 'MISC',
 

## <span style="color:blue;">Task 2: Use TF-IDF to vectorize the sentences.</span>

In [22]:
X_train, X_test, y_train, y_test = train_test_split(labeled_data, labels, test_size=0.2, random_state=42)   
                    
# Feature extraction using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

## <span style="color:blue;">Task 3: Use Scikit learn to create a (naïve bayes) classifier.</span>

<span style="color:blue;">**First create a classifier (naïve bayes) to classify the given dataset into 6 categories: (AIMX, OWNX, CONT, BASE, NUMBER, and MISC).<br>Then, use the classifier to label the sentences in the unlabeled dataset.**</span>

### 3-1 Evaluating on training dataset and see the accuracy

In [23]:
# Train the Naive Bayes classifier
classifier = MultinomialNB()
classifier.fit(X_train_vectorized, y_train)

y_pred = classifier.predict(X_test_vectorized)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division= np.nan))

Accuracy: 0.76
Classification Report:
               precision    recall  f1-score   support

        AIMX       1.00      0.33      0.50         6
        BASE        nan      0.00       nan         2
        CONT        nan      0.00       nan         7
        MISC       0.71      1.00      0.83        27
        OWNX       0.83      0.90      0.86        21

    accuracy                           0.76        63
   macro avg       0.85      0.45      0.73        63
weighted avg       0.79      0.76      0.81        63



### 3-2 Doing prediction of labels on Unlabeled dataset

In [28]:
# Use the trained classifier to predict labels for unlabeled data
unlabeled_data = []

for file in os.listdir(unlabeled_dataset_path):
    if os.path.isfile(os.path.join(unlabeled_dataset_path, file)):
        with open(os.path.join(unlabeled_dataset_path, file), 'r', encoding='utf-8') as f:
            lines = f.readlines()
            for line in lines:
                if line[0:3]!= '###':
                    unlabeled_data.append(remove_stop_punc(line.strip()))

# Vectorize the unlabeled data
unlabeled_data_vectorized = vectorizer.transform(unlabeled_data)

# Predict labels for the unlabeled data
predicted_labels = classifier.predict(unlabeled_data_vectorized)

# Print or save the predicted labels for further use
for sentence, label in zip(unlabeled_data, predicted_labels):
    print(f"{label}: {sentence}")

OWNX: transporter analyses conducted 141 organisms whose complete genome sequences available
OWNX: organism complete set membrane transport systems identified predicted functions classified protein families based transporter classification system
MISC: organisms larger genome sizes generally possessed relatively greater number transport systems
MISC: prokaryotes unicellular eukaryotes significant factor increase transporter content genome size greater diversity transporter types
MISC: contrast multicellular eukaryotes greater number paralogs specific transporter families important factor increase transporter content genome size
MISC: eukaryotic prokaryotic intracellular pathogens endosymbionts exhibited markedly limited transport capabilities
MISC: hierarchical clustering phylogenetic profiles transporter families derived presence absence certain transporter family showed clustering patterns organisms correlated evolutionary history overall physiology lifestyles
MISC: membrane transpor

## <span style="color:blue;">Task 4: Summarize your work and your findings in a few sentences.</span>

Initially, I downloaded and extracted both labeled and unlabeled zip files. After that, I wrote a **function to remove stop words and punctuation**. Within the **labeled** file, I utilized a **for-loop** to extract **labels from the texts**. I also **removed the stop words and punctuation from text**. After that I put labels and texts into **seperate lists**. I have done the text preprocessing, tokenization, eliminating stopwords and punctuation to training purposes.

Following this preprocessing, I executed a **train-test split** and employed the **TF-IDF vectorizer** to transform the text data into **vectors**. The TF-IDF feature extraction was conducted using scikit-learn's TfidfVectorizer. In the subsequent phase, I utilized the **Naive Bayes classifier** to assess **predictions on the training dataset**, resulting in a model evaluation **accuracy of 76%**. The Multinomial Naive Bayes classifier was trained on the labeled dataset and evaluated on its test set, achieving an accuracy of 0.76, as indicated in a detailed classification report.

    Classification report: The text classification model achieved an overall accuracy of 76%. The classification report provides a detailed breakdown of precision, recall, and F1-score for each class. Notably, the class AIMX exhibits perfect precision but lower recall, indicating that while the model correctly identifies instances of AIMX, it may miss some. The class BASE has undefined precision, possibly due to no true positive predictions, highlighting a limitation. Similarly, the classes CONT and MISC have undefined precision and recall for certain metrics, possibly due to no true positive or true negative predictions. On the other hand, the class OWNX demonstrates strong precision, recall, and F1-score, suggesting robust performance in identifying instances of OWNX. The overall macro-average and weighted-average metrics provide a comprehensive evaluation of the model's performance across all classes, with a weighted average F1-score of 0.81, showcasing a reasonably effective text classification outcome.

Afterward, I started to work on **unlabeled data** file. I **removed the stop words and punctuation from the unlabeled data file**, and I applied **Naive Bayes model** to **predict labels in this data file**. I utilizing the trained model to predict labels for an unlabeled dataset and printing the results.

**In summary, in this workshop I offered a comprehensive overview of text classification processes, showcasing successful model training, evaluation, and prediction steps.**

### <span style="color:blue;">BONUS [Extra 5% added to any workshops]: Use other classification approaches to label the unlabeled sentences. Evaluate your work using precision, recall, and f1 score.</span>

## <span style="color:orange;">Support Vector Machine (SVM) classifier</span>

In [30]:
from sklearn.svm import SVC

# Split the labeled data for training the SVM classifier
X_train_svm, X_test_svm, y_train_svm, y_test_svm = train_test_split(labeled_data, labels, test_size=0.2, random_state=42)

# Feature extraction using TfidfVectorizer for SVM
vectorizer_svm = TfidfVectorizer()
X_train_vectorized_svm = vectorizer_svm.fit_transform(X_train_svm)
X_test_vectorized_svm = vectorizer_svm.transform(X_test_svm)

# Train the SVM classifier
svm_classifier = SVC()
svm_classifier.fit(X_train_vectorized_svm, y_train_svm)

# Evaluate the SVM classifier
y_pred_svm = svm_classifier.predict(X_test_vectorized_svm)
accuracy_svm = accuracy_score(y_test_svm, y_pred_svm)
print(f"SVM Classifier Accuracy: {accuracy_svm:.2f}")
print("SVM Classification Report:\n", classification_report(y_test_svm, y_pred_svm, zero_division=np.nan))

# Vectorize the unlabeled data for SVM
unlabeled_data_vectorized_svm = vectorizer_svm.transform(unlabeled_data)

# Predict labels for the unlabeled data using SVM
predicted_labels_svm = svm_classifier.predict(unlabeled_data_vectorized_svm)

# Print or save the predicted labels for further use
for sentence, label in zip(unlabeled_data, predicted_labels_svm):
    print(f"{label}: {sentence}")

SVM Classifier Accuracy: 0.73
SVM Classification Report:
               precision    recall  f1-score   support

        AIMX       0.67      0.33      0.44         6
        BASE        nan      0.00       nan         2
        CONT        nan      0.00       nan         7
        MISC       0.69      1.00      0.82        27
        OWNX       0.81      0.81      0.81        21

    accuracy                           0.73        63
   macro avg       0.72      0.43      0.69        63
weighted avg       0.74      0.73      0.77        63

MISC: transporter analyses conducted 141 organisms whose complete genome sequences available
OWNX: organism complete set membrane transport systems identified predicted functions classified protein families based transporter classification system
MISC: organisms larger genome sizes generally possessed relatively greater number transport systems
MISC: prokaryotes unicellular eukaryotes significant factor increase transporter content genome size great

    **RESULT** The Support Vector Machine (SVM) classifier achieved an overall accuracy of 73% on the labeled dataset, its accuracy is less than Naive Bayes classifier. The precision, recall, and F1-score metrics provide a detailed evaluation of the classifier's performance for each class. Notably, the class 'AIMX' exhibits a precision of 67%, indicating that among the instances predicted as 'AIMX,' 67% are true positive. However, the recall for 'AIMX' is relatively low at 33%, suggesting that only 33% of actual 'AIMX' instances were correctly identified by the classifier. The 'MISC' class demonstrates strong performance with a precision of 69%, recall of 100%, and an F1-score of 82%, indicating a well-balanced classification for this category. It's important to note that some classes, such as 'BASE' and 'CONT,' show undefined precision and F1-score due to zero true positive predictions, indicating potential challenges in correctly identifying these classes. Overall, the classifier exhibits reasonable performance, but further tuning and exploration may be beneficial to enhance results, especially for classes with limited instances.

## <span style="color:orange;">Random Forest Classifier</span>

In [31]:
from sklearn.ensemble import RandomForestClassifier

# Split the labeled data for training the Random Forest classifier
X_train_rf, X_test_rf, y_train_rf, y_test_rf = train_test_split(labeled_data, labels, test_size=0.2, random_state=42)

# Feature extraction using TfidfVectorizer for Random Forest
vectorizer_rf = TfidfVectorizer()
X_train_vectorized_rf = vectorizer_rf.fit_transform(X_train_rf)
X_test_vectorized_rf = vectorizer_rf.transform(X_test_rf)

# Train the Random Forest classifier
rf_classifier = RandomForestClassifier()
rf_classifier.fit(X_train_vectorized_rf, y_train_rf)

# Evaluate the Random Forest classifier
y_pred_rf = rf_classifier.predict(X_test_vectorized_rf)
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
print(f"Random Forest Classifier Accuracy: {accuracy_rf:.2f}")
print("Random Forest Classification Report:\n", classification_report(y_test_rf, y_pred_rf, zero_division=np.nan))

# Vectorize the unlabeled data for Random Forest
unlabeled_data_vectorized_rf = vectorizer_rf.transform(unlabeled_data)

# Predict labels for the unlabeled data using Random Forest
predicted_labels_rf = rf_classifier.predict(unlabeled_data_vectorized_rf)

# Print or save the predicted labels for further use
for sentence, label in zip(unlabeled_data, predicted_labels_rf):
    print(f"{label}: {sentence}")


Random Forest Classifier Accuracy: 0.67
Random Forest Classification Report:
               precision    recall  f1-score   support

        AIMX       0.60      0.50      0.55         6
        BASE        nan      0.00       nan         2
        CONT       0.00      0.00       nan         7
        MISC       0.68      0.96      0.80        27
        OWNX       0.72      0.62      0.67        21

    accuracy                           0.67        63
   macro avg       0.50      0.42      0.67        63
weighted avg       0.61      0.67      0.72        63

MISC: transporter analyses conducted 141 organisms whose complete genome sequences available
OWNX: organism complete set membrane transport systems identified predicted functions classified protein families based transporter classification system
MISC: organisms larger genome sizes generally possessed relatively greater number transport systems
MISC: prokaryotes unicellular eukaryotes significant factor increase transporter conte

    **RESULT** The Random Forest classifier achieved an overall accuracy of 67% on the labeled dataset, its accuracy is less than SVM classifier and Naive Bayes classifier. The precision, recall, and F1-score metrics provide insights into the classifier's performance for each class. The 'AIMX' class exhibits a precision of 60%, indicating that 60% of instances predicted as 'AIMX' are true positives. However, the recall for 'AIMX' is 50%, suggesting that only 50% of actual 'AIMX' instances were correctly identified by the classifier. The 'MISC' class shows relatively balanced precision (68%) and recall (96%), resulting in a commendable F1-score of 80%, indicating a robust classification for this category. Notably, some classes, such as 'BASE' and 'CONT,' exhibit undefined precision and F1-score, reflecting challenges in correctly identifying these classes. The classifier demonstrates moderate overall performance, and further parameter tuning or exploration of alternative classifiers may enhance specific class predictions.

### The final result indicates that the Naive Bayes classifier, with an accuracy of 76%, is the better choice for predicting labels on the unlabeled data file compared to the SVM and Random Forest classifiers.