In [2]:
# Mount Google Drive at /content/drive; the cells below read and write
# files under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
import joblib
import os

input_dir = '/content/drive/My Drive/Colab/AS4/STEP2-feature_Engineering'   # fixed folder name
output_dir = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation'

# Load BoW and N-grams feature dataframes
df_bow = pd.read_pickle(os.path.join(input_dir, 'features_bow.pkl'))
df_ngram = pd.read_pickle(os.path.join(input_dir, 'features_ngram2.pkl'))

# Load corresponding vectorizers (re-saved below next to the model so the
# same text -> feature transformation is available at prediction time)
bow_vectorizer = joblib.load(os.path.join(input_dir, 'vectorizer_bow.pkl'))
ngram_vectorizer = joblib.load(os.path.join(input_dir, 'vectorizer_ngram2.pkl'))

# The two feature tables are concatenated column-wise and share a single
# label vector taken from df_bow, so they must contain the same rows in the
# same order. Fail loudly here instead of training on misaligned features.
if len(df_bow) != len(df_ngram):
    raise ValueError(
        "BoW and N-gram feature tables have different row counts: "
        f"{len(df_bow)} vs {len(df_ngram)}"
    )
if not np.array_equal(df_bow['label_num'].values, df_ngram['label_num'].values):
    raise ValueError(
        "BoW and N-gram feature tables disagree on 'label_num'; "
        "their rows are not aligned and cannot be combined"
    )

# Prepare labels
y = df_bow['label_num'].values

# Drop label columns so only feature columns remain
X_bow = df_bow.drop(['label', 'label_num'], axis=1).values
X_ngram = df_ngram.drop(['label', 'label_num'], axis=1).values

# Combine BoW + N-gram features (BoW columns first, then N-gram columns;
# prediction code must build its feature row in the same order)
X_combined = np.hstack([X_bow, X_ngram])

# Train model on the full combined feature matrix
model = MultinomialNB()
model.fit(X_combined, y)

# Save everything: the classifier plus both vectorizers
os.makedirs(output_dir, exist_ok=True)
joblib.dump(model, os.path.join(output_dir, 'naive_bayes_bow_ngram.pkl'))
joblib.dump(bow_vectorizer, os.path.join(output_dir, 'bow_vectorizer.pkl'))
joblib.dump(ngram_vectorizer, os.path.join(output_dir, 'ngram2_vectorizer.pkl'))

print(f"Champion Naive Bayes model trained and saved to:\n{output_dir}")


Champion Naive Bayes model trained and saved to:
/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation


Create the prediction (inference) code and export the predictions

In [4]:
import numpy as np
import pandas as pd
import joblib

# Locations of the persisted classifier and its two vectorizers
MODEL_PATH = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/naive_bayes_bow_ngram.pkl'
BOW_VECTORIZER_PATH = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/bow_vectorizer.pkl'
NGRAM_VECTORIZER_PATH = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/ngram2_vectorizer.pkl'

# Deserialize all three artifacts once, in the order model -> BoW -> N-gram,
# so every later prediction reuses the same in-memory objects
model, bow_vectorizer, ngram_vectorizer = [
    joblib.load(artifact_path)
    for artifact_path in (MODEL_PATH, BOW_VECTORIZER_PATH, NGRAM_VECTORIZER_PATH)
]

# Numeric class id -> category name, listed in id order starting at 0
# (update as needed)
label_map = dict(enumerate([
    'Gene Expression Analysis',
    'Sequence Classification',
    'Protein Structure Prediction',
    'Biological Image Analysis',
    'Disease Outcome Prediction',
]))

def predict_paper_category(text):
    """Predict the category name for a single paper text.

    The text is vectorized with both ``bow_vectorizer`` and
    ``ngram_vectorizer``; the two feature rows are concatenated (BoW
    columns first, then N-gram columns) and passed to ``model``. The
    predicted numeric label is translated through ``label_map``.

    Parameters
    ----------
    text : str
        The document to classify. Non-string values are converted with
        ``str()`` before vectorizing.

    Returns
    -------
    str or None
        The category name from ``label_map``, or ``None`` when ``text`` is
        missing (``None`` or NaN, e.g. an empty spreadsheet cell).

    Raises
    ------
    KeyError
        If the model predicts a label that is not a key of ``label_map``.
    """
    # Missing cells come through pandas as None/NaN; the vectorizers only
    # accept strings, so report "no prediction" instead of crashing a batch.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return None
    if not isinstance(text, str):
        text = str(text)

    # Transform input text to features
    bow_features = bow_vectorizer.transform([text])
    ngram_features = ngram_vectorizer.transform([text])

    # Combine BoW + N-gram features horizontally into a single dense row
    X = np.hstack([bow_features.toarray(), ngram_features.toarray()])

    # Predict label number and map to label string
    label_num = model.predict(X)[0]
    return label_map[label_num]

# Read the labeled dataset; it must provide a 'Text' column
df = pd.read_excel('/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx')

# Classify each row's text one document at a time (this may take some time)
df['predicted_Classification_label'] = df['Text'].map(predict_paper_category)

# Write the frame, now including the prediction column, to an Excel file
output_path = '/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/Raw_with_predicted_classification_label.xlsx'
df.to_excel(output_path, index=False)

print(f"Prediction complete and saved to:\n{output_path}")


Prediction complete and saved to:
/content/drive/My Drive/Colab/AS4/STEP4-Champion_Clustering&Classification_Save&Evaluation/Raw_with_predicted_classification_label.xlsx
