In [1]:
# Mount Google Drive at /content/drive so the cells below can read and write
# files under '/content/drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# ============================
# Step 0: Setup & Preprocessing
# ============================
import pandas as pd
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the cleaning step relies on
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Shared objects used by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# ============================
# Step 1: Text Cleaning Utilities
# ============================
def clean_text(text):
    """Normalize raw text into a space-separated string of lemmatized words.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, drop words found in the module-level
    ``stop_words`` set, and lemmatize the rest with the module-level
    ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (e.g. NaN from an empty
            spreadsheet cell) is treated as missing.

    Returns:
        The cleaned words joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Substitute a space (not an empty string) for punctuation and digits so
    # that terms joined by a hyphen, slash, etc. are split into separate words
    # instead of being fused into one token ("rna-seq" -> "rna seq").
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

def truncate_text(text, target_words=150, rng=None):
    """Return a random contiguous window of ``target_words`` words from ``text``.

    Args:
        text: Whitespace-separated text.
        target_words: Number of words to keep. Must be non-negative.
        rng: Optional ``random.Random``-like object providing ``randint``.
            Pass a seeded instance for reproducible windows. When ``None``,
            the module-level ``random`` generator is used.

    Returns:
        ``text`` unchanged when it has fewer than ``target_words`` words;
        otherwise ``target_words`` consecutive words joined by single spaces,
        starting at a randomly chosen position.

    Raises:
        ValueError: If ``target_words`` is negative.
    """
    if target_words < 0:
        raise ValueError(f"target_words must be non-negative, got {target_words}")

    words = text.split()
    if len(words) < target_words:
        return text

    picker = random if rng is None else rng
    # randint is inclusive on both ends, so the last valid start is len - target.
    start = picker.randint(0, len(words) - target_words)
    return ' '.join(words[start:start + target_words])

# ============================
# Step 2: Main Processing Function
# ============================
def prepare_final_dataset(excel_paths, file_labels, samples_per_file=200, words_per_doc=150,
                          random_state=42):
    """Build one labeled DataFrame from several Excel workbooks.

    For each workbook, up to ``samples_per_file`` rows are sampled; the
    'Article Title' and 'Abstract' cells are combined, cleaned with
    ``clean_text`` and cut to at most ``words_per_doc`` words with
    ``truncate_text``.

    Args:
        excel_paths: Paths of the Excel files to read.
        file_labels: One label per path, in the same order.
        samples_per_file: Maximum number of rows sampled from each file.
        words_per_doc: Maximum number of words kept per document.
        random_state: Integer seed (or ``None``) used both for row sampling
            and for the truncation window, so repeated runs give the same
            output.

    Returns:
        A DataFrame holding every original column of the sampled rows plus
        'Text' (processed text) and 'Label' (the file's label). Columns with
        those names already present in a workbook are overwritten.

    Raises:
        ValueError: If ``excel_paths`` and ``file_labels`` differ in length.
    """
    excel_paths = list(excel_paths)
    file_labels = list(file_labels)
    # zip() would silently ignore the surplus paths/labels; fail loudly instead.
    if len(excel_paths) != len(file_labels):
        raise ValueError(
            f"Got {len(excel_paths)} paths but {len(file_labels)} labels; "
            "they must match one-to-one."
        )

    def _cell_text(value):
        # Empty Excel cells come back as NaN; formatting NaN into a string
        # would inject the literal word "nan" into the document text.
        return "" if pd.isna(value) else str(value)

    all_records = []

    # truncate_text draws from the module-level `random` generator, so seed it
    # for the duration of this call and restore the caller's state afterwards.
    saved_state = random.getstate()
    random.seed(random_state)
    try:
        for path, file_label in zip(excel_paths, file_labels):
            df = pd.read_excel(path)
            df_sampled = df.sample(n=min(samples_per_file, len(df)), random_state=random_state)

            for _, row in df_sampled.iterrows():
                parts = (
                    _cell_text(row.get('Article Title', '')),
                    _cell_text(row.get('Abstract', '')),
                )
                combined_text = ' '.join(part for part in parts if part)
                cleaned = clean_text(combined_text)
                final_text = truncate_text(cleaned, target_words=words_per_doc)

                # Copy original row to dict, then add processed fields
                record = row.to_dict()
                record['Text'] = final_text
                record['Label'] = file_label

                all_records.append(record)
    finally:
        random.setstate(saved_state)

    return pd.DataFrame(all_records)

# ============================
# Step 3: Execute the Pipeline
# ============================
# Labels are positional: the n-th label belongs to dataset-n.xlsx
file_labels = [
    'gene expression analysis',
    'sequence classification and alignment',
    'protein structure and function prediction',
    'biological image analysis',
    'disease outcome prediction'
]
excel_files = [
    f'/content/drive/My Drive/Colab/AS4/dataset-{i}.xlsx'
    for i in range(1, 6)
]

df_final = prepare_final_dataset(excel_files, file_labels)

# ============================
# Step 4: Export only one final file
# ============================
# Write the combined frame to a single workbook, without the index column
output_path_final = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx'
df_final.to_excel(output_path_final, index=False)
print(f"Final dataset saved to:\n{output_path_final}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Final dataset saved to:
/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx


In [None]:
# ============================
# Step 0: Setup & Preprocessing
# ============================
import pandas as pd
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the cleaning step relies on
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

# Shared objects used by clean_text
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# ============================
# Step 1: Text Cleaning Utilities
# ============================
def clean_text(text):
    """Normalize raw text into a space-separated string of lemmatized words.

    Steps: lowercase, replace every character that is not ``a-z`` or
    whitespace with a space, drop words found in the module-level
    ``stop_words`` set, and lemmatize the rest with the module-level
    ``lemmatizer``.

    Args:
        text: The raw text. Any non-string value (e.g. NaN from an empty
            spreadsheet cell) is treated as missing.

    Returns:
        The cleaned words joined by single spaces, or ``""`` when ``text``
        is not a string.
    """
    if not isinstance(text, str):
        return ""
    # Substitute a space (not an empty string) for punctuation and digits so
    # that terms joined by a hyphen, slash, etc. are split into separate words
    # instead of being fused into one token ("rna-seq" -> "rna seq").
    letters_only = re.sub(r'[^a-z\s]', ' ', text.lower())
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    )

def truncate_text(text, target_words=150, rng=None):
    """Return a random contiguous window of ``target_words`` words from ``text``.

    Args:
        text: Whitespace-separated text.
        target_words: Number of words to keep. Must be non-negative.
        rng: Optional ``random.Random``-like object providing ``randint``.
            Pass a seeded instance for reproducible windows. When ``None``,
            the module-level ``random`` generator is used.

    Returns:
        ``text`` unchanged when it has fewer than ``target_words`` words;
        otherwise ``target_words`` consecutive words joined by single spaces,
        starting at a randomly chosen position.

    Raises:
        ValueError: If ``target_words`` is negative.
    """
    if target_words < 0:
        raise ValueError(f"target_words must be non-negative, got {target_words}")

    words = text.split()
    if len(words) < target_words:
        return text

    picker = random if rng is None else rng
    # randint is inclusive on both ends, so the last valid start is len - target.
    start = picker.randint(0, len(words) - target_words)
    return ' '.join(words[start:start + target_words])

# ============================
# Step 2: Main Processing Function
# ============================
def prepare_labeled_dataset(excel_paths, file_labels, samples_per_file=200, words_per_doc=150,
                            random_state=42):
    """Build a labeled DataFrame with metadata and intermediate text columns.

    For each workbook, up to ``samples_per_file`` rows are sampled; the
    'Article Title' and 'Abstract' cells are combined, cleaned with
    ``clean_text`` and cut to at most ``words_per_doc`` words with
    ``truncate_text``.

    Args:
        excel_paths: Paths of the Excel files to read.
        file_labels: One label per path, in the same order.
        samples_per_file: Maximum number of rows sampled from each file.
        words_per_doc: Maximum number of words kept per document.
        random_state: Integer seed (or ``None``) used both for row sampling
            and for the truncation window, so repeated runs give the same
            output.

    Returns:
        A DataFrame with the selected metadata columns, 'Original Label',
        'combined_text', 'cleaned', 'Final_Text', 'Label_with_Code' and
        'word_count' (number of words in 'Final_Text').

    Raises:
        ValueError: If ``excel_paths`` and ``file_labels`` differ in length.
        KeyError: If a workbook lacks the 'Article Title' or 'Abstract' column.
    """
    excel_paths = list(excel_paths)
    file_labels = list(file_labels)
    # zip() would silently ignore the surplus paths/labels; fail loudly instead.
    if len(excel_paths) != len(file_labels):
        raise ValueError(
            f"Got {len(excel_paths)} paths but {len(file_labels)} labels; "
            "they must match one-to-one."
        )

    def _cell_text(value):
        # Empty Excel cells come back as NaN; formatting NaN into a string
        # would inject the literal word "nan" into the document text.
        return "" if pd.isna(value) else str(value)

    # Columns copied verbatim from the source row ('' when a column is absent).
    metadata_columns = [
        'Author',
        'Article Title',
        'Document Type',
        'Keywords',
        'Abstract',
        'Times Cited',
        'Publication Year',
        'DOI Link',
    ]

    all_records = []

    # truncate_text draws from the module-level `random` generator, so seed it
    # for the duration of this call and restore the caller's state afterwards.
    saved_state = random.getstate()
    random.seed(random_state)
    try:
        for path, file_label in zip(excel_paths, file_labels):
            df = pd.read_excel(path)
            df_sampled = df.sample(n=min(samples_per_file, len(df)), random_state=random_state)

            for _, row in df_sampled.iterrows():
                parts = (_cell_text(row['Article Title']), _cell_text(row['Abstract']))
                combined_text = ' '.join(part for part in parts if part)
                cleaned = clean_text(combined_text)
                processed = truncate_text(cleaned, target_words=words_per_doc)

                record = {column: row.get(column, '') for column in metadata_columns}
                record['Original Label'] = row.get('Label', '')
                record['combined_text'] = combined_text
                record['cleaned'] = cleaned
                record['Final_Text'] = processed
                record['Label_with_Code'] = file_label
                record['word_count'] = len(processed.split())

                all_records.append(record)
    finally:
        random.setstate(saved_state)

    return pd.DataFrame(all_records)

# ============================
# Step 3: Execute the Pipeline
# ============================
# Labels are positional: the n-th label belongs to dataset-n.xlsx
file_labels = [
    'gene expression analysis',
    'sequence classification and alignment',
    'protein structure and function prediction',
    'biological image analysis',
    'disease outcome prediction'
]
excel_files = [
    f'/content/drive/My Drive/Colab/AS4/dataset-{i}.xlsx'
    for i in range(1, 6)
]

# ============================
# Step 4: Export full labeled dataset
# ============================
# Full table: metadata, intermediate text columns and the final text
df_labeled = prepare_labeled_dataset(excel_files, file_labels)
output_path_full = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/Validation_Metadata.xlsx'
df_labeled.to_excel(output_path_full, index=False)
print(f"Metadata & Processed Dataset Saved: {output_path_full}")

# ============================
# Step 5: Export Cleaned Final Text + Label
# ============================
# Keep only the processed text and its label, renamed to Text / Label
export_columns = {'Final_Text': 'Text', 'Label_with_Code': 'Label'}
df_final_export = df_labeled[list(export_columns)].rename(columns=export_columns)

output_path_final = '/content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx'
df_final_export.to_excel(output_path_final, index=False)
print(f"Exported Cleaned Dataset: {output_path_final}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Metadata & Processed Dataset Saved: /content/drive/My Drive/Colab/AS4/STEP1-data_prepare/Validation_Metadata.xlsx
Exported Cleaned Dataset: /content/drive/My Drive/Colab/AS4/STEP1-data_prepare/final_labeled_dataset.xlsx
