In [5]:
pip install xlrd>=2.0.1

Note: you may need to restart the kernel to use updated packages.


In [7]:

import pandas as pd
import random

# Step 1: Split a long string of text into 200 random 100-word chunks
def split_text_into_partitions(text, partition_size=100, num_partitions=200, seed=None):
    """Sample random contiguous word windows from ``text``.

    The text is split on whitespace and ``num_partitions`` windows of
    ``partition_size`` consecutive words are drawn at uniformly random start
    positions. Windows are drawn independently, so they may overlap or repeat.

    Args:
        text: The source string to sample from.
        partition_size: Number of words per window.
        num_partitions: Number of windows to draw.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the result is reproducible and the global ``random`` state
            is left untouched. When ``None`` the module-level generator is
            used, as before.

    Returns:
        A list of ``num_partitions`` strings, each the sampled words joined by
        single spaces. If ``text`` has fewer than ``partition_size`` words,
        every entry is the whole text; if ``text`` has no words, every entry
        is the empty string.
    """
    words = text.split()  # split by whitespace, keep all tokens

    # Private generator only when a seed is requested; otherwise keep using the
    # shared module-level one so callers that call random.seed() still work.
    rng = random if seed is None else random.Random(seed)

    # Largest start index that still leaves a full window (0 for short texts).
    last_start = max(len(words) - partition_size, 0)

    partitions = []
    for _ in range(num_partitions):
        start_idx = rng.randint(0, last_start)
        partitions.append(' '.join(words[start_idx:start_idx + partition_size]))

    return partitions


# Step 2: Load 5 Excel files, create 200 chunks from each, and label them A–E
def create_labeled_dataframe_from_excels(excel_paths, labels, partition_size=100, num_partitions=200):
    """Build a labelled DataFrame of random text chunks from Excel workbooks.

    Each workbook is read with ``pd.read_excel``; its 'Article Title' and
    'Abstract' columns are concatenated row by row and joined into one long
    string, which is then sampled with ``split_text_into_partitions``.

    Args:
        excel_paths: Paths of the workbooks to read, one per label.
        labels: Label assigned to every chunk drawn from the matching workbook.
        partition_size: Words per chunk (passed to ``split_text_into_partitions``).
        num_partitions: Chunks drawn per workbook.

    Returns:
        A DataFrame with columns 'text' and 'label', holding
        ``num_partitions`` rows per workbook in input order.

    Raises:
        ValueError: If ``excel_paths`` and ``labels`` differ in length.
            (``zip`` would otherwise silently drop the unmatched entries.)
        KeyError: If a workbook lacks the 'Article Title' or 'Abstract' column.
    """
    excel_paths = list(excel_paths)
    labels = list(labels)
    if len(excel_paths) != len(labels):
        raise ValueError(
            f"excel_paths and labels must have the same length "
            f"(got {len(excel_paths)} and {len(labels)})"
        )

    all_partitions = []

    for path, label in zip(excel_paths, labels):
        df = pd.read_excel(path)

        # Blank cells are read as NaN; astype(str) alone would inject the
        # literal word "nan" into the corpus, so blank them out first.
        titles = df['Article Title'].fillna('').astype(str)
        abstracts = df['Abstract'].fillna('').astype(str)

        # Combine Title + Abstract into a single large string
        combined_text = ' '.join((titles + ' ' + abstracts).tolist())

        # Draw the random word windows for this workbook
        partitions = split_text_into_partitions(
            combined_text,
            partition_size=partition_size,
            num_partitions=num_partitions,
        )

        # Append each partition with its label
        all_partitions.extend({'text': part, 'label': label} for part in partitions)

    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(all_partitions, columns=['text', 'label'])


# Step 3: Call the function on your files
# Seed the global RNG so the random chunk sampling (and therefore the saved
# CSV) is the same on every run over the same input files.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# One workbook per topic: dataset-1.xls ... dataset-5.xls, paired by position
# with the topic labels below.
excel_files = [f'dataset-{i}.xls' for i in range(1, 6)]
labels = [
    'gene expression analysis',
    'sequence classification and alignment',
    'protein structure and function prediction',
    'biological image analysis',
    'disease outcome prediction',
]

df_labeled = create_labeled_dataframe_from_excels(excel_files, labels)

# Optional: Save to CSV
df_labeled.to_csv('sampled_labeled_partitions.csv', index=False)

print("Done! Saved 200 x 5 = 1000 labeled 100-word records.")


Done! Saved 200 x 5 = 1000 labeled 100-word records.


In [9]:
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (only needs to be run once)
# One-time setup: fetch the corpora used by the stop-word filter and the lemmatizer.
for _resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(_resource)

# Shared preprocessing tools used by clean_text below.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))  # set for fast membership tests

# Define the cleaning function
def clean_text(text):
    """Normalise one text entry for downstream use.

    The text is lower-cased, every character other than a-z and whitespace is
    removed, tokens found in the module-level ``stop_words`` set are dropped,
    and the remaining tokens are passed through the module-level
    ``lemmatizer``.

    Args:
        text: The raw value to clean.

    Returns:
        The cleaned tokens joined by single spaces, or "" when ``text`` is
        not a string.
    """
    if isinstance(text, str):
        # Lower-case first so the a-z filter does not discard capital letters.
        letters_only = re.sub(r'[^a-z\s]', '', text.lower())
        kept_tokens = (token for token in letters_only.split() if token not in stop_words)
        return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)

    return ""

# Load the sampled labeled CSV
# Reload the sampled chunks written by the previous cell.
df = pd.read_csv('sampled_labeled_partitions.csv')

# Replace each raw chunk with its cleaned form.
df['text'] = df['text'].apply(clean_text)

# Persist the cleaned data as CSV and as line-delimited JSON records.
df.to_csv(path_or_buf='cleaned_labeled_partitions.csv', index=False)
df.to_json(
    path_or_buf='cleaned_labeled_partitions.json',
    orient='records',
    lines=True,
)

print(" 'cleaned_labeled_partitions.csv' and 'cleaned_labeled_partitions.json'")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Cella.Shang\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Cella.Shang\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Cella.Shang\AppData\Roaming\nltk_data...


 'cleaned_labeled_partitions.csv' and 'cleaned_labeled_partitions.json'
