In [2]:
import os
import pathway as pw
import pdfplumber
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from google.colab import drive

# Step 1: Mount Google Drive
# The dataset paths below live under the mount point, so the mount has to
# happen before any of them are used.
drive.mount('/content/drive')

# Dataset root on Google Drive; the two class folders sit directly beneath it.
data_dir = "/content/drive/My Drive/references"
publishable_dir, non_publishable_dir = (
    os.path.join(data_dir, subfolder)
    for subfolder in ("publishable", "non-publishable")
)

# Step 2: Define File Parsing Logic
def parse_content(file_path):
    """Return the extracted text of a PDF file.

    Args:
        file_path: Path of the file to read.

    Returns:
        The text of every page joined with newlines, or an empty string when
        ``file_path`` does not end in ``.pdf``.
    """
    if not file_path.endswith('.pdf'):
        return ""  # Handle unsupported formats

    with pdfplumber.open(file_path) as pdf:
        # extract_text() may return None for a page without extractable text
        # (depending on the pdfplumber version); fall back to "" so that
        # str.join does not raise TypeError on such pages.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

# Step 3: Load Data into Pathway Table
def _labelled_pdfs(directory, label):
    """Return (file_name, file_path, label) tuples for the PDFs in *directory*."""
    # os.listdir returns entries in arbitrary order; sorting keeps the record
    # order (and hence the seeded train/test split later on) reproducible.
    return [
        (name, os.path.join(directory, name), label)
        for name in sorted(os.listdir(directory))
        if name.endswith('.pdf')
    ]


# Load publishable papers (label 1)
publishable_files = _labelled_pdfs(publishable_dir, 1)

# Load non-publishable papers (label 0)
non_publishable_files = _labelled_pdfs(non_publishable_dir, 0)

# Combine data
file_records = publishable_files + non_publishable_files

# NOTE(review): confirm that Table.from_records and map_rows exist in the
# installed Pathway version before relying on this section.
file_table = pw.Table.from_records(
    file_records,
    schema=["file_name", "file_path", "label"]
)

# Parse files and add content to the table; file_path is replaced by the
# extracted text, the other two columns are carried over unchanged.
parsed_table = file_table.map_rows(
    lambda row: {
        "file_name": row["file_name"],
        "content": parse_content(row["file_path"]),
        "label": row["label"],
    }
)

# Step 4: Train a Classification Model
# Collect labeled data for training.
# The rows are materialised once so that texts and labels come from the same
# pass and line up index-for-index; collecting twice would repeat the work and
# rely on both passes yielding rows in the same order.
collected_rows = list(parsed_table.collect_rows())
texts = [row["content"] for row in collected_rows]
labels = [row["label"] for row in collected_rows]

# Convert texts to TF-IDF features (vocabulary capped at 5000 terms, English
# stop words removed)
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X = vectorizer.fit_transform(texts)
y = labels

# Split data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Evaluate the model on the held-out 20%
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
print(f"Accuracy: {accuracy:.2f}")
print(f"F1 Score: {f1:.2f}")

# Step 5: Apply Model to Unlabeled Data
# Predict labels for all parsed data (useful for validation)
predicted_labels = model.predict(vectorizer.transform(texts))

# Add predictions back to the table.
# model.predict returns a NumPy array, which has no .pop(); popping from a
# shared default argument would also tie every row to the order in which the
# callback happens to be invoked. Look the prediction up by the row's text
# instead: identical text always gets the same prediction from the fitted
# vectorizer and model, so the lookup is unambiguous even for duplicates.
prediction_by_text = dict(zip(texts, predicted_labels.tolist()))

predicted_table = parsed_table.map_rows(
    lambda row: {
        "file_name": row["file_name"],
        "content": row["content"],
        "true_label": row["label"],
        "predicted_label": prediction_by_text[row["content"]],
    }
)

# Step 6: Save Results
def _to_output_row(row):
    """Rename the three reported fields to their CSV column headers."""
    return {
        "Paper ID": row["file_name"],
        "True Label": row["true_label"],
        "Predicted Label": row["predicted_label"],
    }


# The extracted text is dropped here; only the identifier and the two labels
# are written out.
output_table = predicted_table.map_rows(_to_output_row)

pw.io.write_csv(output_table, "results.csv")

print("Pipeline executed successfully. Results saved to results.csv.")


This is not the real Pathway package.
Visit https://pathway.com/developers/ to get Pathway.
Already tried that? Visit https://pathway.com/troubleshooting/ to get help.
Note: your platform is Windows-10-10.0.22631-SP0, your Python is CPython 3.10.16.


ModuleNotFoundError: No module named 'google.colab'