In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import joblib
import ast

# Read the arXiv abstracts dump (must be downloaded from Kaggle beforehand):
# https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts?select=arxiv_data_210930-054931.csv
df = pd.read_csv('arxiv_data_210930-054931.csv')

# Each 'terms' cell is parsed from its string form into a Python literal
# (presumably a list of arXiv category codes -- it is iterated as such later).
df['terms'] = df['terms'].map(ast.literal_eval)

# Quick look at what was loaded
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("Columns:", list(df.columns))
print("\nSample titles:")
sample_titles = df['titles'].head()
print(sample_titles)

# arXiv category code -> target topic label used by the classifier
topic_mapping = dict([
    ('cs.AI', 'AI'),
    ('cs.LG', 'ML'),
    ('cs.CL', 'AI'),                              # Computational Linguistics -> AI
    ('cs.NE', 'Reinforcement Learning'),          # Neural and Evolutionary Computing
    ('q-bio.QM', 'Medicine'),                     # Quantitative Methods in Biology
    ('q-bio.NC', 'Medicine'),                     # Neurons and Cognition
    ('physics.class-ph', 'Physics of motion'),    # Classical Physics
    ('stat.ML', 'ML'),                            # Statistics -> Machine Learning
])
def map_to_topic(terms, mapping=None):
    """Return the topic for the first category in ``terms`` that has a mapping.

    Args:
        terms: Iterable of arXiv category codes (e.g. ``['cs.LG', 'stat.ML']``).
            Order matters: the first code found in the mapping wins.
        mapping: Optional dict from category code to topic label. When omitted,
            the notebook-level ``topic_mapping`` is used, so existing calls
            such as ``df['terms'].apply(map_to_topic)`` behave as before.

    Returns:
        The topic label of the first mapped category, or ``None`` when none of
        the categories is mapped (including when ``terms`` is empty).
    """
    if mapping is None:
        # Resolved at call time so re-running the cell that defines
        # topic_mapping is picked up without redefining this function.
        mapping = topic_mapping
    return next((mapping[term] for term in terms if term in mapping), None)
# Create our target variable by mapping categories
df['topic'] = df['terms'].apply(map_to_topic)
print(df['topic'].head(5))

# Filter only the rows with our desired topics
df = df[df['topic'].notna()]

# Check class distribution
print("\nClass distribution:")
topic_counts = df['topic'].value_counts()
print(topic_counts)

if topic_counts.empty:
    # Fail early with a clear message instead of an opaque error from sampling.
    raise ValueError("No rows matched any category in topic_mapping; nothing to train on.")

# Balance the classes by down-sampling every topic to the size of the rarest one.
min_samples = int(topic_counts.min())
df = (
    # GroupBy.sample keeps the 'topic' column and takes a seed. The previous
    # groupby(...).apply(lambda x: x.sample(...)) was unseeded and relied on
    # apply operating on the grouping column, which pandas has deprecated;
    # once the grouping column is excluded, reset_index(drop=True) would
    # silently discard 'topic'.
    df.groupby('topic')
      .sample(n=min_samples, random_state=42)
      # Rows come back grouped by topic; shuffle so that any later prefix- or
      # order-based sampling does not see one class at a time.
      .sample(frac=1, random_state=42)
      .reset_index(drop=True)
)

# Preprocess text data
def preprocess_text(text):
    """Normalise a title: lowercase it and drop every non-alphanumeric, non-whitespace character.

    Non-string input (e.g. NaN or None) yields an empty string. Removed
    characters are deleted, not replaced, so "self-supervised" becomes
    "selfsupervised"; whitespace is left untouched.
    """
    if isinstance(text, str):
        lowered = text.lower()
        kept_chars = (ch for ch in lowered if ch.isalnum() or ch.isspace())
        return ''.join(kept_chars)
    return ""

df['processed_title'] = df['titles'].apply(preprocess_text)

# Split data (stratified so every topic keeps the same share in train and test)
X = df['processed_title']
y = df['topic']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Create the model pipeline: TF-IDF features -> elastic-net logistic regression
model = Pipeline([
    ('tfidf', TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 2),  # Use unigrams and bigrams
        max_features=10000,
        min_df=5,
        max_df=0.7
    )),
    ('clf', LogisticRegression(
        max_iter=1000,
        class_weight='balanced',
        # multi_class is intentionally not passed: it is deprecated in recent
        # scikit-learn, and the default already fits a multinomial model for
        # the saga solver when there are more than two classes.
        solver='saga',  # required for the elastic-net penalty
        penalty='elasticnet',
        l1_ratio=0.5,  # Mix of L1 and L2 regularization
        random_state=42  # saga is stochastic; fix the seed for reproducible fits
    ))
])

# Train model
print("\nTraining model...")
model.fit(X_train, y_train)

# Evaluate on the held-out split
y_pred = model.predict(X_test)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Save the model
# NOTE: joblib files are pickle-based; only load this file from a trusted source.
joblib.dump(model, 'arxiv_title_classifier.pkl')
print("\nModel saved as 'arxiv_title_classifier.pkl'")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [36]:
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import learning_curve

# 1. Plot Learning Curve
def plot_learning_curve(estimator, title, X, y, cv=5, n_jobs=-1, random_state=42):
    """Plot training and cross-validation accuracy against training-set size.

    Args:
        estimator: Unfitted scikit-learn estimator (or pipeline); it is cloned
            and re-fitted by ``learning_curve`` for every size and fold.
        title: Figure title.
        X: Training inputs.
        y: Target labels aligned with ``X``.
        cv: Cross-validation strategy forwarded to ``learning_curve``.
        n_jobs: Number of parallel workers. Pass ``1`` to run in-process when
            worker processes cannot be started (e.g. ``BrokenProcessPool``).
        random_state: Seed for the shuffling described below.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs,
        train_sizes=np.linspace(0.1, 1.0, 5),
        scoring='accuracy',
        # learning_curve takes *prefixes* of each training fold. If the rows
        # are ordered by class, the small prefixes contain a single class and
        # the fit fails (NaN scores), so shuffle before taking prefixes.
        shuffle=True,
        random_state=random_state,
    )

    # Aggregate the per-fold scores for every training size
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.figure(figsize=(10, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Accuracy")
    plt.grid()

    # Shaded bands show +/- one standard deviation across folds
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()

plot_learning_curve(model, "Learning Curve", X, y)

# 2. Plot Validation Curve for different values of C (inverse regularization strength)
from sklearn.model_selection import validation_curve

param_range = np.logspace(-4, 4, 10)
# Tune C through the whole pipeline ("clf__C") so the TF-IDF step is re-fitted
# on each training fold. Transforming X up front with the already-fitted
# vectorizer would let vocabulary/IDF statistics leak from validation rows.
# NOTE(review): a BrokenProcessPool was recorded for this cell; if worker
# processes cannot start in this environment, set n_jobs=1.
train_scores, test_scores = validation_curve(
    model,
    X,
    y,
    param_name="clf__C",
    param_range=param_range,
    cv=5,
    scoring="accuracy",
    n_jobs=-1
)

# Aggregate the per-fold scores for every value of C
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 6))
plt.title("Validation Curve with Logistic Regression")
# In scikit-learn, C is the *inverse* of the regularization strength:
# larger C means weaker regularization.
plt.xlabel("Inverse Regularization Strength (C)")
plt.ylabel("Accuracy")
plt.ylim(0.0, 1.1)
lw = 2

plt.semilogx(param_range, train_scores_mean, label="Training score",
             color="darkorange", lw=lw)
plt.fill_between(param_range, train_scores_mean - train_scores_std,
                 train_scores_mean + train_scores_std, alpha=0.2,
                 color="darkorange", lw=lw)
plt.semilogx(param_range, test_scores_mean, label="Cross-validation score",
             color="navy", lw=lw)
plt.fill_between(param_range, test_scores_mean - test_scores_std,
                 test_scores_mean + test_scores_std, alpha=0.2,
                 color="navy", lw=lw)
plt.legend(loc="best")
plt.show()

# 3. Plot Confusion Matrix
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

y_pred = model.predict(X_test)
# Fix the label order explicitly so matrix rows/columns match the tick labels
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
disp.plot(cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.xticks(rotation=45)
plt.show()

# 4. Plot Accuracy by Class
from sklearn.metrics import precision_recall_fscore_support

precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, labels=model.classes_)
# Row-normalised diagonal: the share of each true class that was predicted
# correctly (i.e. per-class recall). Guard against a class with no test rows.
class_totals = np.sum(cm, axis=1)
accuracy = np.divide(
    np.diag(cm), class_totals,
    out=np.zeros(len(class_totals), dtype=float),
    where=class_totals > 0,
)

plt.figure(figsize=(10, 6))
plt.bar(model.classes_, accuracy)
plt.title("Accuracy by Class")
plt.ylabel("Fraction of class predicted correctly")
plt.ylim(0.0, 1.0)
plt.xticks(rotation=45)
# The original cell ended with a bare `plt` expression; render the figure instead.
plt.show()

BrokenProcessPool: A task has failed to un-serialize. Please ensure that the arguments of the function are all picklable.