In [1]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import cvxpy as cp

# Fetch a two-category slice of 20 Newsgroups (all splits combined), asking
# scikit-learn to strip headers, footers and quoted replies from each post.
categories = ['comp.graphics', 'talk.politics.misc']
newsgroups = fetch_20newsgroups(
    subset='all',
    categories=categories,
    remove=('headers', 'footers', 'quotes'),
)

# Show the first two documents together with their category names as a
# quick sanity check of what was loaded.
print("Text samples from the dataset subset:")
for i in (0, 1):
    label_index = newsgroups.target[i]
    category_name = newsgroups.target_names[label_index]
    print(f"\nCategory: {category_name}")
    print(newsgroups.data[i])

# Hold out 25% of the documents for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.25,
    random_state=42,
)

# TF-IDF features, capped at 5000 terms with English stop words removed.
# The vocabulary is learned from the training documents only and then
# reused for the test documents.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Standardize each feature column. The sparse TF-IDF matrices are converted
# to dense arrays first; the scaler statistics come from the training set
# and are applied unchanged to the test set.
scaler = StandardScaler()
X_train_dense = X_train_tfidf.toarray()
X_train_scaled = scaler.fit_transform(X_train_dense)
X_test_dense = X_test_tfidf.toarray()
X_test_scaled = scaler.transform(X_test_dense)

# Linear SVM-style classifier for the text features, formulated with cvxpy.
# NOTE(review): the original comment calls this a "proximal SVM"; the
# objective as written is hinge loss + 0.5 * ||w||_2^2 + C * ||w||_1 --
# confirm the intended formulation.
n, d = X_train_scaled.shape

# Optimization variables: one weight per feature plus a scalar offset.
w = cp.Variable(d)
b = cp.Variable()

# Weight applied to the L1 penalty term.
C = 1.0

# Map the 0/1 targets onto -1/+1 so they can be used as margin signs.
y_binary = 2 * y_train - 1

# Signed margin of every training sample under the decision function
# X @ w - b, followed by the summed hinge penalty on margins below 1.
margins = cp.multiply(y_binary, X_train_scaled @ w - b)
hinge_total = cp.sum(cp.pos(1 - margins))

# Data-fit term plus the squared-L2 penalty on the weights.
loss = hinge_total + 0.5 * cp.norm(w, 2) ** 2

# L1 penalty on the weights.
regularization = C * cp.norm(w, 1)

# Assemble the unconstrained minimization and hand it to cvxpy's default
# solver; the fitted values are then available as w.value and b.value.
problem = cp.Problem(cp.Minimize(loss + regularization))
problem.solve()

# Predict on the test set.
# Training mapped the 0/1 targets to -1/+1 (y_binary = 2 * y_train - 1), so a
# positive decision value corresponds to class 1 and a non-positive one to
# class 0. np.sign would yield -1/0/+1, which can never equal a 0 label in
# y_test, so every class-0 sample would be scored as wrong. Threshold the
# decision values back into the original 0/1 encoding instead.
decision_values = X_test_scaled @ w.value - b.value
y_pred = (decision_values > 0).astype(int)

# Evaluate the performance: fraction of test documents whose predicted 0/1
# label matches the true label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")


ModuleNotFoundError: No module named 'cvxpy'