In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import os

# Load the combined-news table (tab-separated, UTF-8) from the project data directory.
data_dir = '/projappl/project_2006600/fin_experiment/data'
news_csv_path = os.path.join(data_dir, 'data_combined_news.csv')
data_combined_news = pd.read_csv(news_csv_path, sep='\t', encoding='utf-8')

# 'All_news' is the input column fed to the vectorizer; 'Label' is the target.
x = data_combined_news['All_news']
y = data_combined_news['Label']

# Hold out 20% of the rows for validation; the fixed seed keeps the shuffle reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# TF-IDF features: the vocabulary (capped at 1000 terms) is fitted on the training
# split only and then applied unchanged to the validation split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_valid)  # built from X_valid, despite the "test" name

# Fit a linear-kernel SVM on the training features (fit() returns the estimator itself).
svm = SVC(kernel="linear", C=1.0, random_state=42).fit(X_train_tfidf, y_train)

# Score the held-out split.
y_pred = svm.predict(X_test_tfidf)
accuracy = accuracy_score(y_valid, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# NOTE(review): target_names are paired with the label values in sorted order, so this
# presumes exactly two classes ordered Negative < Positive (e.g. 0/1) — confirm against the data.
class_names = ["Negative", "Positive"]
print(classification_report(y_valid, y_pred, target_names=class_names))


Accuracy: 0.5101
              precision    recall  f1-score   support

    Negative       0.42      0.35      0.38       171
    Positive       0.56      0.63      0.59       227

    accuracy                           0.51       398
   macro avg       0.49      0.49      0.49       398
weighted avg       0.50      0.51      0.50       398

