In [3]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import pickle

# Load Data

In [4]:
def load_vectors(pos_file, neg_file, random_state=None):
    """
    Load positive and negative vectors, combine and shuffle them.
    
    Parameters:
    pos_file: str
        Path to file containing positive vectors (one vector per line,
        in the whitespace-separated format read by np.loadtxt).
    neg_file: str
        Path to file containing negative vectors (same format).
    random_state: int or None
        Seed for the shuffle. With None (the default) the global NumPy
        random state is used, so the row order differs between runs
        unless the caller seeded np.random beforehand.
    
    Returns:
    X: np.array
        Combined and shuffled vectors, one row per input line.
    y: np.array
        Labels aligned with the rows of X: 1.0 for vectors from pos_file,
        -1.0 for vectors from neg_file.
    """
    # Read files. ndmin=2 keeps a single-line file as one (1, n_features)
    # row; without it loadtxt returns a 1-D array, len() would count
    # features instead of vectors and the labels would not match X.
    pos_vectors = np.loadtxt(pos_file, ndmin=2)
    neg_vectors = np.loadtxt(neg_file, ndmin=2)
    
    # Create labels (echoed so the class sizes can be eyeballed; NumPy
    # abbreviates long arrays when printing)
    pos_labels = np.ones(len(pos_vectors))
    print(pos_labels)
    neg_labels = -np.ones(len(neg_vectors))
    print(neg_labels)
    
    # Combine data
    X = np.vstack((pos_vectors, neg_vectors))
    y = np.concatenate((pos_labels, neg_labels))
    
    # Shuffle rows and labels with the same permutation so they stay paired
    indices = np.arange(len(X))
    if random_state is None:
        np.random.shuffle(indices)
    else:
        np.random.default_rng(random_state).shuffle(indices)
    
    return X[indices], y[indices]

In [5]:
# Read both embedding files and get the shuffled feature matrix and labels
X, y = load_vectors(
    pos_file="data/twitter-datasets/train_pos_full_embedding.txt",
    neg_file="data/twitter-datasets/train_neg_full_embedding.txt",
)

[1. 1. 1. ... 1. 1. 1.]
[-1. -1. -1. ... -1. -1. -1.]


In [6]:
# Sanity check: array shapes first, then the first vector with its label
print(*(arr.shape for arr in (X, y)))
print(*(arr[0] for arr in (X, y)))

(2500000, 20) (2500000,)
[ 1.35861969  0.61186035  0.28082677 -0.55846596 -0.0632965  -0.46061267
  0.14617156  0.5887694  -0.19048877 -0.50327772  1.15450673  0.5642846
  0.07483755 -0.19869685  0.51805835  0.30517758 -0.3693509   0.54081395
 -0.08717991 -0.40615111] 1.0


In [7]:
def train_evaluate_svm(X, y, test_size=0.2, random_state=42):
    """
    Fit an RBF-kernel SVM on a train split and report its test performance
    
    Args:
        X: np.array
            Tweet vectors, one row per tweet
        y: np.array
            Label for each row (-1 for negative tweet / 1 for positive tweet)
        test_size: float
            Share of the rows held out for testing (default 0.2)
        random_state: int
            Seed handed to both the split and the classifier (default 42)
    
    Returns:
        model: SVC
            The fitted classifier
        accuracy: float
            Accuracy measured on the held-out rows
    """
    # NOTE(review): SVC training cost grows faster than linearly with the
    # number of rows - confirm that fitting on the full dataset is feasible
    # before running this on millions of vectors.
    X_fit, X_held, y_fit, y_held = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    
    # fit() returns the estimator itself, so construction and training chain
    classifier = SVC(
        kernel='rbf', random_state=random_state, verbose=True
    ).fit(X_fit, y_fit)
    
    # Score the held-out rows
    held_pred = classifier.predict(X_held)
    held_accuracy = accuracy_score(y_held, held_pred)
    
    # Per-class precision / recall / F1 goes to stdout
    print("Classification Report:")
    print(classification_report(y_held, held_pred))
    
    return classifier, held_accuracy

In [8]:
def predict_sentiment(model, tweet_vector):
    """
    Predict sentiment for a single tweet vector using the model
    
    Args:
        model: SVC
            Trained SVM model (any object exposing predict() on a 2-D
            array works)
        tweet_vector: array-like
            Vector representation of one tweet; a NumPy array, list or
            tuple of numbers
    
    Returns:
        prediction:
            The label produced by model.predict for this tweet. With the
            labels used for training in this notebook that is -1 for a
            negative tweet and 1 for a positive tweet.
    """
    # Coerce to an array first so plain lists/tuples are accepted, then
    # reshape to the (1, n_features) layout predict() expects for one sample
    tweet_vector = np.asarray(tweet_vector).reshape(1, -1)
    return model.predict(tweet_vector)[0]

# Train Model

In [9]:
# Fit and score the SVM using the function's default split and seed
model, accuracy = train_evaluate_svm(X=X, y=y)

[LibSVM]..........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

# Test Model