After loading the dataset in from our we can begin creating the SVM pipeline to asses it's performance.

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def preprocess_data(json_file):
    # Load JSON data
    data = pd.read_json(json_file)

    # Create DataFrame
    df = pd.DataFrame(data)

    # Rename columns for better readability
    df.rename(columns={'Title': 'Title',
                       'str: Context': 'Context',
                       'Body': 'Body',
                       'Comment': 'Comment',
                       'L1: Type': 'L1_Type',
                       'L2: Abusiveness': 'L2_Abusiveness',
                       'L3: Target': 'L3_Target',
                       'L4: Demographic Characteristics': 'L4_Demographic_Characteristics',
                       'L5: Implicitness': 'L5_Implicitness',
                       'L6: Profanity': 'L6_Profanity'}, inplace=True)

    # Drop unnecessary columns
    df.drop(columns=['Title', 'Context', 'Body', 'L1_Type', 'L3_Target', 'L4_Demographic_Characteristics', 'L5_Implicitness'], inplace=True)

    # Convert categorical columns to numeric using one-hot encoding
    df = pd.get_dummies(df, columns=['L2_Abusiveness', 'L6_Profanity'])

    # Split data into features and labels
    X = df.drop(columns=['Comment'])
    y = df['Comment']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Feature extraction using CountVectorizer
    vectorizer = CountVectorizer()
    X_train = vectorizer.fit_transform(X_train['Comment'])
    X_test = vectorizer.transform(X_test['Comment'])

    return X_train, X_test, y_train, y_test


def train_svm(X_train, X_test, y_train, y_test):
    # Create SVM classifier
    svm = SVC(kernel='linear')

    # Train SVM classifier
    svm.fit(X_train, y_train)

    # Make predictions
    y_pred = svm.predict(X_test)

    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    return accuracy, precision, recall, f1


# Example usage
json_file = 'Dataset/train.json'
X_train, X_test, y_train, y_test = preprocess_data(json_file)
accuracy, precision, recall, f1 = train_svm(X_train, X_test, y_train, y_test)
print("Accuracy: {:.2f}".format(accuracy))
print("Precision: {:.2f}".format(precision))
print("Recall: {:.2f}".format(recall))
print("F1-score: {:.2f}".format(f1))


KeyError: "['Title', 'Context', 'Body', 'L1_Type', 'L3_Target', 'L4_Demographic_Characteristics', 'L5_Implicitness'] not found in axis"