In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Columns used by the model: the predictors and the label to be predicted.
features = ['OIPID', 'SupplierID', 'Country', 'InstitutionName']
target = 'ServiceType'

# Read the merged CSV into a DataFrame.
data = pd.read_csv("merged_output.csv", low_memory=False)

# Store the label column using pandas' categorical dtype.
data[target] = data[target].astype('category')

# Keep only the rows in which the label and every predictor are present.
data_clean = data.dropna(subset=[target, *features])

# Separate the predictors (X) from the label (y).
X, y = data_clean[features], data_clean[target]

# Every selected feature is treated as categorical. Derive the list from
# `features` (defined above) instead of repeating the column names, so the two
# cannot drift apart: a column that is selected into X but not listed in a
# transformer would be dropped by ColumnTransformer's default remainder policy.
categorical_features = list(features)

# Per-column preprocessing applied to the categorical features:
#   1. replace missing entries with the column's most frequent value,
#   2. one-hot encode, ignoring categories that were not seen during fit.
# NOTE: rows with missing feature values are already removed by the dropna
# above, so during fit the imputer has nothing to fill; it only matters if
# data containing missing values is passed to the fitted pipeline later.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
    ])

# Full model: preprocessing followed by a linear-kernel support vector classifier.
classifier = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', SVC(kernel='linear', random_state=42)),
])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the pipeline on the training portion, then predict labels for the
# held-out rows (fit returns the fitted pipeline, so the calls can be chained).
y_pred = classifier.fit(X_train, y_train).predict(X_test)

# Evaluate the model on the held-out test set.
print("Accuracy:", accuracy_score(y_test, y_pred))

# Some classes in the test set are never predicted, which makes their precision
# undefined (0/0). zero_division=0 reports those scores as 0.0 — the same value
# the default setting prints — without emitting an UndefinedMetricWarning for
# every affected class and average.
report = classification_report(y_test, y_pred, zero_division=0)
print("\nClassification Report:\n", report)


Accuracy: 0.8706042010856738


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
                                                    precision    recall  f1-score   support

                                           Anthos       0.00      0.00      0.00         1
Apache Kafka® & Apache Flink® on Confluent Cloud™       0.00      0.00      0.00         1
                Apache Kafka® on Confluent Cloud™       0.00      0.00      0.00        13
                                       App Engine       0.31      0.13      0.18        39
                                        Appliance       0.00      0.00      0.00        24
                                     Applications       0.00      0.00      0.00        18
                        Article for Azure Overage       0.90      0.99      0.94      3740
                                Artifact Registry       0.00      0.00      0.00        34
                                     Azure Commit       0.87      0.27      0.41       196
                                    Azure Overage       0.92    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
