In [None]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.svm import SVC, SVR

In [2]:
# Read the car adverts CSV; the first row of the file supplies the column names.
csv_path = 'all_car_adverts.csv'
data = pd.read_csv(csv_path, header=0)

In [3]:
# Checking what's in the dataset: first rows, column summary, numeric statistics.
print(data.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would emit an extra "None" line after the summary.
data.info()
print(data.describe())

   Unnamed: 0 make  model variant  car_price car_badges car_title  \
0           0   AC  Cobra     NaN    89995.0        NaN  AC Cobra   
1           1   AC  Cobra     NaN    92500.0         ''  AC Cobra   
2           2   AC  Cobra     NaN   109995.0         ''  AC Cobra   
3           3   AC  Cobra     NaN   124950.0         ''  AC Cobra   
4           4   AC  Cobra     NaN   124950.0         ''  AC Cobra   

                                       car_sub_title  \
0                                  4.9 MK IV CRS 2dr   
1                                         378 - MkIV   
2  MK1V 212 SC. 3.5 V8 350 BHP LOTUS TWIN TURBO. ...   
3                                       302 MKIV 2dr   
4   302 MKIV With Factory Lightweight Engine 5.0 2dr   

          car_attention_grabber  \
0  GENUINE AC COBRA CRS 522 BHP   
1                 PHYSICAL CAR!   
2       FULL CARBON FIBRE BODY.   
3           ABSOLUTELY STUNNING   
4          'STAGE 3' SVO ENGINE   

                                     

In [4]:
# Numeric columns: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

In [None]:

target_column = 'car_price'

# Rows without a price cannot be used for supervised training, and scikit-learn
# estimators reject NaN in the target, so they are filtered out here.
# `data` itself is left untouched so earlier cells stay valid on re-run.
# NOTE(review): whether car_price actually contains NaN is not visible here;
# the filter is a no-op if it does not.
labelled = data.dropna(subset=[target_column])

# 'Unnamed: 0' is the row counter that was written into the CSV (0, 1, 2, ...
# in the head() output). It carries no information about the car, so it is
# excluded from the features; errors='ignore' keeps this cell working if the
# column is absent.
X = labelled.drop(columns=[target_column, 'Unnamed: 0'], errors='ignore')
y = labelled[target_column]

In [6]:
# Split the feature columns by dtype: 64-bit integer/float columns are treated
# as numeric, object-dtype columns as categorical.
numeric_columns = X.select_dtypes(include=['int64', 'float64']).columns
object_columns = X.select_dtypes(include=['object']).columns
numeric_features = numeric_columns.tolist()
categorical_features = object_columns.tolist()

In [7]:
# Route each column group through its matching transformer pipeline
# (both pipelines already include their own imputer).
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

In [8]:
# Create the SVM pipeline with preprocessing
pipeline = Pipeline(steps=[
    ('pre', preprocessor),
    ('svm', SVC(verbose=True))
])

In [None]:
# Splitting the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

[LibSVM]