In [1]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
from ucimlrepo import fetch_ucirepo

# Download the UCI "Automobile" dataset (repository id 10).
automobile = fetch_ucirepo(id=10)

# Split the payload into the feature frame and the target frame.
dataset_frames = automobile.data
X, y = dataset_frames.features, dataset_frames.targets

# Dump the dataset-level metadata followed by the per-variable description.
print(automobile.metadata, automobile.variables, sep="\n")


{'uci_id': 10, 'name': 'Automobile', 'repository_url': 'https://archive.ics.uci.edu/dataset/10/automobile', 'data_url': 'https://archive.ics.uci.edu/static/public/10/data.csv', 'abstract': "From 1985 Ward's Automotive Yearbook", 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 205, 'num_features': 25, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': [], 'target_col': ['symboling'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1985, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5B01C', 'creators': ['Jeffrey Schlimmer'], 'intro_paper': None, 'additional_info': {'summary': 'This data set consists of three types of entities: (a) the specification of an auto in terms of various characteristics, (b) its assigned insurance risk rating, (c) its normalized losses in use as compared to other cars.  The second rating corresponds to the degree to which th

In [3]:
# Preview the first five rows of the target frame.
y.head()

Unnamed: 0,symboling
0,3
1,3
2,1
3,2
4,2


In [4]:
# Preview the first five rows of the feature frame.
X.head()

Unnamed: 0,price,highway-mpg,city-mpg,peak-rpm,horsepower,compression-ratio,stroke,bore,fuel-system,engine-size,...,length,wheel-base,engine-location,drive-wheels,body-style,num-of-doors,aspiration,fuel-type,make,normalized-losses
0,13495.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,
1,16500.0,27,21,5000.0,111.0,9.0,2.68,3.47,mpfi,130,...,168.8,88.6,front,rwd,convertible,2.0,std,gas,alfa-romero,
2,16500.0,26,19,5000.0,154.0,9.0,3.47,2.68,mpfi,152,...,171.2,94.5,front,rwd,hatchback,2.0,std,gas,alfa-romero,
3,13950.0,30,24,5500.0,102.0,10.0,3.4,3.19,mpfi,109,...,176.6,99.8,front,fwd,sedan,4.0,std,gas,audi,164.0
4,17450.0,22,18,5500.0,115.0,8.0,3.4,3.19,mpfi,136,...,176.6,99.4,front,4wd,sedan,4.0,std,gas,audi,164.0


In [5]:
# Count missing values in each feature column.
X.isna().sum()

Unnamed: 0,0
price,4
highway-mpg,0
city-mpg,0
peak-rpm,2
horsepower,2
compression-ratio,0
stroke,4
bore,4
fuel-system,0
engine-size,0


In [6]:
# Work on a copy of the features without the `normalized-losses` column
# (NOTE(review): presumably dropped because it is largely missing — confirm).
x_data = X.drop(columns='normalized-losses')

In [7]:
# Re-check per-column missing-value counts on the reduced feature frame.
x_data.isna().sum()

Unnamed: 0,0
price,4
highway-mpg,0
city-mpg,0
peak-rpm,2
horsepower,2
compression-ratio,0
stroke,4
bore,4
fuel-system,0
engine-size,0


In [8]:
# Inspect each column's dtype; `object` columns are the ones label-encoded later.
x_data.dtypes

Unnamed: 0,0
price,float64
highway-mpg,int64
city-mpg,int64
peak-rpm,float64
horsepower,float64
compression-ratio,float64
stroke,float64
bore,float64
fuel-system,object
engine-size,int64


In [9]:
# Forward-fill the remaining gaps: each missing value takes the value from the
# closest preceding row that has one. `DataFrame.ffill()` replaces the
# deprecated `fillna(method='ffill')`, and rebinding the name (instead of
# `inplace=True`) avoids mutating a frame that earlier cells displayed.
# Note: forward fill cannot fill a gap in the very first row of a column.
x_data = x_data.ffill()

  x_data.fillna(method='ffill', inplace=True)


In [10]:
from sklearn.preprocessing import LabelEncoder

# One shared encoder, re-fitted on every column in turn; after this cell it
# only remembers the classes of the last column it processed.
label_encoder = LabelEncoder()

# Replace every object-typed column with its integer label codes.
object_columns = x_data.select_dtypes(include=['object']).columns
x_data[object_columns] = x_data[object_columns].apply(label_encoder.fit_transform)

In [11]:
def detect_outliers_iqr(data, factor=1.5):
    """Flag values that fall outside the interquartile-range fences.

    For each column (or for the single Series) the fences are
    ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` where ``IQR = Q3 - Q1``.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Numeric data to screen. Quantiles are computed per column for a
        DataFrame.
    factor : float, default 1.5
        Width of the fences in multiples of the IQR. The default reproduces
        the conventional 1.5 * IQR rule.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        Boolean mask with the same shape as ``data``; ``True`` marks an
        outlier. Missing values compare ``False`` on both sides and are
        therefore never flagged.

    Raises
    ------
    ValueError
        If ``factor`` is negative (the fences would cross over).
    """
    if factor < 0:
        raise ValueError("factor must be non-negative")

    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Strict inequalities: values sitting exactly on a fence are kept.
    return (data < lower_bound) | (data > upper_bound)

# Cell-level outlier flags for every value in x_data, then collapsed to one
# flag per row: a row is an outlier if any of its columns is flagged.
outlier_mask = detect_outliers_iqr(x_data)
outlier_indices = outlier_mask.any(axis=1)

# Keep only the clean rows, applying the same row selection to features and
# targets so the two stay aligned.
rows_to_keep = ~outlier_indices
x_data_no_outliers = x_data.loc[rows_to_keep]
y_no_outliers = y.loc[rows_to_keep]

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score

# Train/test split: hold out 20% of the outlier-free rows; the fixed seed keeps
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    x_data_no_outliers, y_no_outliers, test_size=0.2, random_state=42
)

# scikit-learn classifiers and metrics expect a 1-D label vector. Flattening
# here works whether the targets are a one-column DataFrame or a Series and
# avoids the column-vector conversion warning.
y_train_labels = y_train.to_numpy().ravel()
y_test_labels = y_test.to_numpy().ravel()

# Decision Tree Classifier (baseline on the unscaled features)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train_labels)
y_pred_dt = dt_model.predict(X_test)  # Predictions on the held-out test set
accuracy_dt = accuracy_score(y_test_labels, y_pred_dt)
print(f"Decision Tree Accuracy: {accuracy_dt}")

# SVM with manually chosen hyperparameters on standardized features.
# The scaler is fitted on the training split only, so no test-set statistics
# leak into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_model = SVC(C=10, kernel='rbf', gamma='scale', random_state=42)
# Fit on the true training labels. Fitting on the decision tree's predictions
# would make the SVM imitate the tree instead of learning from the data.
svm_model.fit(X_train_scaled, y_train_labels)
y_pred_svm = svm_model.predict(X_test_scaled)
accuracy_svm = accuracy_score(y_test_labels, y_pred_svm)
print(f"Improved SVM Accuracy (StandardScaler + Manual Tuning): {accuracy_svm}")

Decision Tree Accuracy: 0.75
Improved SVM Accuracy (StandardScaler + Manual Tuning): 0.8
