In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from scipy.sparse import hstack

In [2]:
# Load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = ["age", "workclass", "fnlwgt", "education", "education-num", "marital-status", 
                "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", 
                "hours-per-week", "native-country", "income"]
df = pd.read_csv(url, names=column_names)

In [3]:
# Encode the target variable
label_encoder = LabelEncoder()
df['income'] = label_encoder.fit_transform(df['income'])

In [4]:
# Separate features and target variable
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Perform one-hot encoding on the categorical features
categorical_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country']
ct = ColumnTransformer([('encoder', OneHotEncoder(), categorical_features)], remainder='passthrough')
X = ct.fit_transform(X)

In [5]:
# Combine the numerical and categorical features
X_num = X[:, :6]
X_cat = X[:, 6:]
X_combined = hstack((X_num, X_cat))

# Scale the data using StandardScaler
sc = StandardScaler(with_mean=False)
X = sc.fit_transform(X_combined)

In [6]:
df.shape

(32561, 15)

In [9]:
import numpy as np
from sklearn.svm import SVC
kernel_list = ['linear', 'poly', 'rbf', 'sigmoid']

# Split the dataset into 70-30 train-test split with 10 different samples
samples = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=i)
    samples.append((X_train, X_test, y_train, y_test))

results = []
for i, (X_train, X_test, y_train, y_test) in enumerate(samples):
    # Generate random values of C and gamma 100 times
    C_list = np.random.uniform(0, 1, size=100)
    gamma_list = np.random.uniform(0, 1, size=100)

    # Try different kernels with random values of C and gamma and record the best one
    best_kernel = ""
    best_acc = 0
    best_params = {}
    for j in range(100):
        # Train an SVM with the random values of C, gamma, and kernel
        C = C_list[j]
        gamma = gamma_list[j]
        kernel = np.random.choice(kernel_list)
        svm = SVC(C=C, gamma=gamma, kernel=kernel, max_iter=100)
        svm.fit(X_train, y_train)

        # Calculate the accuracy on the test set
        acc = svm.score(X_test, y_test)

        # Record the random values of C, gamma, kernel and the accuracy
        if acc > best_acc:
            best_kernel = kernel
            best_acc = acc
            best_params = {"C": C, "gamma": gamma, "kernel": kernel}

    # Record the best kernel, random values of C and gamma, and the accuracy
    results.append((i+1, best_acc, best_params))
    print(f"Sample {i+1}: accuracy = {best_acc:.3f}, C = {best_params['C']:.3f}, gamma = {best_params['gamma']:.3f}, kernel = {best_params['kernel']}")

# Create a pandas dataframe to display the results
df_results = pd.DataFrame(results, columns=["Sample No.", "Best Accuracy", "Best Parameters"])



Sample 1: accuracy = 0.799, C = 0.046, gamma = 0.220, kernel = poly




Sample 2: accuracy = 0.809, C = 0.474, gamma = 0.100, kernel = poly




Sample 3: accuracy = 0.796, C = 0.530, gamma = 0.180, kernel = poly




Sample 4: accuracy = 0.773, C = 0.928, gamma = 0.039, kernel = rbf




Sample 5: accuracy = 0.767, C = 0.624, gamma = 0.756, kernel = rbf




Sample 6: accuracy = 0.763, C = 0.928, gamma = 0.319, kernel = rbf




Sample 7: accuracy = 0.796, C = 0.214, gamma = 0.101, kernel = poly




Sample 8: accuracy = 0.767, C = 0.363, gamma = 0.950, kernel = rbf




Sample 9: accuracy = 0.760, C = 0.632, gamma = 0.987, kernel = rbf




Sample 10: accuracy = 0.765, C = 0.501, gamma = 0.036, kernel = rbf




In [10]:
import matplotlib.pyplot as plt

best_sample = max(results, key=lambda x: x[1])[0]
print(f"Best sample: {best_sample}")

# Train an SVM on the best sample with different values of max_iter and record the accuracy
max_iters = list(range(1, 101))
accuracies = []
for max_iter in max_iters:
    svm = SVC(C=results[best_sample-1][2]["C"], gamma=results[best_sample-1][2]["gamma"], kernel=results[best_sample-1][2]["kernel"], max_iter=max_iter)
    svm.fit(samples[best_sample-1][0], samples[best_sample-1][2])
    acc = svm.score(samples[best_sample-1][1], samples[best_sample-1][3])
    accuracies.append(acc)

# Plot a graph of accuracy vs iterations
plt.plot(max_iters, accuracies)
plt.xlabel("Iterations")
plt.ylabel("Accuracy")
plt.show()


Best sample: 2




AttributeError: module 'matplotlib.cbook' has no attribute '_safe_first_finite'

<Figure size 640x480 with 0 Axes>