# Supervised Learning Coursework 1 Coding Part

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

##### Initialise the data for Question 1

In [None]:
# Training set for Question 1: the points (1, 3), (2, 2), (3, 0), (4, 5).
points = ((1, 3), (2, 2), (3, 0), (4, 5))
data_x = np.array([px for px, _ in points])
data_y = np.array([py for _, py in points])

### Part 1

#### Question 1

In [None]:

def extract_features(x, degree):
    """Build the polynomial design matrix for the inputs ``x``.

    Column ``j`` of the result holds ``x**j`` for ``j = 0, ..., degree``, so a
    1-D ``x`` of length ``n`` produces an ``(n, degree + 1)`` array.
    """
    powers = [x ** exponent for exponent in range(degree + 1)]
    # Each power is one row at this point; transpose so each sample is a row.
    return np.transpose(np.array(powers))

def linear_regression(X, y):
    """Return the least-squares weights ``w`` minimising ``||X @ w - y||^2``.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
        Design matrix, one row per sample.
    y : array of shape (n_samples,)
        Target values.

    Returns
    -------
    numpy.ndarray
        Weight vector of length ``n_features``.

    Notes
    -----
    The fit is delegated to ``np.linalg.lstsq`` instead of solving the normal
    equations ``X.T @ X w = X.T @ y``. Forming ``X.T @ X`` squares the
    condition number of the problem and fails outright when the columns of
    ``X`` are linearly dependent; ``lstsq`` returns the minimum-norm solution
    in that case.
    """
    weights, _residuals, _rank, _singular_values = np.linalg.lstsq(
        X, y, rcond=None
    )
    return weights

##### Question 1a

In [None]:
# Question 1a: fit polynomial bases of dimension k = 1..4 (degree k - 1) to the
# data points and overlay the fitted curves on the data.
plt.figure(figsize=(10, 6))
x_plot = np.linspace(0, 5, 500)

for degree in range(0, 4):
    X = extract_features(data_x, degree)
    # Draw the curve from the full-precision coefficients: rounding them to
    # two decimals first visibly shifts the higher-degree curves.
    coeffs = linear_regression(X, data_y)
    X_plot = extract_features(x_plot, degree)
    y_plot = X_plot @ coeffs
    # The basis dimension k is one more than the polynomial degree.
    plt.plot(x_plot, y_plot, label=f'k={degree + 1}')

plt.scatter(data_x, data_y, color='red', label='Data Points')
plt.xlabel('x')
plt.ylabel('y')
plt.ylim(-5, 10)
plt.title('Polynomial Fits for Different Degrees')
plt.legend()
plt.grid(True)
# Move the axes so that they cross at the origin and hide the unused frame.
ax = plt.gca()
ax.spines['bottom'].set_position(('data', 0))
ax.spines['left'].set_position(('data', 0))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
plt.show()

##### Question 1b

In [None]:
# Question 1b: report the fitted coefficients (lowest power first), rounded to
# two decimals for display only.
for degree in range(0, 4):
    X = extract_features(data_x, degree)
    coeffs = linear_regression(X, data_y).round(2)
    # The fitted polynomial has degree `degree`; its basis dimension is k = degree + 1.
    print(f"k={degree + 1} (degree {degree}) polynomial coefficients: {coeffs}")

##### Question 1c

In [None]:
# Question 1c: training mean squared error for each basis dimension k = 1..4.
for degree in range(0, 4):
    X = extract_features(data_x, degree)
    # Keep the coefficients at full precision: predictions made from rounded
    # coefficients would inflate the reported error.
    coeffs = linear_regression(X, data_y)
    y_pred = X @ coeffs
    print(y_pred)
    mse = np.mean((data_y - y_pred) ** 2)
    # The basis dimension k is one more than the polynomial degree.
    print(f"MSE for k={degree + 1}: {mse}")

#### Question 2

##### Question 2a i

In [None]:
def g_sigma(x, sigma):
    """Evaluate ``sin^2(2*pi*x)`` with additive zero-mean Gaussian noise.

    Parameters
    ----------
    x : scalar or array-like
        Input locations.
    sigma : float
        Standard deviation of the noise (must be non-negative).

    Returns
    -------
    numpy.ndarray
        Noisy function values with the same shape as ``x``. The noise is
        drawn from the global ``np.random`` state, so results are reproducible
        after ``np.random.seed``.
    """
    x = np.asarray(x)
    # Draw one noise value per input element; using the full shape (rather
    # than len(x)) also supports scalar and multi-dimensional inputs.
    noise = np.random.normal(0, sigma, size=x.shape)
    return np.sin(2 * np.pi * x) ** 2 + noise

# Generate the data and plot it.
np.random.seed(0)
x_sample = np.random.uniform(0, 1, 30)
y_sample = g_sigma(x_sample, sigma=0.07)

# Noise-free curve on a dense grid, for reference.
x_plot = np.linspace(0, 1, 500)
y_plot = np.sin(2 * np.pi * x_plot) ** 2

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(x_plot, y_plot, label='sin^2(2πx)')
ax.scatter(x_sample, y_sample, color='red', label='Noisy Data Points')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Function with Noise')
ax.legend()
ax.grid(True)
plt.show()

##### Question 2a ii

In [None]:
# Question 2a(ii): polynomial fits to the noisy sample for several basis
# dimensions k (a basis of dimension k has highest power k - 1).
degrees = [2, 5, 10, 14, 18]
x_plot = np.linspace(0, 1, 500)

fig, ax = plt.subplots(figsize=(10, 6))
for degree in degrees:
    highest_power = degree - 1
    X = extract_features(x_sample, highest_power)
    coeffs = linear_regression(X, y_sample)
    X_plot = extract_features(x_plot, highest_power)
    y_plot = X_plot @ coeffs
    print(coeffs)
    ax.plot(x_plot, y_plot, label=f'k={degree}')

ax.scatter(x_sample, y_sample, color='red', label='Data Points')
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_title('Polynomial Fits for Different Degrees')
ax.legend()
ax.grid(True)
ax.set_ylim(-1.5, 1.5)
# Axes cross at the origin; the top and right frame lines are hidden.
for side in ('bottom', 'left'):
    ax.spines[side].set_position(('data', 0))
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
plt.show()


##### Question 2b

In [None]:
# Question 2b: training MSE of the polynomial fit for basis dimensions
# k = 1..18 (polynomial degrees 0..17), shown on a log scale.
degrees = list(range(1, 19))
training_errors = []
for degree in range(0, 18):
    X = extract_features(x_sample, degree)
    coeffs = linear_regression(X, y_sample)
    y_pred = X @ coeffs
    residuals = y_sample - y_pred
    mse = np.mean(residuals ** 2)
    training_errors.append(mse)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(degrees, np.log(training_errors), marker='o')
ax.set_xlabel('Polynomial Dimension (k)')
ax.set_ylabel('ln(MSE)')
ax.set_title('Log Training Error vs Polynomial Dimension')
ax.grid(True)
plt.show()
print(training_errors)

##### Question 2c

In [None]:
# Question 2c: test MSE of the polynomial fits on 1000 fresh points.
# The seed must differ from the one that produced the training sample:
# reseeding with 0 would make the first 30 test inputs identical to x_sample,
# so the test set would not be independent of the training set.
np.random.seed(1)
x_test = np.random.uniform(0, 1, 1000)
y_test = g_sigma(x_test, sigma=0.07)

testing_errors = []
degrees = [i for i in range(1, 19)]
for degree in range(0, 18):
    # Fit on the training sample ...
    X = extract_features(x_sample, degree)
    coeffs = linear_regression(X, y_sample)
    # ... and evaluate on the held-out test points.
    X = extract_features(x_test, degree)
    y_pred = X @ coeffs
    mse = np.mean((y_test - y_pred) ** 2)
    testing_errors.append(mse)
for i in testing_errors:
    print(i)

plt.figure(figsize=(10, 6))
plt.plot(degrees, np.log(testing_errors), marker='o')
plt.xlabel('Polynomial Dimension (k)')
plt.ylabel('ln(MSE)')
plt.title('Log Testing Error vs Polynomial Dimension')
plt.grid(True)
plt.show()




##### Question 2d

In [None]:
# Question 2d: test MSE averaged over 100 independently drawn training sets,
# evaluated on the x_test / y_test points created in the Question 2c cell.
n_runs = 100
n_dims = 18
total_testing_errors = np.zeros(n_dims)
for experiment in range(n_runs):
    # One reproducible seed per run, offset so that no run reuses the small
    # seeds that generated the earlier training and test data.
    np.random.seed(1000 + experiment)
    sub_x_sample = np.random.uniform(0, 1, 30)
    sub_y_sample = g_sigma(sub_x_sample, sigma=0.07)
    # Errors of this run only. Appending to a list shared across runs (or
    # cells) would leave stale values at the front and corrupt the average.
    run_errors = []
    for degree in range(n_dims):
        X = extract_features(sub_x_sample, degree)
        coeffs = linear_regression(X, sub_y_sample)
        X = extract_features(x_test, degree)
        y_pred = X @ coeffs
        mse = np.mean((y_test - y_pred) ** 2)
        run_errors.append(mse)
    total_testing_errors += np.array(run_errors)
average_testing_errors = total_testing_errors / n_runs
plt.figure(figsize=(10, 6))
for i in average_testing_errors:
    print(i)
plt.plot([i for i in range(1, 19)], np.log(average_testing_errors), marker='o')
plt.xlabel('Polynomial Dimension (k)')
plt.ylabel('ln(avg MSE)')
plt.title('Log Average Testing Error vs Polynomial Dimension')
plt.grid(True)
plt.show()

#### Question 3


In [None]:
def extract_features_trig(x, degree):
    """Build the sine-basis design matrix for the inputs ``x``.

    Column ``j`` (0-based) holds ``sin((j + 1) * pi * x)``, so a 1-D ``x`` of
    length ``n`` produces an ``(n, degree)`` array.
    """
    harmonics = range(1, degree + 1)
    columns = [np.sin(k * np.pi * x) for k in harmonics]
    return np.column_stack(columns)
# Question 3: test error of least-squares fits in the sine basis
# sin(1*pi*x), ..., sin(k*pi*x) for k = 1..18.
np.random.seed(0)

# Fixed test set shared by every fit below.
x_test = np.random.uniform(0, 1, 1000)
y_test = g_sigma(x_test, sigma=0.07)


n_experiments = 100
degrees = [i for i in range(1, 19)]

# --- Single training set -----------------------------------------------------
x_sample = np.random.uniform(0, 1, 50)
y_sample = g_sigma(x_sample, sigma=0.07)

testing_errors = []

for degree in degrees:
    X_train = extract_features_trig(x_sample, degree)
    coeffs = linear_regression(X_train, y_sample)

    X_test = extract_features_trig(x_test, degree)
    y_pred = X_test @ coeffs
    mse = np.mean((y_test - y_pred) ** 2)
    testing_errors.append(mse)

plt.figure(figsize=(10, 6))
plt.plot(degrees, np.log(testing_errors), marker='o')
plt.xlabel('Basis Dimension (k)')
plt.ylabel('ln(MSE)')
plt.title('Log Testing Error vs Basis Dimension (Trig Basis)')
plt.grid(True)
plt.show()


# --- Average over n_experiments freshly drawn training sets -------------------
testing_errors_all = []

for experiment in range(n_experiments):

    x_sample = np.random.uniform(0, 1, 50)
    y_sample = g_sigma(x_sample, sigma=0.07)

    # Errors of this run only, one entry per basis dimension.
    testing_errors = []

    for degree in degrees:

        X_train = extract_features_trig(x_sample, degree)
        coeffs = linear_regression(X_train, y_sample)

        # Extract the test features and compute the error.
        X_test = extract_features_trig(x_test, degree)
        y_pred = X_test @ coeffs
        mse = np.mean((y_test - y_pred) ** 2)
        testing_errors.append(mse)

    testing_errors_all.append(testing_errors)

# Mean over runs (axis 0) leaves one average error per basis dimension.
average_testing_errors = np.mean(testing_errors_all, axis=0)


plt.figure(figsize=(10, 6))
plt.plot(degrees, np.log(average_testing_errors), marker='o')
plt.xlabel('Basis Dimension (k)')
plt.ylabel('ln(avg MSE)')
plt.title('Log Average Testing Error vs Basis Dimension (Trig Basis)')
plt.grid(True)
plt.show()

# PART 2 

## 2.1 K-Nearest Neighbors

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

### 2.1.1 Generating the Data

In [None]:
# Sample S points uniformly from the unit square and give each one a label
# drawn uniformly from {0, 1}; the seed is fixed for reproducibility.
S = 100
np.random.seed(77)
X = np.random.random_sample((S, 2))
y = np.random.choice([0, 1], size=S)

def h_S_v(x_S, y_S, x, v):
    """Predict the label of ``x`` by majority vote among its ``v`` nearest points.

    ``x_S`` holds the stored points (one per row) and ``y_S`` their
    non-negative integer labels. Distances are Euclidean. When several labels
    receive the same number of votes, the smallest label is returned.
    """
    dist_to_query = np.linalg.norm(x_S - x, axis=1)
    closest = np.argsort(dist_to_query)[:v]
    votes = np.bincount(y_S[closest])
    return votes.argmax()

def plot_boundary(x_S, y_S, v, resolution=200):
    """Plot the decision regions of the ``v``-nearest-neighbour rule on [0, 1]^2.

    The rule ``h_S_v`` is evaluated at every node of a ``resolution`` x
    ``resolution`` grid; the predictions are drawn as filled contours and the
    stored points ``x_S`` are overlaid, coloured by their label in ``y_S``.
    """
    # Evaluation grid covering the unit square.
    axis_x = np.linspace(0, 1, resolution)
    axis_y = np.linspace(0, 1, resolution)
    xx, yy = np.meshgrid(axis_x, axis_y)
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))

    # One prediction per grid node, folded back into the grid layout.
    Z = np.array([h_S_v(x_S, y_S, point, v) for point in grid_points])
    Z = Z.reshape(xx.shape)

    plt.figure(figsize=(8, 6))
    plt.contourf(xx, yy, Z, alpha=0.5,
                 cmap=ListedColormap(['white', 'turquoise']))
    zeros = x_S[y_S == 0]
    ones = x_S[y_S == 1]
    plt.scatter(zeros[:, 0], zeros[:, 1], c='green', label="Label 0")
    plt.scatter(ones[:, 0], ones[:, 1], c='blue', label="Label 1")
    plt.title("Figure of h_{S,v}")
    plt.legend()
    plt.show()

# Draw the decision regions of the 3-nearest-neighbour rule on the sampled data.
plot_boundary(X, y, v=3)


### 2.1.2 Estimated generalization error of k-NN as a function of k

In [None]:
def generate_noise(S, v=3, size=1000, noise_prob=0.2):
    """Sample ``size`` labelled points from the noisy ``v``-NN distribution.

    ``S`` is the pair ``(x_S, y_S)`` handed to ``h_S_v``. Inputs are drawn
    with ``np.random.rand(size, 2)``. For each input a fresh uniform draw is
    compared with ``noise_prob``: if it is larger, the label is
    ``h_S_v(x_S, y_S, x, v)``; otherwise the label is chosen uniformly from
    {0, 1}.

    Returns the pair ``(X, y)`` of sampled inputs and their labels.
    """
    x_S, y_S = S
    X = np.random.rand(size, 2)
    # The condition is evaluated first for each point, so the random stream
    # is consumed in the order: noise draw, then (if needed) the random label.
    labels = [
        h_S_v(x_S, y_S, x, v)
        if np.random.rand() > noise_prob
        else np.random.choice([0, 1])
        for x in X
    ]
    return X, np.array(labels)

def generalize_err(S, k_values, runs = 100, train_size = 1000, test_size = 1000):
    mean_errors = []
    for k in k_values:
        errors = []
        for _ in range(runs):
            
