Importing the necessary libraries

In [698]:
import numpy as np
import pandas as pd
from numpy.linalg import norm

Functions

In [699]:
def standardize(x):
    """Scale every column of ``x`` to zero mean and unit standard deviation.

    The mean and standard deviation are taken along axis 0 with
    ``np.mean`` / ``np.std`` (NumPy's default ``ddof``).

    Raises:
        ValueError: if ``x`` has no elements, or if any column has a
            standard deviation of exactly zero.
    """
    if x.size == 0:
        raise ValueError("Input array x is empty.")

    column_means = np.mean(x, axis=0)
    column_stds = np.std(x, axis=0)

    # A constant column would cause a division by zero below.
    has_constant_column = np.any(column_stds == 0)
    if has_constant_column:
        raise ValueError("Standard deviation is zero for one or more features, cannot standardize.")

    centered = x - column_means
    return centered / column_stds


Covariance

In [700]:
def covariance_matrix(x):
    """Return the sample covariance matrix of the columns of ``x``.

    Rows of ``x`` are treated as observations and columns as features.
    Each column is mean-centred before the product is formed, so the
    result is a true covariance matrix even when the caller passes data
    that has not been centred. For data that is already centred (for
    example the output of ``standardize``) the centring changes nothing
    beyond floating-point rounding.

    Args:
        x: array-like with ``.size`` and ``.shape`` (NumPy array or
           pandas DataFrame), observations along axis 0.

    Returns:
        ``centered.T @ centered / (m - 1)`` where ``m`` is the number of rows.

    Raises:
        ValueError: if ``x`` is empty or has fewer than two rows (the
            ``m - 1`` normalisation is undefined for a single observation).
    """
    if x.size == 0:
        raise ValueError("Input array x is empty.")
    m = x.shape[0]
    if m < 2:
        raise ValueError("At least two observations are required to compute a covariance matrix.")
    # Remove the per-column mean; without this the product below is the
    # raw second-moment matrix rather than the covariance.
    centered = x - np.mean(x, axis=0)
    return (centered.T @ centered) / (m - 1)


### QR Decomposition using Givens Rotation function

In [701]:
def qr_decomposition_givens(A, tol=1e-8):
    """
    Factor A with a sequence of Givens rotations.

    Columns are swept left to right; within a column every below-diagonal
    entry is rotated against the diagonal entry, except entries whose
    magnitude is already below ``tol``, which are skipped.
    Returns Q (m x m, the accumulated product of the transposed rotations)
    and R (m x n, the rotated copy of A).
    """
    n_rows, n_cols = A.shape
    R = A.copy()
    Q = np.eye(n_rows)

    for col in range(n_cols):
        for row in range(col + 1, n_rows):
            # Nothing to eliminate when the entry is already negligible.
            if abs(R[row, col]) < tol:
                continue

            c, s = givens_rotation(R[col, col], R[row, col])

            # Identity everywhere except the 2x2 block acting on rows col and row.
            rotation = np.eye(n_rows)
            rotation[col, col] = c
            rotation[col, row] = -s
            rotation[row, col] = s
            rotation[row, row] = c

            R = rotation @ R
            Q = Q @ rotation.T

    return Q, R

In [702]:
def qr_decomposition_givens_noQ(A, tol=1e-8):
    """
    QR decomposition using Givens Rotations, without accumulating Q.

    Applies the same rotation sweep as ``qr_decomposition_givens`` but only
    keeps the transformed matrix. Below-diagonal entries whose magnitude is
    already smaller than ``tol`` are left untouched.
    Returns the matrix R (same shape as A).
    """
    # Both dimensions are needed: n bounds the column sweep, m bounds the
    # row sweep and sizes the rotation matrix. (The row count used to be
    # discarded, leaving `m` undefined in the loops below.)
    m, n = A.shape
    R = A.copy()

    for i in range(n):
        for j in range(i + 1, m):
            if abs(R[j, i]) < tol:
                continue
            c, s = givens_rotation(R[i, i], R[j, i])
            # Identity except for the 2x2 rotation acting on rows i and j.
            G = np.eye(m)
            G[i, i], G[i, j], G[j, i], G[j, j] = c, -s, s, c

            R = G @ R

    return R

In [703]:
def givens_rotation(a, b):
    """Return the (cos, sin) pair of a Givens rotation for the entries a and b.

    With r = hypot(a, b) the result is (a / r, -b / r); when b is zero the
    identity rotation (1, 0) is returned.
    """
    if b != 0:
        radius = np.hypot(a, b)
        return a / radius, -b / radius
    return 1, 0

In [704]:
def qr_decomposition_givens_reduced(X, tol=1e-8):
    """
    QR decomposition using Givens Rotations with a truncated R.

    Returns Q as the full m x m accumulated rotation product and R cut
    down to its leading n x n block (``R[:n, :n]``). Below-diagonal entries
    with magnitude under ``tol`` are skipped rather than rotated.
    """
    m, n = X.shape
    Q, R = np.eye(m), X.copy()

    # (column, row) pairs in the order the entries are eliminated.
    targets = ((i, j) for i in range(n) for j in range(i + 1, m))

    for i, j in targets:
        if abs(R[j, i]) < tol:
            continue

        c, s = givens_rotation(R[i, i], R[j, i])

        G = np.eye(m)
        G[i, i] = c
        G[i, j] = -s
        G[j, i] = s
        G[j, j] = c

        R = G @ R
        Q = Q @ G.T

    return Q, R[:n, :n]


In [705]:
def qr_decomposition_givens_reduced_noQ(A, b, tol=1e-8):
    """
    Givens-rotation QR that transforms a right-hand side instead of storing Q.

    Every rotation applied to the copy of A is applied to the copy of b as
    well. Returns the first n rows of the rotated matrix and the first n
    entries of the rotated vector, where n is the number of columns of A.
    """
    n_rows, n_cols = A.shape
    R = A.copy()
    x = b.copy()

    for pivot in range(n_cols):
        for below in range(pivot + 1, n_rows):
            # Skip entries that are already negligible.
            if abs(R[below, pivot]) < tol:
                continue

            c, s = givens_rotation(R[pivot, pivot], R[below, pivot])

            # Identity except for the 2x2 block on rows pivot and below.
            G = np.eye(n_rows)
            G[pivot, pivot] = c
            G[pivot, below] = -s
            G[below, pivot] = s
            G[below, below] = c

            R = G @ R
            x = G @ x

    return R[:n_cols], x[:n_cols]

   

In [706]:
def eigen_decomp(A, max_iter=100, tol=1e-8):
    """Iterate A_k = R @ Q (QR iteration) and read eigenpairs off the result.

    Each step factors the current matrix with
    ``qr_decomposition_givens_reduced`` and accumulates the Q factors.
    Iteration stops early once every off-diagonal entry is within ``tol``
    of zero, or after ``max_iter`` steps.

    Returns:
        (eigenvalues, eigenvectors): the diagonal of the final iterate and
        the accumulated Q product with each column scaled to unit norm.
    """
    size = A.shape[0]
    current = A.copy()
    accumulated_Q = np.eye(size)

    for _ in range(max_iter):
        Q, R = qr_decomposition_givens_reduced(current)

        current = R @ Q
        accumulated_Q = accumulated_Q @ Q

        off_diagonal = current - np.diag(np.diagonal(current))
        if np.allclose(off_diagonal, 0, atol=tol):
            break

    eigenvalues = np.diagonal(current)

    # Rescale each column to unit length (zero-norm columns are left alone).
    for column in range(size):
        length = norm(accumulated_Q[:, column])
        if length > 0:
            accumulated_Q[:, column] /= length

    return eigenvalues, accumulated_Q

In [707]:
def pca(x, threshold):
    """Project ``x`` onto the leading principal components.

    The data is standardized, its covariance matrix is eigen-decomposed
    with ``eigen_decomp``, and the smallest number of leading components
    whose cumulative explained-variance ratio reaches ``threshold`` is kept.
    The cumulative ratios are printed as a side effect.

    Args:
        x: NumPy array or pandas DataFrame, observations in rows.
        threshold: cumulative explained-variance ratio to reach (e.g. 0.90).

    Returns:
        (Z, V_k, n_components): the projected data, the selected
        eigenvectors as columns, and the number of components as an int.
        If ``threshold`` is never reached (for example a value above 1, or
        rounding leaving the final ratio just under it), all components
        are kept.
    """
    # Convert the DataFrame to NumPy array if it's a Pandas DataFrame
    if isinstance(x, pd.DataFrame):
        x = x.values  # Convert to NumPy array

    x_std = standardize(x)  # Standardize the data
    cov_matrix = covariance_matrix(x_std)  # Get the covariance matrix
    eigenvalues, eigenvectors = eigen_decomp(cov_matrix)  # Perform eigen decomposition

    # The selection below assumes eigenpairs ordered by decreasing
    # eigenvalue; enforce it instead of relying on the order eigen_decomp
    # happens to produce. The stable sort leaves an already-sorted result
    # untouched.
    order = np.argsort(-eigenvalues, kind="stable")
    eigenvalues = eigenvalues[order]
    eigenvectors = eigenvectors[:, order]

    # Calculate variance ratio and determine the number of components
    total_variance = np.sum(eigenvalues)
    variance_ratio = eigenvalues / total_variance
    cumulative_variance_ratio = np.cumsum(variance_ratio)
    print(f"Explained variance ratio: {cumulative_variance_ratio}")

    # np.argmax on an all-False mask returns 0, which would silently select
    # a single component; keep every component when the threshold is not met.
    reached = cumulative_variance_ratio >= threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = len(eigenvalues)

    # Select the top `n_components` eigenvectors
    V_k = eigenvectors[:, :n_components]

    # Project the data onto the new basis
    Z = np.dot(x_std, V_k)

    return Z, V_k, n_components


In [708]:
def normal_equation(x, y):
    """Least-squares coefficients from the normal equations, with intercept.

    A column of ones is prepended to ``x`` and the system
    ``(X^T X) theta = X^T y`` is solved directly.

    Args:
        x: feature array with observations in rows (without intercept column).
        y: target values, one per row of ``x``.

    Returns:
        NumPy array ``theta`` whose first entry is the intercept.

    Raises:
        ValueError: if ``x`` or ``y`` has no elements.
        numpy.linalg.LinAlgError: if ``X^T X`` is singular.
    """
    if x.size == 0 or y.size == 0:
        raise ValueError("Input arrays x and y are empty.")
    x = np.c_[np.ones(x.shape[0]), x]
    # Solve the linear system rather than forming inv(X^T X) explicitly:
    # same answer, fewer operations and less rounding error.
    return np.linalg.solve(x.T @ x, np.asarray(x.T @ y))

In [709]:
def qr_with_q_equation(X, y):
    """
    Solve the least-squares problem for X and y using QR decomposition
    with Givens rotations, keeping the Q factor.

    X is factored with ``qr_decomposition_givens_reduced``; ``Q.T @ y`` is
    truncated to the number of rows of R and the system
    ``R @ theta = (Q.T @ y)[:k]`` is solved.

    Returns:
        NumPy array ``theta``.

    Raises:
        ValueError: if X or y has no elements.
        numpy.linalg.LinAlgError: if R is singular.
    """
    if X.size == 0 or y.size == 0:
        raise ValueError("Input arrays X or y are empty.")

    Q, R = qr_decomposition_givens_reduced(X)
    Qt_y = np.asarray(Q.T @ y)
    Qt_y_reduced = Qt_y[:R.shape[0]]
    # Solve the system instead of multiplying by inv(R); this matches
    # qr_without_q_equation and avoids forming an explicit inverse.
    theta = np.linalg.solve(R, Qt_y_reduced)

    return theta


In [710]:
def qr_without_q_equation(X, y):
    """Solve for theta via Givens QR without forming Q.

    ``qr_decomposition_givens_reduced_noQ`` rotates X and y together; the
    resulting system ``R @ theta = x`` is then solved with
    ``np.linalg.solve``.

    Raises:
        ValueError: if X or y has no elements.
    """
    if X.size == 0 or y.size == 0:
        raise ValueError("Input arrays X or y are empty.")

    # Rotated matrix and correspondingly rotated right-hand side.
    triangular, rotated_rhs = qr_decomposition_givens_reduced_noQ(X, y)

    theta = np.linalg.solve(triangular, rotated_rhs)
    return theta

In [711]:
def mse(y_true, y_pred):
    """Mean of the squared differences between ``y_true`` and ``y_pred``.

    Raises:
        ValueError: if either input has no elements.
    """
    if not (y_true.size and y_pred.size):
        raise ValueError("Input arrays y_true or y_pred are empty.")
    difference = y_true - y_pred
    return np.mean(difference ** 2)

In [712]:
def residual_error_norm(y, predictions):
    """Return ``np.linalg.norm`` of the residual vector ``y - predictions``."""
    return np.linalg.norm(y - predictions)

In [713]:
def back_substitution(U, b):
    """Solve ``U x = b`` row by row from the bottom up.

    Only the diagonal and the entries to its right in each row of ``U`` are
    read. Returns ``x`` as a 1-D float array of length ``U.shape[0]``.
    """
    size = U.shape[0]
    solution = np.zeros(size)
    for row in reversed(range(size)):
        # Contribution of the unknowns that have already been solved.
        known_part = U[row, row + 1:] @ solution[row + 1:]
        solution[row] = (b[row] - known_part) / U[row, row]
    return solution

# Opening the dataset

In [714]:
# Load the full dataset from pokindex_data.csv (path is relative to the working directory).
df = pd.read_csv("pokindex_data.csv")
df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,WinningPercentage
0,70,90,45,15,45,50,36.290323
1,40,27,60,37,50,66,36.220472
2,75,75,60,83,60,60,39.344262
3,85,115,80,105,80,50,30.630631
4,83,106,65,86,65,85,66.406250
...,...,...,...,...,...,...,...
195,50,65,64,44,48,43,21.969697
196,60,85,69,65,79,80,57.600000
197,45,50,43,40,38,62,40.441176
198,55,45,50,45,65,80,55.462185


In [715]:
# Feature columns: everything except the last column (the last column is used as the target later on).
X_df = df.iloc[:, :-1]
X_df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,70,90,45,15,45,50
1,40,27,60,37,50,66
2,75,75,60,83,60,60
3,85,115,80,105,80,50
4,83,106,65,86,65,85
...,...,...,...,...,...,...
195,50,65,64,44,48,43
196,60,85,69,65,79,80
197,45,50,43,40,38,62
198,55,45,50,45,65,80


In [716]:
# Standardize the features column-wise; .values passes a plain NumPy array to standardize().
X_std = standardize(X_df.values)
X_std

array([[ 0.15290907,  0.4052833 , -0.92270684, -1.78806664, -1.04526282,
        -0.64904062],
       [-1.18060024, -1.59650799, -0.433985  , -1.11071036, -0.85255212,
        -0.09489708],
       [ 0.37516062, -0.07133367, -0.433985  ,  0.30558005, -0.46713073,
        -0.30270091],
       ...,
       [-0.95834869, -0.8656953 , -0.98786975, -1.01834359, -1.31505779,
        -0.23343296],
       [-0.51384558, -1.02456762, -0.75979956, -0.86439898, -0.27442003,
         0.38997851],
       [ 0.15290907,  1.6762619 ,  0.86927325, -0.55650976,  0.30371206,
        -0.12953105]])

In [717]:
# Covariance matrix of the standardized features (uses the 1 / (m - 1) normalisation).
cov_matrix = covariance_matrix(X_std)
cov_matrix

array([[ 1.00502513,  0.55595839,  0.40771065,  0.42310765,  0.41420563,
         0.25004053],
       [ 0.55595839,  1.00502513,  0.49285226,  0.45023169,  0.28881028,
         0.39254772],
       [ 0.40771065,  0.49285226,  1.00502513,  0.26511939,  0.59231642,
        -0.05218178],
       [ 0.42310765,  0.45023169,  0.26511939,  1.00502513,  0.52750921,
         0.50622821],
       [ 0.41420563,  0.28881028,  0.59231642,  0.52750921,  1.00502513,
         0.19389947],
       [ 0.25004053,  0.39254772, -0.05218178,  0.50622821,  0.19389947,
         1.00502513]])

In [718]:
# QR decomposition with Givens rotations, applied to the covariance matrix
# of the standardized features; both factors are printed for inspection.
Q, R = qr_decomposition_givens(cov_matrix)
print("Q: ", Q)
print("R: ", R)


Q:  [[ 0.72940987 -0.54500747 -0.24840545 -0.30687205 -0.12216702  0.01129624]
 [ 0.40349392  0.75994714 -0.16466594 -0.32524921  0.14040878 -0.32719482]
 [ 0.29590123  0.18570912  0.78317796 -0.10122822 -0.36807509  0.34476142]
 [ 0.3070758   0.09953746 -0.10013326  0.78853923 -0.41568206 -0.30196751]
 [ 0.30061505 -0.13379389  0.38231174  0.32291456  0.79276163 -0.11323736]
 [ 0.18147012  0.25129826 -0.37632858  0.25244288  0.16917076  0.81850378]]
R:  [[ 1.37786061e+00  1.25319031e+00  1.04364078e+00  1.12779520e+00
   1.09322398e+00  7.21453556e-01]
 [ 2.20613268e-17  6.57112083e-01  2.73006452e-01  3.17464948e-01
   7.05012515e-02  4.29358093e-01]
 [-1.15492757e-17  1.44232748e-17  8.24219748e-01 -6.10760271e-02
   5.71882787e-01 -5.22398205e-01]
 [ 4.58149373e-17 -1.45547223e-17 -1.75052304e-18  7.87520880e-01
   5.08444916e-01  5.16381194e-01]
 [-4.59659157e-19 -1.77982212e-17  2.41815066e-17  1.11746041e-17
   3.82203678e-01  1.57084194e-01]
 [-1.99857047e-17 -2.02497546e-17 -1

In [719]:
# Eigen-decomposition of the covariance matrix via eigen_decomp (default max_iter and tol).
eigenvalues, eigenvectors = eigen_decomp(cov_matrix)
print("Eigenvalues: ", eigenvalues)
print("Eigenvectors: ", eigenvectors)

Eigenvalues:  [2.94914287 1.20231921 0.77529324 0.51470719 0.37558598 0.21310226]
Eigenvectors:  [[ 0.43354602 -0.06079579  0.39576924 -0.77440083  0.1765609   0.14435383]
 [ 0.44686717  0.06341416  0.57662134  0.39600227 -0.21287498 -0.51152972]
 [ 0.38639019 -0.57758995  0.0344706   0.41114813  0.10449576  0.57960513]
 [ 0.44243289  0.30046    -0.38034996 -0.09211419 -0.71540399  0.22141125]
 [ 0.42577579 -0.27598308 -0.6008831  -0.07390754  0.3132315  -0.52717315]
 [ 0.29328879  0.70159069 -0.06292644  0.24595817  0.55015842  0.2337151 ]]


In [720]:
# Keep the smallest number of components whose cumulative explained-variance
# ratio reaches 0.90. pca() standardizes its input itself, so passing the
# already-standardized X_std standardizes the data a second time.
Z, V_k, n_components = pca(X_std, 0.90)
# One column per retained component, named PC1..PCn.
principal_components = pd.DataFrame(Z, columns=[f"PC{i}" for i in range(1, n_components + 1)])
principal_components


Explained variance ratio: [0.48906619 0.6884508  0.81702026 0.90237587 0.96466054 1.        ]


Unnamed: 0,PC1,PC2,PC3,PC4
0,-1.535627,-0.154778,1.611419,-0.254967
1,-2.275202,0.056187,-0.462071,0.245586
2,-0.189388,0.231696,0.275896,-0.565281
3,1.349380,-0.343314,0.508127,-0.342828
4,0.844757,0.760486,0.784812,-0.193403
...,...,...,...,...
195,-1.663667,-0.442388,0.429042,0.223010
196,0.046590,0.240373,-0.067483,0.364742
197,-2.262974,0.467138,0.279696,0.126750
198,-1.359104,0.494743,-0.351214,-0.124375


In [721]:
# Selected eigenvectors returned by pca(), one column per retained component.
V_k

array([[ 0.43354602, -0.06079579,  0.39576924, -0.77440083],
       [ 0.44686717,  0.06341416,  0.57662134,  0.39600227],
       [ 0.38639019, -0.57758995,  0.0344706 ,  0.41114813],
       [ 0.44243289,  0.30046   , -0.38034996, -0.09211419],
       [ 0.42577579, -0.27598308, -0.6008831 , -0.07390754],
       [ 0.29328879,  0.70159069, -0.06292644,  0.24595817]])

In [722]:
# Number of components pca() retained for the 0.90 threshold.
n_components

np.int64(4)

# Linear Regression

In [723]:
# Regression target: the last column of the dataset.
y = df.iloc[:, -1]
y

0      36.290323
1      36.220472
2      39.344262
3      30.630631
4      66.406250
         ...    
195    21.969697
196    57.600000
197    40.441176
198    55.462185
199    60.800000
Name: WinningPercentage, Length: 200, dtype: float64

In [724]:
# Design matrix: an intercept column of ones followed by the principal-component scores.
X = np.c_[np.ones(principal_components.shape[0]), principal_components]

# Normal-equation solution, using the pseudo-inverse of X^T X.
theta_normal = np.linalg.pinv(X.T @ X) @ X.T @ y

# Fitted values for every row of X.
y_pred_normal = pd.Series(np.dot(X, theta_normal))
y_pred_normal

0      35.029023
1      32.274892
2      48.278116
3      54.918160
4      68.328011
         ...    
195    31.295163
196    56.409055
197    38.697193
198    44.247542
199    55.378288
Length: 200, dtype: float64

In [725]:
# Norm of the residuals (y - predictions) for the normal-equation fit.
normal_error = residual_error_norm(y, y_pred_normal)
normal_error

np.float64(151.0593195136738)

# QR decomposition using Givens rotations (Q stored)

In [726]:
# Coefficients from the Givens-rotation QR solver that keeps the Q factor.
theta_qr_q = qr_with_q_equation(X, y)
theta_qr_q

array([50.01524827,  8.62091422, 14.78978101,  1.43521507,  6.94724443])

In [727]:
# Fitted values using the coefficients from qr_with_q_equation.
y_pred_qr = pd.Series(np.dot(X, theta_qr_q))
y_pred_qr

0      35.029023
1      32.274892
2      48.278116
3      54.918160
4      68.328011
         ...    
195    31.295163
196    56.409055
197    38.697193
198    44.247542
199    55.378288
Length: 200, dtype: float64

In [728]:
# Residual norm for the QR fit with stored Q.
qr_with_q_error = residual_error_norm(y, y_pred_qr)
qr_with_q_error

np.float64(151.05931951367384)

# QR decomposition using Givens rotations (Q not stored)

In [729]:
# Coefficients from the Givens-rotation QR solver that rotates y instead of storing Q.
theta_qr_no_q = qr_without_q_equation(X, y)
theta_qr_no_q

array([50.01524827,  8.62091422, 14.78978101,  1.43521507,  6.94724443])

In [730]:
# Fitted values using the coefficients from qr_without_q_equation.
y_pred_qr_no_q = pd.Series(np.dot(X, theta_qr_no_q))
y_pred_qr_no_q

0      35.029023
1      32.274892
2      48.278116
3      54.918160
4      68.328011
         ...    
195    31.295163
196    56.409055
197    38.697193
198    44.247542
199    55.378288
Length: 200, dtype: float64

In [731]:
# Residual norm for the QR fit without stored Q.
error_qr_no_q = residual_error_norm(y, y_pred_qr_no_q)
error_qr_no_q

np.float64(151.0593195136738)

# QR decomposition on the dataset without outliers

In [732]:
# Load the second dataset from pokindex_data_nooutlier.csv (path is relative to the working directory).
no_outlier_df = pd.read_csv("pokindex_data_nooutlier.csv")
no_outlier_df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,WinningPercentage
0,70,90,45,15,45,50,36.290323
1,40,27,60,37,50,66,36.220472
2,75,75,60,83,60,60,39.344262
3,85,115,80,105,80,50,30.630631
4,83,106,65,86,65,85,66.406250
...,...,...,...,...,...,...,...
179,50,65,64,44,48,43,21.969697
180,60,85,69,65,79,80,57.600000
181,45,50,43,40,38,62,40.441176
182,55,45,50,45,65,80,55.462185


In [733]:
# Feature columns of the no-outlier dataset: everything except the last column.
X_no_outlier_df = no_outlier_df.iloc[:, :-1]
X_no_outlier_df

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,70,90,45,15,45,50
1,40,27,60,37,50,66
2,75,75,60,83,60,60
3,85,115,80,105,80,50
4,83,106,65,86,65,85
...,...,...,...,...,...,...
179,50,65,64,44,48,43
180,60,85,69,65,79,80
181,45,50,43,40,38,62
182,55,45,50,45,65,80


In [734]:
# Standardize the no-outlier features. The DataFrame itself is passed here
# (no .values), so the result displayed below is a DataFrame as well.
X_no_outlier_std = standardize(X_no_outlier_df)
X_no_outlier_std

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,0.281459,0.524904,-0.942009,-1.831199,-1.030554,-0.641841
1,-1.255290,-1.658336,-0.362828,-1.102321,-0.819675,-0.058457
2,0.537584,0.005085,-0.362828,0.421698,-0.397918,-0.277226
3,1.049834,1.391270,0.409414,1.150577,0.445595,-0.641841
4,0.947384,1.079378,-0.169767,0.521091,-0.187040,0.634311
...,...,...,...,...,...,...
179,-0.743041,-0.341461,-0.208379,-0.870405,-0.904027,-0.897072
180,-0.230791,0.351631,-0.015319,-0.174657,0.403420,0.452004
181,-0.999165,-0.861280,-1.019233,-1.002928,-1.325783,-0.204303
182,-0.486916,-1.034553,-0.748948,-0.837274,-0.187040,0.452004


In [735]:
# Compute the no-outlier covariance matrix in this cell so the shape check does
# not read a variable that is only assigned in a later cell (which raises
# NameError on a fresh kernel with Restart & Run All).
cov_matrix_no_outlier = covariance_matrix(X_no_outlier_std)

print("Shape of cov_matrix:", cov_matrix.shape)
print("Shape of cov_matrix_no_outlier:", cov_matrix_no_outlier.shape)


Shape of cov_matrix: (6, 6)
Shape of cov_matrix_no_outlier: (6, 6)


In [736]:
# Covariance matrix of the standardized no-outlier features; the input is a
# DataFrame, and the result is displayed as a labelled table.
cov_matrix_no_outlier = covariance_matrix(X_no_outlier_std)
cov_matrix_no_outlier

Unnamed: 0,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
HP,1.005464,0.577163,0.469326,0.419522,0.493828,0.263993
Attack,0.577163,1.005464,0.536234,0.396956,0.299295,0.377378
Defense,0.469326,0.536234,1.005464,0.226107,0.569609,0.003423
Sp. Atk,0.419522,0.396956,0.226107,1.005464,0.557707,0.50454
Sp. Def,0.493828,0.299295,0.569609,0.557707,1.005464,0.247565
Speed,0.263993,0.377378,0.003423,0.50454,0.247565,1.005464


In [737]:
# The covariance matrix above is a DataFrame; convert it to a NumPy array
# before handing it to eigen_decomp.
cov_matrix_no_outlier_np = cov_matrix_no_outlier.values  # .values gives the NumPy array

# Run the eigen_decomp function on the NumPy array
eigenvalues_no_outlier, eigenvectors_no_outlier = eigen_decomp(cov_matrix_no_outlier_np)

# Print the results
print("Eigenvalues: ", eigenvalues_no_outlier)
print("Eigenvectors: ", eigenvectors_no_outlier)


Eigenvalues:  [3.02292626 1.15561446 0.78339176 0.48381804 0.3862541  0.20078226]
Eigenvectors:  [[ 4.48805593e-01 -1.41864165e-01  2.09458630e-01 -7.94845715e-01
   2.60997529e-01  1.86214285e-01]
 [ 4.35537464e-01 -6.25359359e-02  6.34115283e-01  1.96910936e-01
  -3.31358611e-01 -5.05689367e-01]
 [ 3.91057710e-01 -5.68664284e-01 -7.93455615e-04  4.73103242e-01
   7.26372642e-02  5.42762674e-01]
 [ 4.18565977e-01  3.96964850e-01 -3.53707332e-01 -8.46728375e-02
  -6.76443247e-01  2.78150317e-01]
 [ 4.37304895e-01 -1.35865219e-01 -6.29906411e-01  6.80385809e-02
   2.83274169e-01 -5.55562310e-01]
 [ 2.99201398e-01  6.90321880e-01  1.79257627e-01  3.06302068e-01
   5.28191128e-01  1.70276552e-01]]


In [739]:
# PCA on the raw no-outlier features with a 0.90 threshold; pca() converts the
# DataFrame to a NumPy array and standardizes it internally.
Z_no_outlier, V_k_no_outlier, n_components_no_outlier = pca(X_no_outlier_df, 0.90)
print(V_k_no_outlier)
print(n_components_no_outlier)

Explained variance ratio: [0.50108289 0.69263854 0.82249424 0.90269234 0.96671816 1.        ]
[[ 4.48805593e-01 -1.41864165e-01  2.09458630e-01 -7.94845715e-01]
 [ 4.35537464e-01 -6.25359359e-02  6.34115283e-01  1.96910936e-01]
 [ 3.91057710e-01 -5.68664284e-01 -7.93455615e-04  4.73103242e-01]
 [ 4.18565977e-01  3.96964850e-01 -3.53707332e-01 -8.46728375e-02]
 [ 4.37304895e-01 -1.35865219e-01 -6.29906411e-01  6.80385809e-02]
 [ 2.99201398e-01  6.90321880e-01  1.79257627e-01  3.06302068e-01]]
4


In [741]:
# Wrap the projected data in a DataFrame with one column per retained component (PC1..PCn).
pca_no_outlier = pd.DataFrame(data=Z_no_outlier, columns=[f"PC{i+1}" for i in range(n_components_no_outlier)])
pca_no_outlier

Unnamed: 0,PC1,PC2,PC3,PC4
0,-1.422628,-0.567050,1.574357,-0.677686
1,-2.264868,0.121542,-0.418481,0.519224
2,0.021149,0.159832,0.167913,-0.745645
3,1.721639,-0.515637,0.299092,-0.630507
4,1.155016,0.564787,0.930232,-0.483357
...,...,...,...,...
179,-1.591750,-0.596700,0.344517,0.162196
180,0.282130,0.207347,0.063332,0.426122
181,-2.282821,0.416174,0.398614,0.074522
182,-1.259007,0.564745,-0.262428,0.025596


In [742]:
# Design matrix for the no-outlier fit: intercept column of ones followed by the component scores.
X_no_outlier = np.c_[np.ones(pca_no_outlier.shape[0]), pca_no_outlier]
# Target: the last column of the no-outlier dataset.
y_no_outlier = no_outlier_df.iloc[:, -1]

print(y_no_outlier)
print(X_no_outlier)

0      36.290323
1      36.220472
2      39.344262
3      30.630631
4      66.406250
         ...    
179    21.969697
180    57.600000
181    40.441176
182    55.462185
183    60.800000
Name: WinningPercentage, Length: 184, dtype: float64
[[ 1.00000000e+00 -1.42262756e+00 -5.67050141e-01  1.57435736e+00
  -6.77686276e-01]
 [ 1.00000000e+00 -2.26486787e+00  1.21541791e-01 -4.18481163e-01
   5.19224230e-01]
 [ 1.00000000e+00  2.11492175e-02  1.59832372e-01  1.67912789e-01
  -7.45644968e-01]
 [ 1.00000000e+00  1.72163913e+00 -5.15636690e-01  2.99091619e-01
  -6.30506767e-01]
 [ 1.00000000e+00  1.15501604e+00  5.64786593e-01  9.30231842e-01
  -4.83357240e-01]
 [ 1.00000000e+00  3.50393395e-01 -8.73515278e-01  3.06030683e-01
  -1.71766876e+00]
 [ 1.00000000e+00  1.02045756e+00 -8.88399150e-01  9.52557940e-03
   1.01144339e+00]
 [ 1.00000000e+00  6.42387218e-01 -1.13643597e+00 -3.62379635e-01
   1.43790828e+00]
 [ 1.00000000e+00 -4.51907043e-01 -1.32921715e+00 -5.73564952e-01
   4.83750155e

In [744]:
# Coefficients for the no-outlier data from the QR solver that keeps Q.
theta_qr_no_outlier = qr_with_q_equation(X_no_outlier, y_no_outlier)
theta_qr_no_outlier

array([49.30751232,  8.69087945, 13.77761601,  6.40859259,  8.2407876 ])

In [745]:
# Fitted values for the no-outlier data.
y_pred_qr_with_q_no_outlier = pd.Series(np.dot(X_no_outlier, theta_qr_no_outlier))
y_pred_qr_with_q_no_outlier

0      33.635775
1      32.895316
2      46.624810
3      53.886710
4      69.105263
         ...    
179    30.797189
180    58.533670
181    38.370348
182    44.675611
183    58.907385
Length: 184, dtype: float64

In [746]:
# Residual norm for the no-outlier QR fit.
error_qr_without_outlier = residual_error_norm(y_no_outlier, y_pred_qr_with_q_no_outlier)
error_qr_without_outlier

np.float64(129.57043630372434)