In [None]:
# Name: Tamarra, Bessa Nicole T.
# PREFINAL EXAM CS 3101N


# Customized Program Made from Scratch
import random

def load_arff(file_path):
    """Parse a simple ARFF file into a list of per-row dictionaries.

    Attribute names come from the ``@attribute`` header lines and the records
    from the lines that follow the ``@data`` marker.  Header keywords are
    matched case-insensitively and regardless of surrounding whitespace or
    line endings.  Blank lines and ``%`` comment lines in the data section are
    skipped.  The literal value ``'m'`` (treated as the missing-value marker)
    is mapped to ``None``.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A list with one ``{attribute_name: value}`` dict per data row.  Values
        stay strings, except missing values, which become ``None``.

    Raises:
        ValueError: If the file contains no ``@data`` line.
    """
    with open(file_path, 'r') as f:
        lines = [line.strip() for line in f]

    attributes = []
    data_start = None
    for index, line in enumerate(lines):
        keyword = line.lower()
        if keyword.startswith('@attribute'):
            # "@attribute <name> <type>" -> keep the name only.
            attributes.append(line.split()[1])
        elif keyword == '@data':
            data_start = index + 1
            break

    if data_start is None:
        raise ValueError(f"no '@data' section found in {file_path!r}")

    records = []
    for line in lines[data_start:]:
        # Empty lines and ARFF comments carry no record.
        if not line or line.startswith('%'):
            continue
        values = [value.strip() for value in line.split(',')]
        records.append(
            {attr: None if value == 'm' else value for attr, value in zip(attributes, values)}
        )
    return records


def replace_m_with_none(data_list):
    """Return new records in which every value equal to ``'m'`` is ``None``.

    The input records are not modified; all other values are copied as-is.
    """
    cleaned = []
    for record in data_list:
        converted = {}
        for field, raw in record.items():
            converted[field] = None if raw == 'm' else raw
        cleaned.append(converted)
    return cleaned


def linear_interpolation(data_list):
    """Fill isolated missing values with the average of their two neighbours.

    A value that is ``None`` is replaced by the mean of the same field in the
    previous and the next record, provided both of those are present.  The
    neighbours are read from the original input, not from already-filled
    records.  The first and last records have only one neighbour, so they are
    returned unchanged instead of being dropped.

    Args:
        data_list: List of record dicts (values are numeric strings/numbers or
            ``None``).

    Returns:
        A new list with the same length and order as ``data_list``.
        Interpolated values are floats; every other value is passed through.
    """
    last_index = len(data_list) - 1
    result = []
    for i, entry in enumerate(data_list):
        if i == 0 or i == last_index:
            # Nothing to interpolate from on one side, but the record must survive.
            result.append(dict(entry))
            continue

        before, after = data_list[i - 1], data_list[i + 1]
        filled = {}
        for key, value in entry.items():
            left, right = before.get(key), after.get(key)
            if value is None and left is not None and right is not None:
                filled[key] = (float(left) + float(right)) / 2
            else:
                filled[key] = value
        result.append(filled)
    return result


def zscore_standardization(matrix, start_column=2):
    """Standardize the columns of ``matrix`` in place using z-scores.

    Every column from ``start_column`` onward is rescaled to zero mean and unit
    population standard deviation.  Cells that are ``None`` or ``'m'`` are
    ignored when computing the statistics and are left untouched.  A column
    whose present values are all identical has no spread, so its present
    values are set to ``0.0``.

    Args:
        matrix: List of rows (lists); cells are numbers, numeric strings,
            ``None`` or ``'m'``.  The rows are modified in place.
        start_column: Index of the first column to standardize.  Columns
            before it are not touched.  Defaults to 2.

    Returns:
        The same ``matrix`` object, for convenience.
    """
    if not matrix:
        return matrix

    def is_missing(cell):
        return cell is None or cell == 'm'

    for i in range(start_column, len(matrix[0])):
        column = [float(row[i]) for row in matrix if not is_missing(row[i])]
        if not column:
            continue

        if len(set(column)) == 1:
            # Exact constant column: avoid dividing by a rounding-noise std.
            mean_val, std_dev = column[0], 0.0
        else:
            mean_val = sum(column) / len(column)
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5

        for row in matrix:
            if not is_missing(row[i]):
                row[i] = (float(row[i]) - mean_val) / std_dev if std_dev != 0 else 0.0
    return matrix

def multiply_matrices(matrix1, matrix2):
    """Return the matrix product of ``matrix1`` and ``matrix2``.

    Each result cell is ``dot_product`` of a row of ``matrix1`` with a column
    of ``matrix2``.  Both matrices are lists of rows.

    Returns:
        A list of rows with ``len(matrix1)`` rows and one column per column of
        ``matrix2``; an empty list when ``matrix1`` is empty.
    """
    if not matrix1:
        return []
    # The columns of matrix2 are the same for every row: compute them once.
    columns = transpose(matrix2)
    return [[dot_product(row, col) for col in columns] for row in matrix1]


def transpose(matrix):
    """Return the transpose of ``matrix`` (a list of rows) as a new list of rows.

    The number of columns is taken from the first row.  An empty matrix
    transposes to an empty list.
    """
    if not matrix:
        return []
    width = len(matrix[0])
    return [[row[i] for row in matrix] for i in range(width)]

def multiply_matrix_vector(matrix, vector):
    """Multiply ``matrix`` (list of rows) by ``vector``; returns one sum per row.

    Rows and the vector are paired element-wise with ``zip``, so any excess
    elements on either side are ignored.
    """
    products = []
    for row in matrix:
        total = 0
        for x, y in zip(row, vector):
            total += x * y
        products.append(total)
    return products

def dot_product(v1, v2):
    """Return the dot product of ``v1`` and ``v2``, skipping non-numeric pairs.

    A pair contributes only when BOTH elements are ``int`` or ``float``.
    Pairs containing ``None``, strings or other values are ignored, so a
    missing cell on either side neither raises nor produces string repetition.
    """
    def is_number(value):
        return isinstance(value, (int, float))

    return sum(x * y for x, y in zip(v1, v2) if is_number(x) and is_number(y))

def mean(column):
    """Return the arithmetic mean of the non-``None`` entries of ``column``.

    Entries are converted with ``float``.  Returns 0 when no entry is present.
    """
    total = 0
    count = 0
    for val in column:
        if val is None:
            continue
        total += float(val)
        count += 1
    if count == 0:
        return 0
    return total / count

def covariance_matrix(matrix):
    """Compute the sample covariance matrix of the columns of ``matrix``.

    Rows are observations and columns are features.  Entry ``[i][j]`` is
    ``sum((x_i - mean_i) * (x_j - mean_j)) / (n_rows - 1)``, taken over the
    rows in which both features hold an ``int``/``float``.  Column means are
    computed from the numeric cells of each column.  Non-numeric cells
    (``None``, strings) are skipped.

    Args:
        matrix: List of rows of equal length.

    Returns:
        A ``num_features x num_features`` list of lists; ``[]`` for an empty
        matrix.

    Raises:
        ValueError: If ``matrix`` has exactly one row (sample covariance is
            undefined for a single observation).
    """
    if not matrix:
        return []

    num_samples = len(matrix)
    if num_samples < 2:
        raise ValueError("covariance_matrix needs at least two rows")
    num_features = len(matrix[0])

    def is_numeric(val):
        return isinstance(val, (int, float))

    # Work column-wise: one list of cells per feature.
    columns = [[row[j] for row in matrix] for j in range(num_features)]

    means = []
    for column in columns:
        observed = [float(val) for val in column if is_numeric(val)]
        means.append(sum(observed) / len(observed) if observed else 0.0)

    return [
        [
            sum(
                (float(val_i) - means[i]) * (float(val_j) - means[j])
                for val_i, val_j in zip(columns[i], columns[j])
                if is_numeric(val_i) and is_numeric(val_j)
            )
            / (num_samples - 1)
            for j in range(num_features)
        ]
        for i in range(num_features)
    ]



def custom_random():
    """Yield an endless, deterministic sequence of pseudo-random floats.

    Linear congruential generator: the state starts at 1 and each step is
    ``state = (state * 1103515245 + 12345)`` masked to 31 bits.  The yielded
    value is the state divided by ``0x7FFFFFFF``.
    """
    multiplier = 1103515245
    increment = 12345
    mask = 0x7FFFFFFF
    state = 1
    while True:
        state = (state * multiplier + increment) & mask
        yield state / mask
        
def pca(data_matrix, num_components):
    """Project ``data_matrix`` onto its leading principal components.

    Steps:
      1. Centre every column on the mean of its numeric cells.  Cells that are
         not ``int``/``float`` (e.g. ``None``) are treated as sitting on the
         column mean, i.e. they become 0 after centring.
      2. Build the sample covariance matrix of the centred data.
      3. Find the leading eigenvectors by power iteration.  Each new vector is
         kept orthogonal to the ones already found, so successive runs
         converge to successive components instead of all converging to the
         dominant one.
      4. Project the centred data onto those eigenvectors.

    Args:
        data_matrix: List of rows (observations) of equal length.
        num_components: Number of components to keep; clamped to the range
            ``0 .. number of columns``.

    Returns:
        A list with one row per observation and ``num_components`` values per
        row, ordered from the strongest component down.  ``[]`` for empty
        input.  The sign of each component is arbitrary.
    """
    if not data_matrix or not data_matrix[0]:
        return []

    num_samples = len(data_matrix)
    num_features = len(data_matrix[0])
    num_components = max(0, min(num_components, num_features))

    def is_numeric(val):
        return isinstance(val, (int, float))

    # Function to perform matrix multiplication
    def multiply_matrix_vector(matrix, vector):
        return [sum(a * b for a, b in zip(row, vector)) for row in matrix]

    # Function to remove already-found directions and normalize a vector
    def orthonormalize(vector, basis, tol=0.0):
        for other in basis:
            overlap = sum(a * b for a, b in zip(vector, other))
            vector = [a - overlap * b for a, b in zip(vector, other)]
        magnitude = sum(x ** 2 for x in vector) ** 0.5
        if magnitude <= tol:
            return None  # nothing left to normalize
        return [x / magnitude for x in vector]

    # Centre the data column by column.
    column_means = []
    for j in range(num_features):
        observed = [row[j] for row in data_matrix if is_numeric(row[j])]
        column_means.append(sum(observed) / len(observed) if observed else 0.0)
    centered = [
        [row[j] - column_means[j] if is_numeric(row[j]) else 0.0 for j in range(num_features)]
        for row in data_matrix
    ]

    # Sample covariance of the centred data (guard the single-row case).
    denominator = max(num_samples - 1, 1)
    cov_matrix = [
        [sum(row[i] * row[j] for row in centered) / denominator for j in range(num_features)]
        for i in range(num_features)
    ]

    # Power iteration, one eigenvector at a time, strongest first.
    components = []
    for _ in range(num_components):
        vector = None
        while vector is None:
            start = [random.random() for _ in range(num_features)]
            vector = orthonormalize(start, components, tol=1e-8)

        for _ in range(1000):
            candidate = orthonormalize(multiply_matrix_vector(cov_matrix, vector), components)
            if candidate is None:
                # The covariance maps the vector to zero: no variance is left
                # in this direction, keep the current vector.
                break
            converged = max(abs(a - b) for a, b in zip(candidate, vector)) < 1e-12
            vector = candidate
            if converged:
                break

        components.append(vector)

    # Project the data onto the new subspace defined by the top eigenvectors
    return [
        [sum(a * b for a, b in zip(row, component)) for component in components]
        for row in centered
    ]

    
def svd(matrix, num_iterations=1000, num_components=None):
    """Project ``matrix`` onto its leading right singular vectors.

    The right singular vectors of ``A`` are the eigenvectors of ``A^T A``.
    They are found one at a time by power iteration; each vector is normalized
    on every step and kept orthogonal to the vectors already found.  The
    result is ``A @ V`` (equivalently ``U @ Sigma``): one row per input row and
    one column per singular vector, strongest first.  The data is not centred.
    Cells that are not ``int``/``float`` (e.g. ``None``) are treated as 0.

    Args:
        matrix: List of rows of equal length.
        num_iterations: Maximum number of power-iteration steps per vector.
        num_components: Number of singular vectors to compute.  ``None`` (the
            default) computes one per column.

    Returns:
        A list of rows with ``num_components`` values each; ``[]`` for empty
        input.  The sign of each column is arbitrary.
    """
    if not matrix or not matrix[0]:
        return []

    num_cols = len(matrix[0])
    if num_components is None:
        num_components = num_cols
    num_components = max(0, min(num_components, num_cols))

    # Function to perform matrix multiplication
    def multiply_matrix_vector(rows, vector):
        return [sum(a * b for a, b in zip(row, vector)) for row in rows]

    # Function to remove already-found directions and normalize a vector
    def orthonormalize(vector, basis, tol=0.0):
        for other in basis:
            overlap = sum(a * b for a, b in zip(vector, other))
            vector = [a - overlap * b for a, b in zip(vector, other)]
        magnitude = sum(x ** 2 for x in vector) ** 0.5
        if magnitude <= tol:
            return None  # nothing left to normalize
        return [x / magnitude for x in vector]

    numeric = [
        [float(cell) if isinstance(cell, (int, float)) else 0.0 for cell in row]
        for row in matrix
    ]

    # A^T A (num_cols x num_cols); its eigenvectors are the right singular vectors.
    gram = [
        [sum(row[i] * row[j] for row in numeric) for j in range(num_cols)]
        for i in range(num_cols)
    ]

    # Power iteration for each singular vector, strongest first.
    singular_vectors = []
    for _ in range(num_components):
        vector = None
        while vector is None:
            start = [random.random() for _ in range(num_cols)]
            vector = orthonormalize(start, singular_vectors, tol=1e-8)

        for _ in range(num_iterations):
            candidate = orthonormalize(multiply_matrix_vector(gram, vector), singular_vectors)
            if candidate is None:
                # A^T A maps the vector to zero: remaining singular values are 0.
                break
            converged = max(abs(a - b) for a, b in zip(candidate, vector)) < 1e-12
            vector = candidate
            if converged:
                break

        singular_vectors.append(vector)

    # Compute the SVD result: every row projected onto every singular vector.
    return [
        [sum(a * b for a, b in zip(row, vector)) for vector in singular_vectors]
        for row in numeric
    ]


def display_data_table(data_list):
    """Print the records as a ``|``-separated table with centred columns.

    The column set is taken from the keys of the first record.  Each column is
    as wide as its longest cell or its header, whichever is longer.  ``None``
    values are shown as blank cells that are still padded to the column width,
    so every printed line has the same length.  Nothing is printed for an
    empty list.

    Args:
        data_list: List of record dicts that share the same keys.
    """
    if not data_list:
        return

    attributes = list(data_list[0].keys())

    def cell_text(value):
        return '' if value is None else str(value)

    column_widths = {
        attr: max(len(attr), max(len(cell_text(entry[attr])) for entry in data_list))
        for attr in attributes
    }

    header = "|".join(f"{attr:^{column_widths[attr]}}" for attr in attributes)
    print(header)
    # Same length as the header, separators included.
    print("-" * len(header))

    for entry in data_list:
        row = "|".join(f"{cell_text(entry[attr]):^{column_widths[attr]}}" for attr in attributes)
        print(row)


def display_matrix(matrix):
    """Print ``matrix`` one row per line.

    Every cell is converted with ``str`` and centred in a 10-character field;
    cells are separated by ``|``.
    """
    for row in matrix:
        cells = []
        for cell in row:
            cells.append(format(str(cell), "^10"))
        print("|".join(cells))

file_path_2017 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2017.arff'
file_path_2018 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2018.arff'
file_path_2019 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2019.arff'
file_path_2020 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2020.arff'
file_path_2021 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2021 Q1.arff'

# Load every period's file.
data_2017, data_2018, data_2019, data_2020, data_2021 = (
    load_arff(path)
    for path in (file_path_2017, file_path_2018, file_path_2019, file_path_2020, file_path_2021)
)

# Normalise the missing-value marker and interpolate gaps within each file
# separately, so values are never averaged across two different files.
(
    data_2017_preprocessed,
    data_2018_preprocessed,
    data_2019_preprocessed,
    data_2020_preprocessed,
    data_2021_preprocessed,
) = (
    linear_interpolation(replace_m_with_none(records))
    for records in (data_2017, data_2018, data_2019, data_2020, data_2021)
)

data_combined = data_2017_preprocessed + data_2018_preprocessed + data_2019_preprocessed + data_2020_preprocessed + data_2021_preprocessed
display_data_table(data_combined)

attributes = list(data_combined[0].keys())

# zscore_standardization() starts at column index 2 of whatever matrix it is
# given.  Hand it complete rows (the two leading identifier columns included)
# and drop those two columns afterwards.  Slicing them off first would make it
# skip the first two feature columns as well.
matrix = [[entry[attr] for attr in attributes] for entry in data_combined]
standardized_data = [row[2:] for row in zscore_standardization(matrix)]

num_components = 2  # Set the desired number of principal components

pca_result = pca(standardized_data, num_components) # perform PCA
print("\nPCA processing completed.")

svd_result = svd(standardized_data) # perform SVD
print("\nSVD processing completed.")

print("\nRESULT WITH PCA:")
display_matrix(pca_result)

print("\nRESULT WITH SVD:")
display_matrix(svd_result)

Num|    Country     |         X1          |        X2         |         X3          |        X4        |         X5          |         X6          |         X7          |        X8         |         X9          |         X10          |         X11          |         X12          |         X13         |         X14         |       X15        |         X16          |         X17          |         X18          |         X19          |        X20        |         X21          |        X22        |         X23         |       X24        |         X25         |         X26         |        X27        |        X28        |        X29         |        X30        |        X31        |         X32         |       X33        |        X34         |         X35         |         X36          |        X37        |         X38         |         X39          |        X40        |        X41        |        X42         |        X43        |       X44        |         X45         |         X46         

In [1]:
# Program Made with sklearn

import numpy as np
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

def load_arff(file_path):
    """Parse a simple ARFF file into a list of per-row dictionaries.

    Attribute names come from the ``@attribute`` header lines and the records
    from the lines that follow the ``@data`` marker.  Header keywords are
    matched case-insensitively and regardless of surrounding whitespace or
    line endings.  Blank lines and ``%`` comment lines in the data section are
    skipped.  The literal value ``'m'`` (treated as the missing-value marker)
    is mapped to ``None``.

    Args:
        file_path: Path of the ARFF file to read.

    Returns:
        A list with one ``{attribute_name: value}`` dict per data row.  Values
        stay strings, except missing values, which become ``None``.

    Raises:
        ValueError: If the file contains no ``@data`` line.
    """
    with open(file_path, 'r') as f:
        lines = [line.strip() for line in f]

    attributes = []
    data_start = None
    for index, line in enumerate(lines):
        keyword = line.lower()
        if keyword.startswith('@attribute'):
            # "@attribute <name> <type>" -> keep the name only.
            attributes.append(line.split()[1])
        elif keyword == '@data':
            data_start = index + 1
            break

    if data_start is None:
        raise ValueError(f"no '@data' section found in {file_path!r}")

    records = []
    for line in lines[data_start:]:
        # Empty lines and ARFF comments carry no record.
        if not line or line.startswith('%'):
            continue
        values = [value.strip() for value in line.split(',')]
        records.append(
            {attr: None if value == 'm' else value for attr, value in zip(attributes, values)}
        )
    return records


def replace_m_with_none(data_list):
    """Build a cleaned copy of ``data_list`` with ``'m'`` values turned into ``None``.

    Every other value is carried over unchanged; the input is left untouched.
    """
    def clean(record):
        return dict((field, None if raw == 'm' else raw) for field, raw in record.items())

    return list(map(clean, data_list))


def linear_interpolation(data_list):
    """Fill isolated missing values with the average of their two neighbours.

    A value that is ``None`` is replaced by the mean of the same field in the
    previous and the next record, provided both of those are present.  The
    neighbours are read from the original input, not from already-filled
    records.  The first and last records have only one neighbour, so they are
    returned unchanged instead of being dropped.

    Args:
        data_list: List of record dicts (values are numeric strings/numbers or
            ``None``).

    Returns:
        A new list with the same length and order as ``data_list``.
        Interpolated values are floats; every other value is passed through.
    """
    last_index = len(data_list) - 1
    result = []
    for i, entry in enumerate(data_list):
        if i == 0 or i == last_index:
            # Nothing to interpolate from on one side, but the record must survive.
            result.append(dict(entry))
            continue

        before, after = data_list[i - 1], data_list[i + 1]
        filled = {}
        for key, value in entry.items():
            left, right = before.get(key), after.get(key)
            if value is None and left is not None and right is not None:
                filled[key] = (float(left) + float(right)) / 2
            else:
                filled[key] = value
        result.append(filled)
    return result


def zscore_standardization(matrix, start_column=2):
    """Standardize the columns of ``matrix`` in place using z-scores.

    Every column from ``start_column`` onward is rescaled to zero mean and unit
    population standard deviation.  Cells that are ``None`` or ``'m'`` are
    ignored when computing the statistics and are left untouched.  A column
    whose present values are all identical has no spread, so its present
    values are set to ``0.0``.

    Args:
        matrix: List of rows (lists); cells are numbers, numeric strings,
            ``None`` or ``'m'``.  The rows are modified in place.
        start_column: Index of the first column to standardize.  Columns
            before it are not touched.  Defaults to 2.

    Returns:
        The same ``matrix`` object, for convenience.
    """
    if not matrix:
        return matrix

    def is_missing(cell):
        return cell is None or cell == 'm'

    for i in range(start_column, len(matrix[0])):
        column = [float(row[i]) for row in matrix if not is_missing(row[i])]
        if not column:
            continue

        if len(set(column)) == 1:
            # Exact constant column: avoid dividing by a rounding-noise std.
            mean_val, std_dev = column[0], 0.0
        else:
            mean_val = sum(column) / len(column)
            std_dev = (sum((x - mean_val) ** 2 for x in column) / len(column)) ** 0.5

        for row in matrix:
            if not is_missing(row[i]):
                row[i] = (float(row[i]) - mean_val) / std_dev if std_dev != 0 else 0.0
    return matrix


def custom_random():
    """Generate an endless, repeatable stream of pseudo-random floats.

    A linear congruential generator seeded with 1: every step multiplies the
    state by 1103515245, adds 12345, keeps the low 31 bits, and yields the new
    state divided by ``0x7FFFFFFF``.
    """
    modulus_mask = 0x7FFFFFFF
    current = 1
    while True:
        current = current * 1103515245 + 12345
        current &= modulus_mask
        yield current / modulus_mask


def svd_sklearn(data_matrix, num_components):
    """Reduce ``data_matrix`` with scikit-learn's ``TruncatedSVD``.

    The data passes through three scikit-learn stages in order:
    ``SimpleImputer(strategy='mean')``, ``StandardScaler`` and
    ``TruncatedSVD(n_components=num_components)``.

    Returns:
        The output of ``TruncatedSVD.fit_transform`` on the imputed,
        standardized data.
    """
    # Stage 1: replace missing entries by the column mean.
    filled = SimpleImputer(strategy='mean').fit_transform(data_matrix)
    # Stage 2: standardize every column.
    scaled = StandardScaler().fit_transform(filled)
    # Stage 3: truncated SVD down to the requested number of components.
    reducer = TruncatedSVD(n_components=num_components)
    return reducer.fit_transform(scaled)


def pca_sklearn_with_imputation(data_matrix, num_components):
    """Reduce ``data_matrix`` with scikit-learn's ``PCA`` after mean imputation.

    The data passes through three scikit-learn stages in order:
    ``SimpleImputer(strategy='mean')``, ``StandardScaler`` and
    ``PCA(n_components=num_components)``.

    Returns:
        The output of ``PCA.fit_transform`` on the imputed, standardized data.
    """
    # Stage 1: replace missing entries by the column mean.
    filled = SimpleImputer(strategy='mean').fit_transform(data_matrix)
    # Stage 2: standardize every column.
    scaled = StandardScaler().fit_transform(filled)
    # Stage 3: PCA down to the requested number of components.
    reducer = PCA(n_components=num_components)
    return reducer.fit_transform(scaled)

def display_data_table(data_list):
    """Print the records as a ``|``-separated table with centred columns.

    The column set is taken from the keys of the first record.  Each column is
    as wide as its longest cell or its header, whichever is longer.  ``None``
    values are shown as blank cells that are still padded to the column width,
    so every printed line has the same length.  Nothing is printed for an
    empty list.

    Args:
        data_list: List of record dicts that share the same keys.
    """
    if not data_list:
        return

    attributes = list(data_list[0].keys())

    def cell_text(value):
        return '' if value is None else str(value)

    column_widths = {
        attr: max(len(attr), max(len(cell_text(entry[attr])) for entry in data_list))
        for attr in attributes
    }

    header = "|".join(f"{attr:^{column_widths[attr]}}" for attr in attributes)
    print(header)
    # Same length as the header, separators included.
    print("-" * len(header))

    for entry in data_list:
        row = "|".join(f"{cell_text(entry[attr]):^{column_widths[attr]}}" for attr in attributes)
        print(row)


def display_matrix(matrix):
    """Print the rows of ``matrix``, one per line.

    Each cell is rendered with ``str``, centred in a field of width 10, and
    the cells of a row are joined with ``|``.
    """
    cell_width = 10
    for row in matrix:
        rendered = [f"{str(cell):^{cell_width}}" for cell in row]
        line = "|".join(rendered)
        print(line)


file_path_2017 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2017.arff'
file_path_2018 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2018.arff'
file_path_2019 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2019.arff'
file_path_2020 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2020.arff'
file_path_2021 = r'C:\Users\bessa\Desktop\CS3101N\V4 data\2021 Q1.arff'

# Load every period's file.
data_2017, data_2018, data_2019, data_2020, data_2021 = (
    load_arff(path)
    for path in (file_path_2017, file_path_2018, file_path_2019, file_path_2020, file_path_2021)
)

# Normalise the missing-value marker and interpolate gaps within each file
# separately, so values are never averaged across two different files.
(
    data_2017_preprocessed,
    data_2018_preprocessed,
    data_2019_preprocessed,
    data_2020_preprocessed,
    data_2021_preprocessed,
) = (
    linear_interpolation(replace_m_with_none(records))
    for records in (data_2017, data_2018, data_2019, data_2020, data_2021)
)

data_combined = data_2017_preprocessed + data_2018_preprocessed + data_2019_preprocessed + data_2020_preprocessed + data_2021_preprocessed
display_data_table(data_combined)

attributes = list(data_combined[0].keys())

# zscore_standardization() starts at column index 2 of whatever matrix it is
# given.  Hand it complete rows (the two leading identifier columns included)
# and drop those two columns afterwards.  Slicing them off first would make it
# skip the first two feature columns as well.
matrix = [[entry[attr] for attr in attributes] for entry in data_combined]
standardized_data = [row[2:] for row in zscore_standardization(matrix)]

num_components = 2
pca_result_sklearn = pca_sklearn_with_imputation(standardized_data, num_components)

print("\nPCA RESULT WITH SKLEARN:")
display_matrix(pca_result_sklearn)

svd_result_sklearn = svd_sklearn(standardized_data, num_components)

print("\nSVD RESULT WITH SKLEARN:")
display_matrix(svd_result_sklearn)

Num|    Country     |         X1          |        X2         |         X3          |        X4        |         X5          |         X6          |         X7          |        X8         |         X9          |         X10          |         X11          |         X12          |         X13         |         X14         |       X15        |         X16          |         X17          |         X18          |         X19          |        X20        |         X21          |        X22        |         X23         |       X24        |         X25         |         X26         |        X27        |        X28        |        X29         |        X30        |        X31        |         X32         |       X33        |        X34         |         X35         |         X36          |        X37        |         X38         |         X39          |        X40        |        X41        |        X42         |        X43        |       X44        |         X45         |         X46         

#### Conclusion <br>
When the custom PCA and SVD implementations were compared with their scikit-learn counterparts, minor numerical differences were observed, but the overall patterns were similar. Both programs used linear interpolation to fill in missing data and z-score standardization to put the features on a comparable scale. The small differences in values arise because the custom program and scikit-learn use different numerical methods; overall, the from-scratch calculations agreed with scikit-learn's built-in ones. The results were double-checked to confirm that the program behaved correctly, and in the end the outputs of the two programs were quite similar.