In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import StandardScaler 
from sklearn.decomposition import PCA 
from sklearn.model_selection import train_test_split 
import statsmodels.api as sm 

In [None]:
# Locations of the two CSV files holding the real and imaginary parts.
real_ouput_data_loc = "data/hw1_real.csv"
img_output_data_loc = "data/hw1_img.csv"

real_output_df = pd.read_csv(real_ouput_data_loc)
img_output_df = pd.read_csv(img_output_data_loc)

# Element-wise magnitude: sqrt(real^2 + imag^2).
squared_sum = real_output_df.pow(2) + img_output_df.pow(2)
output_df = np.sqrt(squared_sum)

output_df.head()

In [None]:
# Overlay magnitude, real part and imaginary part of the row at position 2
# (the third sample) on one set of axes.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(output_df.iloc[2], label="Magnitude")
ax.plot(real_output_df.iloc[2], label="Real part")
# The label previously started with a dotted capital "İ"; use a plain ASCII "I".
ax.plot(img_output_df.iloc[2], label="Imaginary part")

ax.set_title("S11 Parameter of the 3rd Sample")
ax.legend(loc="best", fontsize=10)

# Thin the x ticks to every 25th position so the labels stay readable.
ax.set_xticks(range(0, 200, 25))
ax.set_xlabel("Frequency")
plt.show()

In [None]:
# Draw the magnitude curves of the rows at positions 0..14 on shared axes.
fig, ax = plt.subplots(figsize=(12, 6))

for row_pos in range(15):
    ax.plot(output_df.iloc[row_pos], label=f"output {row_pos}")

ax.legend(loc="best", fontsize=6)
ax.set_title("Magnitude of the first 15 outputs")

# Show only every 25th tick position on the x axis.
ax.set_xticks(range(0, 200, 25))
ax.set_xlabel("Frequency")
plt.show()

In [5]:
def _mean_of_25_smallest(row):
    """Return the mean of the 25 smallest values in a single row."""
    return row.nsmallest(25).mean()


# Per-row summary of the dip: average of the 25 lowest magnitudes, plus the
# column label at which the single lowest magnitude occurs.
min_values_output = output_df.apply(_mean_of_25_smallest, axis=1)
min_values_index = output_df.idxmin(axis=1)

# Same 25-smallest average, applied to the real and imaginary parts.
min_values_real = real_output_df.apply(_mean_of_25_smallest, axis=1)
min_values_img = img_output_df.apply(_mean_of_25_smallest, axis=1)

In [None]:
# Plot the magnitude curves of the rows at positions 0..9 and mark each one
# with a red dot: x is the column label of the row's minimum, y is the mean
# of that row's 25 smallest magnitudes.
fig, ax = plt.subplots(figsize=(12, 6))

for i in range(10):
    ax.plot(output_df.iloc[i], label=f"output {i}")
    # Use positional access for the markers too, so they always refer to the
    # same row as `output_df.iloc[i]` regardless of the index labels.
    ax.scatter(min_values_index.iloc[i], min_values_output.iloc[i], color="red")

ax.legend(loc="best", fontsize=6)
# Title typo fixed ("Magnitued" -> "Magnitude").
ax.set_title("Magnitude of output 0 to 9 with their avg. minimum values")
ax.set_xticks(range(0, 200, 25))
ax.set_xlabel("Frequency")
plt.show()

In [None]:
# Single-component PCA on the magnitude data.
pca = PCA(n_components=1)
principal_components = pca.fit_transform(output_df)

# Summarise each fitted component: standard deviation and share of variance.
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": pca.explained_variance_ratio_,
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# PCA on the magnitude data with the default number of components.
pca = PCA()
principal_components = pca.fit_transform(output_df)

# Per-component standard deviation, variance share and running total.
variance_ratio = pca.explained_variance_ratio_
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": variance_ratio,
        "Cumulative Proportion": np.cumsum(variance_ratio),
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# Single-component PCA on the real part; the resulting scores are kept in
# `principal_components_real`.
pca = PCA(n_components=1)
principal_components_real = pca.fit_transform(real_output_df)

# Summarise the fitted component: standard deviation and share of variance.
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": pca.explained_variance_ratio_,
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# PCA on the real part with the default number of components.
pca = PCA()
principal_components = pca.fit_transform(real_output_df)

# Per-component standard deviation, variance share and running total.
variance_ratio = pca.explained_variance_ratio_
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": variance_ratio,
        "Cumulative Proportion": np.cumsum(variance_ratio),
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# Single-component PCA on the imaginary part; the resulting scores are kept
# in `principal_components_img`.
pca = PCA(n_components=1)
principal_components_img = pca.fit_transform(img_output_df)

# Summarise the fitted component: standard deviation and share of variance.
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": pca.explained_variance_ratio_,
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# PCA on the imaginary part with the default number of components.
pca = PCA()
principal_components = pca.fit_transform(img_output_df)

# Per-component standard deviation, variance share and running total.
variance_ratio = pca.explained_variance_ratio_
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
explained_variance_df = pd.DataFrame(
    {
        "Standard deviation": np.sqrt(pca.explained_variance_),
        "Proportion of Variance": variance_ratio,
        "Cumulative Proportion": np.cumsum(variance_ratio),
    },
    index=component_labels,
)

explained_variance_df

In [None]:
# Read the input table from CSV and preview its first rows.
input_data_loc = "data/hw1_input.csv"
input_df = pd.read_csv(filepath_or_buffer=input_data_loc)

input_df.head()

In [None]:
# Summary statistics for every input column.
input_df.describe() 

In [None]:
# Pairwise correlation matrix of the input columns.
input_df.corr() 

In [None]:
# Scatter-plot matrix of all input columns, with KDE curves on the diagonal
# and semi-transparent markers.
pair_plot = sns.pairplot(
    data=input_df,
    kind="scatter",
    diag_kind="kde",
    markers="o",
    plot_kws=dict(alpha=0.5),
)

plt.show()

In [None]:
# Scatter of patch width (y) against substrate height (x).
fig, ax = plt.subplots(figsize=(12, 6))

substrate_height = input_df["height of substrate"]
patch_width = input_df["width of patch"]
ax.scatter(substrate_height, patch_width)

ax.set_title("Width of Patch vs Height of Substrate")
ax.set_xlabel("Height of Substrate")
ax.set_ylabel("Width of Patch")
plt.show()

In [18]:
# Replace "width of patch" and "height of substrate" with one binary flag
# that is 1 when width > 4 AND height > 0.4, and 0 otherwise.
combined_col = "width of patch combined with height of substrate"

# Guard on the derived column so the cell can be re-run safely: previously a
# second run raised KeyError because the source columns had already been
# dropped in place.
if combined_col not in input_df.columns:
    is_wide_and_tall = (input_df["width of patch"] > 4) & (input_df["height of substrate"] > 0.4)
    # Build a new frame (flag appended last, source columns removed) instead
    # of mutating the existing one with inplace=True.
    input_df = (
        input_df
        .assign(**{combined_col: np.where(is_wide_and_tall, 1, 0)})
        .drop(columns=["width of patch", "height of substrate"])
    )

In [None]:
# Colour per value of the binary flag: 0 -> red, 1 -> blue.
color = {0: "red", 1: "blue"}
point_colors = input_df["width of patch combined with height of substrate"].map(color)

# One point per sample: x is just the running sample position, y is the
# averaged minimum magnitude.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(range(len(min_values_index)), min_values_output, c=point_colors)

ax.set_title("Minimum Magnitudes with respect to width of patch combined with height of substrate")
ax.set_ylabel("Minimum Magnitudes")
ax.set_xticks([])  # the x positions carry no meaning, so hide the ticks
plt.show()

In [None]:
# Standardise every column of `input_df` and wrap the result back into a
# DataFrame that keeps the original column names.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(input_df)
input_df_scaled = pd.DataFrame(data=scaled_values, columns=input_df.columns)

input_df_scaled.head()

In [None]:
# Correlation of each scaled input column with the averaged minimum
# magnitude, listed from highest to lowest.
input_df_scaled.corrwith(min_values_output).sort_values(ascending=False)

In [22]:
# PCA input: the scaled features without the binary flag column.
# `drop` returns a new frame, so `input_df_scaled` itself is left untouched.
pca_df = input_df_scaled.drop(columns=["width of patch combined with height of substrate"])

In [None]:
# PCA on the scaled inputs (binary flag excluded) with the default number of
# components.
pca = PCA()

# `fit` returns the estimator itself, so the previous code stored a PCA
# object under `principal_components`. `fit_transform` fits the same model
# and returns the component scores, matching the other PCA cells.
principal_components = pca.fit_transform(pca_df)

# Per-component standard deviation, variance share and running total.
explained_variance_df = pd.DataFrame({
    'Standard deviation': np.sqrt(pca.explained_variance_),
    'Proportion of Variance': pca.explained_variance_ratio_,
    'Cumulative Proportion': np.cumsum(pca.explained_variance_ratio_)
}, index=[f'Comp.{i+1}' for i in range(len(pca.explained_variance_))])

explained_variance_df

In [None]:
# Loadings table: one row per input feature, one column per component
# (`components_` is transposed so features end up on the rows).
component_labels = [f"Comp.{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
loadings_df = pd.DataFrame(
    data=pca.components_.T,
    index=pca_df.columns,
    columns=component_labels,
)

loadings_df

In [None]:
# Scree plot: variance share of each component, with a reference line at 0.1.
explained_variance = explained_variance_df["Proportion of Variance"]
component_numbers = range(1, len(explained_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(component_numbers, explained_variance, marker="o", linestyle="--", color="b")
ax.axhline(y=0.1, color="r", linestyle="--")

ax.set_title("Explained Variance of Principal Components")
ax.set_xlabel("Principal Components")
ax.set_ylabel("Variance Explained")
ax.set_xticks(component_numbers)
ax.grid()
plt.show()

In [None]:
# Cumulative variance share per component, with a reference line at 0.8.
cumulative_variance = explained_variance_df["Cumulative Proportion"]
component_numbers = range(1, len(cumulative_variance) + 1)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(component_numbers, cumulative_variance, marker="o", linestyle="--", color="b")
ax.axhline(y=0.8, color="r", linestyle="--")

ax.set_title("Cumulative Explained Variance of Principal Components")
ax.set_xlabel("Principal Components")
ax.set_ylabel("Variance Explained")
ax.set_xticks(component_numbers)
ax.grid()
plt.show()

In [None]:
# Project the scaled inputs onto the fitted components and label them PC1..PCn.
pc_labels = [f"PC{rank}" for rank in range(1, len(pca.explained_variance_) + 1)]
pc_scores = pd.DataFrame(pca.transform(pca_df), columns=pc_labels)

# Discard the last two components (PC8 and PC9).
transformed_input_df = pc_scores.drop(columns=["PC9", "PC8"])

# Re-attach the (scaled) binary flag that was left out of the PCA.
flag_col = "width of patch combined with height of substrate"
transformed_input_df[flag_col] = input_df_scaled[flag_col]

transformed_input_df.head()

In [None]:
# Compare four OLS fits for the real part: two targets (averaged minimum,
# first principal component) x two design matrices (scaled inputs, PCA inputs).

# Split everything in ONE call so the rows of both design matrices and both
# targets are guaranteed to stay aligned; previously this relied on two
# separate calls happening to share the same random_state.
(
    X_train, X_test,
    X_train_transformed, X_test_transformed,
    y_train_min_real, y_test_min_real,
    y_train_pca, y_test_pca,
) = train_test_split(
    input_df_scaled,
    transformed_input_df,
    min_values_real,
    principal_components_real,
    test_size=0.2,
    random_state=5,
)

# Add an explicit intercept column to both training design matrices.
X_train_transformed = sm.add_constant(X_train_transformed)
X_train = sm.add_constant(X_train)

model_1 = sm.OLS(y_train_min_real, X_train).fit()
model_2 = sm.OLS(y_train_min_real, X_train_transformed).fit()
model_3 = sm.OLS(y_train_pca, X_train).fit()
model_4 = sm.OLS(y_train_pca, X_train_transformed).fit()

fitted_models = [model_1, model_2, model_3, model_4]

comparison_df = pd.DataFrame({
    'Model': ['Min Real Values + Original Input', 'Min Real Values + PCA Components', 'PCA Real Values + Original Input', 'PCA Real Values + PCA Components'],
    'Adjusted R-squared': [m.rsquared_adj for m in fitted_models],
    'AIC': [m.aic for m in fitted_models],
    'BIC': [m.bic for m in fitted_models],
})

# Metrics as rows, one column per model. (The earlier bare `comparison_df.T`
# assignment was dead code: it was overwritten on the very next line.)
comparison_df_transposed = comparison_df.set_index('Model').T

comparison_df_transposed

In [None]:
# Training-set predictions of model_3 against the actual target values.
fig, ax = plt.subplots(figsize=(12, 6))

train_predictions = model_3.predict(X_train)
ax.scatter(y_train_pca, train_predictions, c="red")

# Identity line: points lying on it are predicted exactly.
ax.plot(y_train_pca, y_train_pca, color="blue")

ax.set_xlabel("Actual values")
ax.set_ylabel("Predicted values")
ax.set_title("Predicted vs Actual values")
plt.show()

In [None]:
# Compare four OLS fits for the imaginary part: two targets (averaged
# minimum, first principal component) x two design matrices (scaled inputs,
# PCA inputs).
# NOTE: `y_train_min_real` / `y_test_min_real` hold IMAGINARY-part targets in
# this cell; the names are kept unchanged so existing references keep working.

# Split everything in ONE call so the rows of both design matrices and both
# targets are guaranteed to stay aligned; previously this relied on two
# separate calls happening to share the same random_state.
(
    X_train, X_test,
    X_train_transformed, X_test_transformed,
    y_train_min_real, y_test_min_real,
    y_train_pca, y_test_pca,
) = train_test_split(
    input_df_scaled,
    transformed_input_df,
    min_values_img,
    principal_components_img,
    test_size=0.2,
    random_state=16,
)

# Add an explicit intercept column to both training design matrices.
X_train_transformed = sm.add_constant(X_train_transformed)
X_train = sm.add_constant(X_train)

model_1 = sm.OLS(y_train_min_real, X_train).fit()
model_2 = sm.OLS(y_train_min_real, X_train_transformed).fit()
model_3 = sm.OLS(y_train_pca, X_train).fit()
model_4 = sm.OLS(y_train_pca, X_train_transformed).fit()

fitted_models = [model_1, model_2, model_3, model_4]

comparison_df = pd.DataFrame({
    'Model': ['Min Img. Values + Original Input', 'Min Img. Values + PCA Components', 'PCA Img. Values + Original Input', 'PCA Img. Values + PCA Components'],
    'Adjusted R-squared': [m.rsquared_adj for m in fitted_models],
    'AIC': [m.aic for m in fitted_models],
    'BIC': [m.bic for m in fitted_models],
})

# Metrics as rows, one column per model. (The earlier bare `comparison_df.T`
# assignment was dead code: it was overwritten on the very next line.)
comparison_df_transposed = comparison_df.set_index('Model').T

comparison_df_transposed

In [None]:
# Training-set predictions of model_3 against the actual target values.
fig, ax = plt.subplots(figsize=(12, 6))

train_predictions = model_3.predict(X_train)
ax.scatter(y_train_pca, train_predictions, c="red")

# Identity line: points lying on it are predicted exactly.
ax.plot(y_train_pca, y_train_pca, color="blue")

ax.set_xlabel("Actual values")
ax.set_ylabel("Predicted values")
ax.set_title("Predicted vs Actual values")
plt.show()