# Principal Component Analysis

### Libraries

In [ ]:
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation

### Load dataset
The dataset is loaded from a CSV file.

In [ ]:
# The file is semicolon-separated and uses a comma as the decimal mark;
# no column is promoted to the index.
data = pd.read_csv(
    'df_enorm.csv',
    sep=';',
    decimal=",",
    index_col=None,
)

### Parameters
Parameters needed in the following cells

In [ ]:
Application = 'R'   # R for Regression | C for Classification
n_components = 16
# Exclusive end positions of the feature-column windows sliced below
# (start offset + n_components).
num_features_classif = 1 + n_components     # 1 + 16 = 17
num_features_regress = 7 + n_components     # 7 + 16 = 23

# Desired columns: the start offsets (7 and 1) must be changed for each dataset.
# Any Application value other than 'R' takes the classification window.
X = (
    data.iloc[:, 7:num_features_regress]
    if Application == 'R'
    else data.iloc[:, 1:num_features_classif]
)
features = X.columns

### Elbow rule
Used to find out how many components are needed to represent a given percentage of the total variance.
It is commented out because it only needs to be run once.

In [ ]:
# # Elbow rule
# pca = PCA()
# pca.fit(X)
#
# ############### JUST TO STUDY THE NUMBER OF COMPONENTS ##############
# # Accumulative variance
# cumulative_explained_variance = pca.explained_variance_ratio_.cumsum()
#
# # Elbow rule chart
# plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o', linestyle='--')
# plt.axhline(y=0.80, color="r", linestyle="-")
# plt.axhline(y=0.90, color="purple", linestyle="-")
# plt.axhline(y=0.95, color="orange", linestyle="-")
# plt.xlabel('Number of principal components')
# plt.ylabel('Explained Accumulative Variance')
# plt.title('Principal Components Analysis (PCA) - Elbow rule')
# plt.savefig(f"Features/new_data/PCA/{Application}/Number_of_components.png")
# plt.show()
# ############### JUST TO STUDY THE NUMBER OF COMPONENTS ##############

### Training
In this case, an unsupervised method, principal component analysis, is fitted with the PCA class from the sklearn.decomposition library.

In [ ]:
# Train the model
# One label per retained component: "PC1", "PC2", ..., "PC<n_components>".
components = [f"PC{idx}" for idx in range(1, n_components + 1)]

# Fit PCA on X and project the samples onto the retained components;
# pc has one row per sample and one column per component.
pca = PCA(n_components=n_components)
pc = pca.fit_transform(X)

### Visualizing the results

In [ ]:
# Biplot: samples projected onto the first two principal components
plt.scatter(pc[:, 0], pc[:, 1], alpha=0.7)

# One arrow per original feature, from the origin to that feature's
# coefficients in PC1 and PC2; the label is placed 30 % beyond the arrow tip.
for name, coeffs in zip(features, pca.components_.T):
    dx, dy = coeffs[0], coeffs[1]
    plt.arrow(0, 0, dx, dy, color='yellow', alpha=0.5)
    plt.text(
        dx * 1.3,
        dy * 1.3,
        f'{name}',
        color='black',
        ha='center',
        va='center',
        fontsize=3.5,
    )

plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.savefig(f"PCA/{Application}/Biplot_{n_components}Components.png")
plt.show()

### Save the data in a dataframe

In [ ]:
# Wrap the projected samples in a DataFrame, one column per component label,
# and show the first rows as a quick sanity check.
pca_df = pd.DataFrame(pc, columns=components)
print(pca_df.head())

### Explore variance per component

In [ ]:
# Fraction of the total variance captured by each retained component
explained_variance = pca.explained_variance_ratio_
print(f'Varianza explicada por cada componente: {explained_variance}')

# Coefficient table: one row per principal component, one column per input feature
components_df = pd.DataFrame(
    data=pca.components_,
    index=components,
    columns=features,
)
print(components_df)

### Explained variance chart per component

In [ ]:
# Bar chart of the explained-variance ratio of each principal component.
# The bars are labelled with the component names ("PC1", "PC2", ...) because
# explained_variance holds one value per component, not per input feature.
plt.bar(components, explained_variance)
plt.xlabel("Principal Components")
plt.ylabel("Explained Variance")
plt.title("Explained Variance per Principal Components")
plt.xticks(fontsize=4.5)
plt.savefig(f"PCA/{Application}/ExplainedComponents_{n_components}Components.png")
plt.show()

### Biplot arrow gif per feature
Due to the small size of the arrows, a gif is created with one frame per arrow, so that the animation shows where each arrow is and the magnitude of the corresponding arrow.

In [ ]:
fig, ax = plt.subplots()
info = 'Biplot' # Arrows or Biplot


def update(frame):
    """Redraw the axes for one animation frame, showing a single feature arrow.

    All drawing goes through ``ax`` explicitly, so the frame is rendered on
    the figure being animated regardless of which figure pyplot currently
    considers active.

    Args:
        frame: Index of the feature to draw; selects ``features[frame]`` and
            row ``frame`` of ``pca.components_.T``.
    """
    # Start from an empty axes so that only one arrow is visible per frame.
    ax.clear()
    # In 'Arrows' mode the sample scatter is omitted and only the arrow is drawn.
    if info != 'Arrows':
        ax.scatter(pc[:, 0], pc[:, 1], alpha=0.7)
    ax.set_xlabel('Principal Component 1')
    ax.set_ylabel('Principal Component 2')
    ax.set_title(features[frame])

    # Draw an arrow per frame: from the origin to the feature's coefficients
    # in the first two components, with the label 30 % beyond the tip.
    variable = pca.components_.T[frame]
    ax.arrow(0, 0, variable[0], variable[1], color='yellow', alpha=0.5)
    ax.text(variable[0] * 1.3, variable[1] * 1.3, f'{features[frame]}', color='black', ha='center', va='center', fontsize=8)

# Build the animation with one frame per input feature (the second dimension
# of pca.components_), shown for 1000 ms each and played once, then write it
# out as a gif through the pillow writer.
frames = pca.components_.shape[1]
animation = FuncAnimation(
    fig,
    update,
    frames=frames,
    interval=1000,
    repeat=False,
)
animation.save(f'PCA/{Application}/{info}_animation.gif', writer='pillow')