In [None]:
#PCA analysis:

#Explained variance ratio for 20 principal components.

from sklearn.decomposition import PCA
import numpy as np
import pandas as pd
from sklearn import metrics, model_selection, preprocessing
import matplotlib.pyplot as plt


# Load the descriptor table. The workbook holds it as 109 sheets named
# 'descriptors.xls_0' ... 'descriptors.xls_108'; the sheets are joined side by
# side (column-wise) into one frame.
N_SHEETS = 109

xls1 = pd.ExcelFile('polymer properties.xls')

# Read every sheet first and concatenate once. Growing a DataFrame with
# pd.concat inside a loop re-copies all previously loaded columns on each pass.
sheet_frames = [
    pd.read_excel(xls1, sheet_name=f'descriptors.xls_{sheet_idx}')
    for sheet_idx in range(N_SHEETS)
]
file_part1 = pd.concat(sheet_frames, axis=1)

dataset = file_part1.values

# Column 1: sample name
A = dataset[:, 1]
# Column 2: experimental value
Y = dataset[:, 2]
# Columns 3 onward: descriptor values
X = dataset[:, 3:]

# Min-max scale the descriptor matrix; the fitted scaler is kept under its
# original name so it stays available to other cells.
min_max_scaler = preprocessing.MinMaxScaler().fit(X)
X_scale = min_max_scaler.transform(X)

# Fit PCA with default settings on the scaled descriptors and project the
# samples onto the principal components.
pca = PCA()
X_pca = pca.fit_transform(X_scale)
print(X_pca.shape)

# Per-component explained-variance ratio and its running total
explained_var = pca.explained_variance_ratio_
cumulative_var = explained_var.cumsum()

# Plot percentage variance explained and cumulative explained variance ratio versus number of principal components
# Scree plot: per-component explained-variance ratio as bars, with the
# cumulative ratio drawn as a line on the same axes.
# Number of PCs to plot (change the 20 here). It is capped at the number of
# components PCA produced so the x positions and the sliced ratios always have
# the same length.
num_pcs = min(20, len(explained_var))
pc_numbers = np.arange(1, num_pcs + 1)

fig, ax = plt.subplots(dpi=200)

bars = ax.bar(pc_numbers, explained_var[:num_pcs], label='Percentage Variance Explained',
              edgecolor='k', color='#76EEC6')
ax.plot(pc_numbers, cumulative_var[:num_pcs], label='Cumulative Explained Variance Ratio', color='k')
ax.set_xlabel('Number of Principal Components', fontsize=12, fontname='Times New Roman')
ax.set_ylabel('Explained Variance', fontsize=12, fontname='Times New Roman')
# The legend font size is set inside `prop`: matplotlib ignores the separate
# `fontsize` argument whenever `prop` is supplied.
ax.legend(prop={'family': 'Times New Roman', 'size': 7})

# One x tick per plotted component; y ticks every 0.2 from 0 to 1
ax.set_xticks(pc_numbers)
ax.set_yticks(np.arange(0, 1.1, 0.2))
ax.tick_params(axis='both', which='both', direction='in', width=1, length=2)

def autolabel(rects, axes=None):
    """Write each bar's height just above it as a percentage with one decimal.

    Parameters
    ----------
    rects : iterable
        Bar patches (for example the container returned by ``ax.bar``). Each
        item must provide ``get_height()``, ``get_x()`` and ``get_width()``.
    axes : optional
        Axes object to annotate. When omitted, the notebook-level ``ax`` is
        used, so existing ``autolabel(bars)`` calls keep working.
    """
    target = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        target.annotate(
            f'{height:.1%}',
            # Anchor at the horizontal centre of the bar, at its top edge
            xy=(rect.get_x() + rect.get_width() / 2, height),
            # Shift the text 3 points upward so it does not touch the bar
            xytext=(0, 3),
            textcoords="offset points", ha='center', va='bottom', fontsize=5,
        )

# Annotate every bar with its explained-variance percentage, then render
autolabel(rects=bars)

plt.show()

In [None]:
#PCA score plot analysis (PC1 vs. PC2)


import matplotlib.pyplot as plt

# Scores of every sample on the two components shown in this plot
PC1 = X_pca[:, 0]
PC2 = X_pca[:, 1]

# A dedicated axes name is used so the `ax` that autolabel() reads is not overwritten.
fig, score_ax = plt.subplots(dpi=500)

# Row range in X_pca and marker style for each property subset.
# NOTE(review): the boundaries (123, 133, 146, 164) are hard-coded row
# positions -- presumably the row order of the source workbook; verify if the
# dataset changes.
property_groups = [
    (slice(None, 123), dict(color="k", marker='o', facecolors='none', label='$C_{p}$')),   # hollow black circles
    (slice(123, 133), dict(color="#6A5ACD", marker='^', label='$C_{v}$')),                 # triangles
    (slice(133, 146), dict(color="#FFCE56", marker='*', label='Flexural stress')),         # stars
    (slice(146, 164), dict(color="#00FF7F", marker='s', label='Shear modulus')),           # squares
    (slice(164, None), dict(color="#D02090", marker='D', label="Dynamic viscosity")),      # diamonds
]
for rows, style in property_groups:
    score_ax.scatter(PC1[rows], PC2[rows], **style)

# Axis labels. NOTE(review): the percentages are hard-coded; they should match
# pca.explained_variance_ratio_ for the plotted components -- confirm when the
# data change.
score_ax.set_xlabel('PC1 (29.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
score_ax.set_ylabel('PC2 (9.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
# The legend font size is set inside `prop`: matplotlib ignores the separate
# `fontsize` argument whenever `prop` is supplied.
score_ax.legend(loc='best', prop={'family': 'Times New Roman', 'size': 7})
# Show the plot
plt.show()

In [None]:
#PCA score plot analysis (PC1 vs. PC3)


import matplotlib.pyplot as plt

# Scores of every sample on the first and third principal components
PC1 = X_pca[:, 0]
PC3 = X_pca[:, 2]

# A dedicated axes name is used so the `ax` that autolabel() reads is not overwritten.
fig, score_ax = plt.subplots(dpi=500)

# Row range in X_pca and marker style for each property subset.
# NOTE(review): the boundaries (123, 133, 146, 164) are hard-coded row
# positions -- presumably the row order of the source workbook; verify if the
# dataset changes.
property_groups = [
    (slice(None, 123), dict(color="k", marker='o', facecolors='none', label='$C_{p}$')),   # hollow black circles
    (slice(123, 133), dict(color="#6A5ACD", marker='^', label='$C_{v}$')),                 # triangles
    (slice(133, 146), dict(color="#FFCE56", marker='*', label='Flexural stress')),         # stars
    (slice(146, 164), dict(color="#00FF7F", marker='s', label='Shear modulus')),           # squares
    (slice(164, None), dict(color="#D02090", marker='D', label="Dynamic viscosity")),      # diamonds
]
for rows, style in property_groups:
    score_ax.scatter(PC1[rows], PC3[rows], **style)

# Axis labels. NOTE(review): the percentages are hard-coded; they should match
# pca.explained_variance_ratio_ for the plotted components -- confirm when the
# data change.
score_ax.set_xlabel('PC1 (29.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
score_ax.set_ylabel('PC3 (6.2%)', labelpad=12, fontsize=22, fontname='Times New Roman')
# The legend font size is set inside `prop`: matplotlib ignores the separate
# `fontsize` argument whenever `prop` is supplied.
score_ax.legend(loc='best', prop={'family': 'Times New Roman', 'size': 7})
# Show the plot
plt.show()

In [None]:
#PCA score plot analysis (PC1 vs. PC4)


import matplotlib.pyplot as plt

# Scores of every sample on the first and fourth principal components
PC1 = X_pca[:, 0]
PC4 = X_pca[:, 3]

# A dedicated axes name is used so the `ax` that autolabel() reads is not overwritten.
fig, score_ax = plt.subplots(dpi=500)

# Row range in X_pca and marker style for each property subset.
# NOTE(review): the boundaries (123, 133, 146, 164) are hard-coded row
# positions -- presumably the row order of the source workbook; verify if the
# dataset changes.
property_groups = [
    (slice(None, 123), dict(color="k", marker='o', facecolors='none', label='$C_{p}$')),   # hollow black circles
    (slice(123, 133), dict(color="#6A5ACD", marker='^', label='$C_{v}$')),                 # triangles
    (slice(133, 146), dict(color="#FFCE56", marker='*', label='Flexural stress')),         # stars
    (slice(146, 164), dict(color="#00FF7F", marker='s', label='Shear modulus')),           # squares
    (slice(164, None), dict(color="#D02090", marker='D', label="Dynamic viscosity")),      # diamonds
]
for rows, style in property_groups:
    score_ax.scatter(PC1[rows], PC4[rows], **style)

# Axis labels. NOTE(review): the percentages are hard-coded; they should match
# pca.explained_variance_ratio_ for the plotted components -- confirm when the
# data change.
score_ax.set_xlabel('PC1 (29.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
score_ax.set_ylabel('PC4 (5.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
# The legend font size is set inside `prop`: matplotlib ignores the separate
# `fontsize` argument whenever `prop` is supplied.
score_ax.legend(loc='best', prop={'family': 'Times New Roman', 'size': 7})
# Show the plot
plt.show()

In [None]:
#PCA score plot analysis (PC2 vs. PC3)


import matplotlib.pyplot as plt

# Scores of every sample on the second and third principal components.
# X_pca columns are 0-based, so PC2 is column 1 and PC3 is column 2; these
# indices must agree with the 'PC2' / 'PC3' axis labels below.
PC2 = X_pca[:, 1]
PC3 = X_pca[:, 2]

# A dedicated axes name is used so the `ax` that autolabel() reads is not overwritten.
fig, score_ax = plt.subplots(dpi=500)

# Row range in X_pca and marker style for each property subset.
# NOTE(review): the boundaries (123, 133, 146, 164) are hard-coded row
# positions -- presumably the row order of the source workbook; verify if the
# dataset changes.
property_groups = [
    (slice(None, 123), dict(color="k", marker='o', facecolors='none', label='$C_{p}$')),   # hollow black circles
    (slice(123, 133), dict(color="#6A5ACD", marker='^', label='$C_{v}$')),                 # triangles
    (slice(133, 146), dict(color="#FFCE56", marker='*', label='Flexural stress')),         # stars
    (slice(146, 164), dict(color="#00FF7F", marker='s', label='Shear modulus')),           # squares
    (slice(164, None), dict(color="#D02090", marker='D', label="Dynamic viscosity")),      # diamonds
]
for rows, style in property_groups:
    score_ax.scatter(PC2[rows], PC3[rows], **style)

# Axis labels. NOTE(review): the percentages are hard-coded; they should match
# pca.explained_variance_ratio_ for the plotted components -- confirm when the
# data change.
score_ax.set_xlabel('PC2 (9.1%)', labelpad=12, fontsize=22, fontname='Times New Roman')
score_ax.set_ylabel('PC3 (6.2%)', labelpad=12, fontsize=22, fontname='Times New Roman')
# The legend font size is set inside `prop`: matplotlib ignores the separate
# `fontsize` argument whenever `prop` is supplied.
score_ax.legend(loc='best', prop={'family': 'Times New Roman', 'size': 7})
# Show the plot
plt.show()