In [1]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
data = pd.read_csv('Angry+Birds+(Cardinal+Personalities).xlsx+-+Data+for+assignment.csv')

features = ['Exploration', 'Neophobia', 'Neophilia', 'Aggression', 'Boldness','Personality']
X = data[features]
X

Unnamed: 0,Exploration,Neophobia,Neophilia,Aggression,Boldness,Personality
0,0.87026,1.85823,0.83398,0.43279,-1.35431,Angry Bird
1,1.46306,-0.60086,-0.92405,0.72763,1.08221,Angry Bird
2,1.94799,-0.05401,0.24467,-1.89158,-0.18457,A Pretty Chill Bird
3,0.06201,-0.54020,0.17969,0.04281,1.08080,Standard Bird
4,-0.96196,0.30524,-0.45602,-0.23742,0.56366,A Pretty Chill Bird
...,...,...,...,...,...,...
62,-0.90804,-0.53611,-1.30760,0.43248,-0.71326,A Pretty Chill Bird
63,-0.20749,1.87524,1.29706,0.54875,-0.39362,Standard Bird
64,0.38521,1.39855,-1.21477,0.47695,-1.49095,Standard Bird
65,-0.26138,0.06525,-0.15292,-1.89158,-0.90095,A Pretty Chill Bird


In [3]:
X_train = X.drop(columns='Personality')
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_train)
X_scaled

array([[ 0.87682815,  1.87225552,  0.84027397,  0.43605658, -1.36453172],
       [ 1.47410221, -0.60539536, -0.93102435,  0.73312205,  1.0903783 ],
       [ 1.96269215, -0.05441781,  0.24651629, -1.90585793, -0.18596292],
       [ 0.06247801, -0.54427751,  0.18104587,  0.04313298,  1.08895766],
       [-0.96922024,  0.30754376, -0.45946201, -0.23921221,  0.56791446],
       [ 0.38815761, -0.53572343, -0.6378281 ,  0.1131878 , -1.01777412],
       [-1.02350689, -0.40897391,  0.33124094, -0.79892532, -0.9788324 ],
       [-0.58912301, -0.53510882,  0.50028722, -1.90585793,  0.72397352],
       [-1.186387  ,  0.09950528, -1.000797  , -1.31187811,  0.85110588],
       [ 0.11671428, -0.53868562, -0.80284419, -0.32921648,  0.89702991],
       [-0.69767616, -0.56162748,  1.34613323,  1.33355017,  0.9729184 ],
       [ 1.36543823,  0.04956113, -0.54333024,  1.21953603, -0.99163833],
       [-0.20904592, -0.2346379 ,  0.25279331,  0.29831468, -1.60690739],
       [-1.186387  , -2.95920854,  1.2

In [4]:
# Perform K-Means clustering with 3 clusters
kmeans = KMeans(n_clusters=3, random_state=42)
X['Cluster'] = kmeans.fit_predict(X_scaled)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Cluster'] = kmeans.fit_predict(X_scaled)


In [5]:
# Assuming X is your DataFrame with columns ['Neophilia', 'Aggression', 'Exploration', 'Personality', 'Cluster']

# Create a 3D scatter plot for Personality
fig1 = px.scatter_3d(X, x='Neophilia', y='Aggression', z='Exploration', color='Personality',
                     color_discrete_sequence=px.colors.qualitative.Dark24, size_max=10)
fig1.update_traces(marker=dict(size=5))

# Create a 3D scatter plot for Cluster
fig2 = px.scatter_3d(X, x='Neophilia', y='Aggression', z='Exploration', color='Cluster',
                     color_discrete_sequence=px.colors.qualitative.Dark24, size_max=10)
fig2.update_traces(marker=dict(size=5))

# Customize layout for subplots
fig = make_subplots(rows=1, cols=2, 
                    specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}]], 
                    subplot_titles=("K-Means Clustering of Bird Personalities", 
                                    "K-Means Clustering of Bird Clusters"))

# Add first plot to the subplot
fig.add_traces(fig1.data, rows=1, cols=1)

# Add second plot to the subplot
fig.add_traces(fig2.data, rows=1, cols=2)

# Update layout
fig.update_layout(scene=dict(
                        xaxis_title='Neophilia',
                        yaxis_title='Aggression',
                        zaxis_title='Exploration'),
                  scene2=dict(
                        xaxis_title='Neophilia',
                        yaxis_title='Aggression',
                        zaxis_title='Exploration'),
                  height=600, width=1200, title_text="3D PCA Comparison of Personalities and Clusters", 
                  title_x=0.5)

# Show the plot
fig.show()

In [6]:
# Perform PCA to capture 90% of the variance
pca = PCA(n_components=0.90)  # Automatically chooses the number of components to capture 90% variance
X_pca = pca.fit_transform(X_train)

# Create a dataframe for the PCA results
pca_df = pd.DataFrame(data=X_pca, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
pca_df['Personality'] = X['Personality']
pca_df['Cluster'] = X['Cluster']  # Add the original personality labels

# Create a dataframe for the PCA loadings (how much each feature contributes to each PC)
loadings_df = pd.DataFrame(pca.components_.T, columns=[f'PC{i+1}' for i in range(pca.n_components_)], index=X_train.columns)

# Print the explained variance ratio for each component
print("Explained variance ratio by each component:")
for i, ratio in enumerate(pca.explained_variance_ratio_, 1):
    print(f"PC{i}: {ratio:.2%}")

# Print the total number of components used
print(f"\nTotal number of components used to capture 90% of the variance: {pca.n_components_}")

Explained variance ratio by each component:
PC1: 27.59%
PC2: 26.89%
PC3: 20.50%
PC4: 13.32%
PC5: 11.70%

Total number of components used to capture 90% of the variance: 5


In [7]:
loadings_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5
Exploration,-0.288984,-0.435583,0.649387,-0.484298,0.265532
Neophobia,0.138574,-0.559918,-0.566097,-0.504625,-0.303606
Neophilia,0.711646,-0.120649,-0.059768,-0.018357,0.689271
Aggression,-0.612651,0.07889,-0.504235,-0.05185,0.601244
Boldness,0.124516,0.689913,-0.002229,-0.71259,-0.026968


In [8]:
pca_df

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,Personality,Cluster
0,0.165728,-2.420358,-0.751856,-0.431856,0.538487,Angry Bird,1
1,-1.474692,0.614668,0.976158,-1.197285,0.342291,Angry Bird,0
2,0.739592,-1.124353,2.235164,-0.691043,-0.430027,A Pretty Chill Bird,2
3,0.143447,1.002813,0.311339,-0.533118,0.300920,Standard Bird,0
4,0.211404,0.673268,-0.651766,-0.069133,-0.820374,A Pretty Chill Bird,0
...,...,...,...,...,...,...,...
62,-1.096202,0.395496,-0.424508,1.220138,-0.700378,A Pretty Chill Bird,0
63,0.857664,-1.344363,-1.549653,-0.617579,0.610144,Standard Bird,1
64,-1.259854,-1.795303,-0.706132,0.167706,-0.832656,Standard Bird,1
65,1.022447,-0.675036,0.758274,0.836553,-1.307622,A Pretty Chill Bird,2


In [9]:
pca_df.to_csv('pca_df.csv',index=False)

In [10]:
# Assuming `pca_df` contains the columns: ['PC1', 'PC2', 'PC3', 'Cluster', 'Personality']

# Create a 3D scatter plot for Cluster
fig1 = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Cluster',
                     color_discrete_sequence=px.colors.qualitative.Dark24, size_max=10)
fig1.update_traces(marker=dict(size=5))

# Create a 3D scatter plot for Personality
fig2 = px.scatter_3d(pca_df, x='PC1', y='PC2', z='PC3', color='Personality',
                     color_discrete_sequence=px.colors.qualitative.Dark24, size_max=10)
fig2.update_traces(marker=dict(size=5))

# Create subplots
fig = make_subplots(rows=1, cols=2, 
                    specs=[[{'type': 'scatter3d'}, {'type': 'scatter3d'}]], 
                    subplot_titles=(
                        f"PC1 vs PC2 (Explained Variance: {pca.explained_variance_ratio_[0]:.2%}, {pca.explained_variance_ratio_[1]:.2%}) - Cluster",
                        f"PC1 vs PC2 (Explained Variance: {pca.explained_variance_ratio_[0]:.2%}, {pca.explained_variance_ratio_[1]:.2%}) - Personality"
                    ))

# Add the Cluster plot to the first subplot
fig.add_traces(fig1.data, rows=1, cols=1)

# Add the Personality plot to the second subplot
fig.add_traces(fig2.data, rows=1, cols=2)

# Update layout
fig.update_layout(scene=dict(
                        xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.2%} Variance)',
                        yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.2%} Variance)',
                        zaxis_title=f'PC3 ({pca.explained_variance_ratio_[2]:.2%} Variance)'),
                  scene2=dict(
                        xaxis_title=f'PC1 ({pca.explained_variance_ratio_[0]:.2%} Variance)',
                        yaxis_title=f'PC2 ({pca.explained_variance_ratio_[1]:.2%} Variance)',
                        zaxis_title=f'PC3 ({pca.explained_variance_ratio_[2]:.2%} Variance)'),
                  height=600, width=1200, title_text="3D PCA Comparison: Cluster vs Personality", 
                  title_x=0.5)

# Show the plot
fig.show()