In [2]:
# Data Pre-processing: 

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Load the dataset
file_path = 'dataset.csv'
df = pd.read_csv(file_path)
print("Step 1: Initial Dataset Info")
# DataFrame.info() writes its report straight to stdout and returns None, so
# wrapping it in print() appended a stray "None" line after the report.
df.info()
print()  # blank separator line before the next step's output

# Step 2: Keep just the numeric columns; every non-numeric column is left out
numerical_df = df.select_dtypes(include="number")
print("Step 2: Numerical Attributes Retained")
print(numerical_df.head(), "\n")

# Step 3: Impute missing entries column by column with that column's mode
# Row 0 of mode() supplies one most-frequent value per column.
column_modes = numerical_df.mode().iloc[0]
numerical_df = numerical_df.fillna(value=column_modes)
print("Step 3: Missing Values Filled with Most Frequent Values")
print("Remaining Missing Values (should be 0):")
print(numerical_df.isna().sum(), "\n")

# Step 4: Rescale each numerical column (fit the scaler, then apply it)
scaler = StandardScaler().fit(numerical_df)
numerical_scaled = scaler.transform(numerical_df)
print("Step 4: Data Standardized")
# Per-column sanity check of the scaled array
print("Mean (should be ~0):", numerical_scaled.mean(axis=0))
print("Std Dev (should be ~1):", numerical_scaled.std(axis=0), "\n")

# Step 5: Apply PCA to extract the leading principal components
# Single source of truth for the component count: the PCA call, the
# loading-table labels and the reporting loop below all derive from it, so
# changing this one value cannot leave them out of sync.
N_COMPONENTS = 12
pca = PCA(n_components=N_COMPONENTS)
principal_components = pca.fit_transform(numerical_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
print("Step 5: PCA Applied")
print(f"Explained Variance Ratio: {explained_variance_ratio}")
print(f"Total Variance Explained: {explained_variance_ratio.sum():.2%}\n")

# Step 6: Display the top contributing attributes for each principal component
# Labels are generated from the fitted model (one per row of pca.components_)
# instead of being typed out by hand.
pc_labels = [f'PC{k}' for k in range(1, pca.components_.shape[0] + 1)]
# Transpose so that rows are the original attributes and columns are the PCs.
loading_scores = pd.DataFrame(
    data=pca.components_.T,
    columns=pc_labels,
    index=numerical_df.columns
)
print("Step 6: Top Contributing Attributes for Each Principal Component")
for pc_number, pc_label in enumerate(pc_labels, start=1):
    print(f"\nPrincipal Component {pc_number}:")
    # Rank attributes by the magnitude of their loading; the sign is discarded.
    top_features = loading_scores[pc_label].abs().sort_values(ascending=False).head(3)
    print(top_features)


Step 1: Initial Dataset Info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3319 entries, 0 to 3318
Data columns (total 33 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3319 non-null   object 
 1   datetime          3319 non-null   object 
 2   tempmax           3319 non-null   float64
 3   tempmin           3319 non-null   float64
 4   temp              3319 non-null   float64
 5   feelslikemax      3319 non-null   float64
 6   feelslikemin      3319 non-null   float64
 7   feelslike         3319 non-null   float64
 8   dew               3319 non-null   float64
 9   humidity          3319 non-null   float64
 10  precip            3319 non-null   float64
 11  precipprob        3319 non-null   int64  
 12  precipcover       3319 non-null   float64
 13  preciptype        1530 non-null   object 
 14  snow              3319 non-null   float64
 15  snowdepth         3319 non-null   float64
 16  windgust     