In [160]:
import numpy as np
import pandas as pd
import seaborn as sns
import  matplotlib.pyplot as plt 

from scipy.io import arff
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

RANDOM_STATE = 42
np.random.seed = 42

## Part A: Preprocessing and Initial Setup [10 points]

### 1. Data Loading [2]

In [40]:
data, meta = arff.loadarff('./Data/yeast.arff')
df = pd.DataFrame(data)

for col in df.select_dtypes([object]).columns:
    df[col] = df[col].str.decode('utf-8').astype(int)

X = df.iloc[:, :-14]
Y = df.iloc[:, -14:]

### 2. Dimensionality Check

In [46]:
print(f'Number of Features is {df.shape[1]}')
print(f'Number of Data Points is {df.shape[0]}')

Number of Features is 117
Number of Data Points is 2417


### 3. Label Selection for Visualization [5]

In [129]:
# Most frequent single-label combination
single_label_filter = Y[Y.sum(axis=1) == 1]
single_label_ranks = single_label_filter.apply(tuple, axis=1).value_counts()
top_single_class_1 = single_label_ranks.keys()[0]

# Most frequent multi-label combination
multi_label_ranks = Y.apply(tuple, axis=1).value_counts()
top_multi_class_1 = multi_label_ranks.keys()[0]
top_multi_class_2 = multi_label_ranks.keys()[1]

print(f'Most frequent single-label class is: {top_single_class_1}')
print(f'Most frequent multi-label class is: {top_multi_class_1}')
print(f'Second most frequent multi-label class is: {top_multi_class_2}')

color_target = pd.Series('Other', index=Y.index)

top_single_class_1_mask = (Y.apply(tuple, axis=1) == top_single_class_1)
color_target[top_single_class_1_mask] = ' '.join(str(t) for t in top_single_class_1)

top_multi_class_1_mask = (Y.apply(tuple, axis=1) == top_multi_class_1)
color_target[top_multi_class_1_mask] = ' '.join(str(t) for t in top_multi_class_1)

top_multi_class_2_mask = (Y.apply(tuple, axis=1) == top_multi_class_2)
color_target[top_multi_class_2_mask] = ' '.join(str(t) for t in top_multi_class_2)

print("\nNew target variable for coloring:")
print(color_target.value_counts())

Most frequent single-label class is: (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)
Most frequent multi-label class is: (0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0)
Second most frequent multi-label class is: (0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0)

New target variable for coloring:
Other                          1915
0 0 1 1 0 0 0 0 0 0 0 1 1 0     237
0 0 0 1 1 0 0 0 0 0 0 1 1 0     233
1 0 0 0 0 0 0 0 0 0 0 0 0 0      32
Name: count, dtype: int64


### 4. Scaling [3]

- Scaling is crucial before applying distance-based dimensionality reduction techniques because features with larger scales (e.g., age ranging from 20-80) will disproportionately dominate the distance calculations over features with smaller scales (e.g., a binary flag of 0 or 1).

- This imbalance tricks the algorithm into believing the larger-scale features are more important, leading to a distorted and biased representation of the data's true structure. Scaling ensures that all features contribute equally to the distance metric, resulting in a more accurate and meaningful low-dimensional embedding.

In [138]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("\n--- Standardized Data ---")
print(X_scaled_df.describe().round(2))


--- Standardized Data ---
          Att1     Att2     Att3     Att4     Att5     Att6     Att7     Att8  \
count  2417.00  2417.00  2417.00  2417.00  2417.00  2417.00  2417.00  2417.00   
mean      0.00    -0.00    -0.00     0.00    -0.00    -0.00    -0.00     0.00   
std       1.00     1.00     1.00     1.00     1.00     1.00     1.00     1.00   
min      -3.82    -4.82    -3.47    -4.83    -3.80    -5.24    -3.30    -6.15   
25%      -0.56    -0.60    -0.59    -0.59    -0.62    -0.62    -0.61    -0.65   
50%       0.03    -0.03     0.03    -0.00     0.04    -0.00     0.05     0.01   
75%       0.58     0.50     0.63     0.56     0.67     0.61     0.70     0.63   
max       5.33     6.28     3.62     5.87     3.16     3.46     3.61     4.69   

          Att9    Att10  ...    Att94    Att95    Att96    Att97    Att98  \
count  2417.00  2417.00  ...  2417.00  2417.00  2417.00  2417.00  2417.00   
mean      0.00     0.00  ...    -0.00    -0.00    -0.00    -0.00    -0.00   
std       1.

## Part B: t-SNE and Veracity Inspection [20 points]

### 1. t-SNE Implementation [5]

In [None]:
perplexities = [5, 15, 25, 70]
fig, axes = plt.subplots(2, 2, figsize=(20, 15), dpi=150)
fig.suptitle('t-SNE with Different Perplexity Values', fontsize=18)

for i, perplexity in enumerate(perplexities):
    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=RANDOM_STATE, init='pca', learning_rate='auto')
    X_tsne = tsne.fit_transform(X_scaled)

    tsne_df = pd.DataFrame(data=X_tsne, columns=['TSNE1', 'TSNE2'])
    tsne_df['label'] = color_target.values
    
    ax = axes[i//2, i%2]
    sns.scatterplot(
            x='TSNE1', y='TSNE2',
            # hue='label',
            # palette=sns.color_palette("hls", n_colors=tsne_df['label'].nunique()),
            data=tsne_df,
            legend="full",
            alpha=0.8,
            ax=ax
        )

    ax.set_title(f'Perplexity = {perplexity}')
    ax.set_xlabel('')
    ax.set_ylabel('')

plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()

### 3. Label Selection for Visualization [5]

In [115]:
Y[top_single_class_1]

KeyError: (1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)

In [68]:
Y

Unnamed: 0,Class1,Class2,Class3,Class4,Class5,Class6,Class7,Class8,Class9,Class10,Class11,Class12,Class13,Class14
0,0,0,0,0,0,0,1,1,0,0,0,1,1,0
1,0,0,1,1,0,0,0,0,0,0,0,0,0,0
2,0,1,1,0,0,0,0,0,0,0,0,1,1,0
3,0,0,1,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2412,0,1,1,0,0,0,0,0,0,0,0,0,0,0
2413,1,1,0,0,0,0,0,0,0,0,0,1,1,0
2414,0,0,0,0,0,1,1,1,0,0,0,1,1,0
2415,0,0,0,0,0,0,0,0,0,0,0,1,1,0
