# In [15]:
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import copy

# Load the raw CSV (raw string so the backslashes in the Windows path stay literal).
df = pd.read_csv(r"C:\Adithya\IITK\IITK EDU\ML Intern\2016WT6_ALL.csv")

# NOTE(review): keep_cols is not referenced by any code visible here, and two
# of its entries ('Grd_Prod_Pwr_Avg', 'Prod_LatestAvg_TotActPwr') also appear
# in the explicit drop list further down -- confirm the intent.
keep_cols = [
    'Grd_Prod_Pwr_Avg',
    'Prod_LatestAvg_ActPwrGen0',
    'Prod_LatestAvg_ActPwrGen1',
    'Prod_LatestAvg_TotActPwr',
    'Amb_WindDir_Relative_Avg',
]

# Columns to drop, part 1: every 'Grd_' column whose name contains 'Pwr',
# except 'Grd_Prod_Pwr_Avg' (which is nevertheless dropped explicitly below).
grd_power_cols = []
for col in df.columns:
    is_grd_power = col.startswith('Grd_') and 'Pwr' in col
    if is_grd_power and col != 'Grd_Prod_Pwr_Avg':
        grd_power_cols.append(col)

# Columns to drop, part 2: anything with '_Min', '_Max' or '_Std' in its name.
min_max_std_cols = [
    col for col in df.columns
    if '_Min' in col or '_Max' in col or '_Std' in col
]

# Columns to drop, part 3: nacelle and absolute wind direction columns.
unwanted_wind_dir_cols = ['Amb_WindDir_Abs_Avg', 'Nac_Direction_Avg']

# Columns to drop, part 4: individually named columns (each listed once).
explicit_drop_cols = [
    'Prod_LatestAvg_ActPwrGen2',
    'Turbine_ID',
    'Timestamp',
    'Grd_Prod_PsbleInd_Avg',
    'Grd_Prod_PsbleCap_Avg',
    'Prod_LatestAvg_TotActPwr',
    'Prod_LatestAvg_ReactPwrGen0',
    'Prod_LatestAvg_ReactPwrGen1',
    'Prod_LatestAvg_ReactPwrGen2',
    'Prod_LatestAvg_TotReactPwr',
    'Grd_Prod_Pwr_Avg',
    'Amb_WindSpeed_Est_Avg',
]

# Union of all four groups; the set removes names selected more than once.
drop_cols = list({
    *grd_power_cols,
    *min_max_std_cols,
    *unwanted_wind_dir_cols,
    *explicit_drop_cols,
})

# Reduced DataFrame (drop raises KeyError if a listed column is missing).
df_reduced = df.drop(columns=drop_cols)


# df_reduced.info()
# Absolute pairwise correlations; only the disabled heatmap below uses them.
cm = df_reduced.corr().abs()
# plt.figure(figsize=(25,25))
# sns.heatmap(cm, annot=True, cmap="coolwarm", linewidths=.5)

# Plain NumPy matrix of the remaining columns, used by the clustering code.
X = np.array(df_reduced)

# Function to perform elbow-based KMeans clustering
def elbow_kmeans(data, max_k=10):
    """Cluster the rows of ``data`` with KMeans, picking k by an elbow heuristic.

    KMeans (``n_init=10``, ``random_state=42``) is fitted for every k from 1
    up to ``max_k``, capped at the number of rows because KMeans cannot form
    more clusters than there are samples. The selected k is the one at which
    the discrete second difference of the inertia curve is largest, i.e.
    where the curve bends most sharply.

    Args:
        data: Array-like accepted by ``KMeans.fit``; rows are samples.
        max_k: Largest number of clusters to try.

    Returns:
        A list with one entry per cluster. Each entry is the ascending list
        of row indices of ``data`` assigned to that cluster.

    Raises:
        ValueError: If fewer than three values of k can be evaluated
            (``max_k < 3`` or fewer than three rows); the second difference
            needs at least three inertia values.
    """
    n_samples = data.shape[0] if hasattr(data, "shape") else len(data)
    k_limit = min(max_k, n_samples)
    if k_limit < 3:
        raise ValueError(
            "elbow_kmeans needs at least 3 candidate values of k, got "
            f"max_k={max_k} with {n_samples} sample(s)"
        )

    inertias = []
    labels_by_k = []
    for k in range(1, k_limit + 1):
        km = KMeans(n_clusters=k, n_init=10, random_state=42)
        km.fit(data)
        inertias.append(km.inertia_)
        # Keep the labels so the winning model does not have to be refitted;
        # a refit with the same seed and parameters would repeat this fit.
        labels_by_k.append(km.labels_)

    second_deriv = np.diff(inertias, n=2)
    # second_deriv[i] is centred on inertias[i + 1], which belongs to k = i + 2.
    best_k = int(np.argmax(second_deriv)) + 2
    labels = labels_by_k[best_k - 1]

    return [
        np.flatnonzero(labels == cluster_id).tolist()
        for cluster_id in range(best_k)
    ]

# Function 1: Partition columns, perform KMeans on both sides
def function1(original_matrix, group_a_idx, group_b_idx, max_k=10):
    """Cluster the rows twice, once per column group.

    Args:
        original_matrix: Matrix indexed as ``original_matrix[:, idx]``.
        group_a_idx: Column indices forming the first feature group.
        group_b_idx: Column indices forming the second feature group.
        max_k: Forwarded to ``elbow_kmeans`` as the largest k to try.

    Returns:
        A 2-tuple ``(clusters_for_group_a, clusters_for_group_b)`` holding
        the ``elbow_kmeans`` result for each column group.
    """
    # Slice both column groups first, then cluster them in order (a, then b).
    column_blocks = [original_matrix[:, idx] for idx in (group_a_idx, group_b_idx)]
    return tuple(elbow_kmeans(block, max_k) for block in column_blocks)

# Function 2: Mutual information-like score H(P) - H(P|Q)
def function2(partition_P, partition_Q, n_samples=None):
    """Return H(P) - H(P|Q) for two clusterings of the same samples.

    Entropies use the natural logarithm. Cluster probabilities are cluster
    sizes divided by ``n_samples``; conditional probabilities are the share
    of a Q cluster's members that also belong to a given P cluster.

    Args:
        partition_P: Iterable of clusters, each a list of hashable sample ids.
        partition_Q: Second clustering, in the same format.
        n_samples: Total number of samples. When ``None`` it is inferred as
            the summed size of the clusters in ``partition_P``.

    Returns:
        The score as a float; ``0.0`` when ``n_samples`` is inferred and
        ``partition_P`` contains no samples.
    """
    if n_samples is None:
        # The default used to reach the division below and raise TypeError.
        n_samples = sum(len(cluster) for cluster in partition_P)
        if n_samples == 0:
            return 0.0

    # Entropy H(P)
    H_P = 0.0
    for cluster in partition_P:
        p_i = len(cluster) / n_samples
        if p_i > 0:
            H_P += -p_i * np.log(p_i)

    # Build the P-side sets once instead of re-scanning every P cluster for
    # every Q cluster.
    p_sets = [set(p_cluster) for p_cluster in partition_P]

    # Conditional Entropy H(P|Q)
    H_P_given_Q = 0.0
    for q_cluster in partition_Q:
        q_size = len(q_cluster)
        p_q = q_size / n_samples
        q_set = set(q_cluster)
        for p_set in p_sets:
            intersect_size = len(q_set & p_set)
            # Empty intersections contribute nothing (0 * log 0 is taken as 0);
            # this also avoids dividing by q_size when the Q cluster is empty.
            if intersect_size > 0:
                p_i_given_q = intersect_size / q_size
                H_P_given_Q += -p_q * p_i_given_q * np.log(p_i_given_q)

    return H_P - H_P_given_Q

# Function 3: Complete pipeline
def function3(Complete_Dataset, group_a_idx, group_b_idx, max_k_to_be_checked=10):
    """Cluster the rows on each column group and score how the results agree.

    Args:
        Complete_Dataset: Matrix passed to ``function1``; its ``shape[0]`` is
            used as the sample count.
        group_a_idx: Column indices of the first feature group.
        group_b_idx: Column indices of the second feature group.
        max_k_to_be_checked: Largest k tried by the elbow search.

    Returns:
        The value of ``function2`` for the two clusterings.
    """
    partition_a, partition_b = function1(
        Complete_Dataset, group_a_idx, group_b_idx, max_k_to_be_checked
    )
    row_count = Complete_Dataset.shape[0]
    return function2(partition_a, partition_b, n_samples=row_count)

# Column numbers (1-based) of X assigned to each of the two feature groups.
partition = {
    1: [1, 3, 5, 7, 9, 10, 11, 12, 16, 18, 20, 29, 30, 32, 34, 37, 39],
    2: [2, 4, 6, 8, 13, 14, 15, 17, 19, 21, 22, 23, 24, 25, 26, 27, 28,
        31, 33, 35, 36, 38, 40],
}

# Shift to the 0-based positions used for array indexing, then score the
# two groups, letting the elbow search try up to 50 clusters.
group_a_idx, group_b_idx = (
    [col_number - 1 for col_number in partition[key]] for key in (1, 2)
)
MI = function3(X, group_a_idx, group_b_idx, 50)
print(MI)

# Output: 0.11935023874038986
