In [9]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import copy
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.utils import resample
from joblib import Parallel, delayed
import multiprocessing

# Data import and pre-processing: load both yearly logs and keep only the
# physical sensor features used in the analysis.
df1 = pd.read_csv(r"C:\Adithya\IITK\IITK EDU\ML Intern\2016WT6_ALL.csv")
df2 = pd.read_csv(r"C:\Adithya\IITK\IITK EDU\ML Intern\2017WT6_ALL.csv")
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
df2.info()

# NOTE(review): keep_cols is never read in the visible code, and two of its
# entries ('Grd_Prod_Pwr_Avg', 'Prod_LatestAvg_TotActPwr') are dropped below --
# confirm whether this list is still needed.
keep_cols = [
    'Grd_Prod_Pwr_Avg',
    'Prod_LatestAvg_ActPwrGen0',
    'Prod_LatestAvg_ActPwrGen1',
    'Prod_LatestAvg_TotActPwr',
    'Amb_WindDir_Relative_Avg'
]

# Grid-side ('Grd_') power columns to drop. 'Grd_Prod_Pwr_Avg' is excluded from
# this pattern match but is added to the drop list explicitly further down.
grd_power_cols = [col for col in df1.columns
                  if col.startswith('Grd_') and 'Pwr' in col and col != 'Grd_Prod_Pwr_Avg']

# Every column whose name carries a Min / Max / Std statistic is dropped.
min_max_std_cols = [col for col in df1.columns if any(stat in col for stat in ['_Min', '_Max', '_Std'])]

# Nacelle direction and absolute wind direction are dropped as well.
unwanted_wind_dir_cols = ['Amb_WindDir_Abs_Avg', 'Nac_Direction_Avg']

# Remaining columns to drop, listed once each (the original list repeated some
# names; the set() below made that harmless but hard to read).
other_drop_cols = [
    'Prod_LatestAvg_ActPwrGen2',
    'Turbine_ID',
    'Timestamp',
    'Grd_Prod_PsbleInd_Avg',
    'Grd_Prod_PsbleCap_Avg',
    'Prod_LatestAvg_TotActPwr',
    'Prod_LatestAvg_ReactPwrGen0',
    'Prod_LatestAvg_ReactPwrGen1',
    'Prod_LatestAvg_ReactPwrGen2',
    'Prod_LatestAvg_TotReactPwr',
    'Grd_Prod_Pwr_Avg',
    'Amb_WindSpeed_Est_Avg',
]

# Union of everything to drop; set() removes names matched by several rules.
drop_cols = list(set(grd_power_cols + min_max_std_cols + unwanted_wind_dir_cols + other_drop_cols))


# Reduced DataFrames (the same column list is applied to both years).
df1_reduced = df1.drop(columns=drop_cols)
df2_reduced = df2.drop(columns=drop_cols)

# Fill missing Grd_Prod_CosPhi_Avg readings in the 2017 data with the column mean.
df2_reduced['Grd_Prod_CosPhi_Avg'] = df2_reduced['Grd_Prod_CosPhi_Avg'].fillna(df2_reduced['Grd_Prod_CosPhi_Avg'].mean())

# Stack both years row-wise.
df_reduced = pd.concat([df1_reduced, df2_reduced], axis=0)

# Header first, then the report: info() writes directly to stdout and returns
# None, so it cannot be passed to print() as an argument.
print("Basic Feature Information :", '\n')
df_reduced.info()
print()
print("Basic Statistical summary of the sensors :",'\n',df_reduced.describe(),'\n')

#Correlation Matrix for reference
# Pairwise correlation of the reduced sensor table, drawn as a heatmap of
# absolute values and written to disk.
cm = df_reduced.corr()

correlation_heatmap_options = {
    "annot": True,               # write the value inside every cell
    "fmt": ".2f",                # two decimal places
    "cmap": 'viridis',
    "linewidths": 0.5,           # thin borders between cells
    "linecolor": 'gray',
    "cbar": True,                # show the colour bar
    "square": True,              # keep cells square
    "annot_kws": {"size": 10},   # annotation font size
}

plt.figure(figsize=(25, 25))
sns.heatmap(cm.abs(), **correlation_heatmap_options)
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.savefig("Correlation_Matrix.png", dpi=300)
plt.close()


def min_max_scale(data):
    """Rescale each column of ``data`` linearly by its own minimum and range.

    The input is copied into a float NumPy array first, so the caller's
    object is left untouched.  A small epsilon (1e-9) is added to every
    column range so that constant columns map to 0 instead of dividing by
    zero; as a consequence the column maximum maps to just below 1.

    Args:
        data: 2-D array-like (e.g. nested lists, ndarray, DataFrame) that
            can be converted to a float array.

    Returns:
        A float ndarray with the same shape as ``data``.
    """
    values = np.array(data, dtype=float)
    column_low = values.min(axis=0)
    column_span = values.max(axis=0) - column_low + 1e-9  # epsilon avoids 0/0
    return (values - column_low) / column_span


# Min-max scale every column of the merged sensor table. The result is a plain
# float NumPy array, so columns are addressed by position from here on.
Sensor_Dataset = min_max_scale(df_reduced)

def elbow_kmeans(data, max_k=5):
    """Cluster the rows of ``data`` with KMeans, choosing k by the elbow rule.

    KMeans is fitted for every k in ``2..max_k`` and the k at which the
    inertia curve bends most sharply (largest discrete second difference)
    is selected.  A final KMeans with that k assigns each row to a cluster.

    Args:
        data: 2-D array-like accepted by ``KMeans.fit`` (rows are samples).
        max_k: Largest number of clusters to try, inclusive.  Must be >= 2.

    Returns:
        A list with one entry per cluster; each entry is the list of row
        indices assigned to that cluster, in ascending order.

    Raises:
        ValueError: If ``max_k`` is smaller than 2.
    """
    if max_k < 2:
        raise ValueError("max_k must be at least 2")

    k_values = list(range(2, max_k + 1))
    inertias = []
    for k in k_values:
        km = KMeans(n_clusters=k, n_init=5, random_state=42)
        km.fit(data)
        inertias.append(km.inertia_)

    if len(inertias) < 3:
        # A second difference needs three points; with fewer candidates
        # there is no elbow to detect, so use the smallest k.
        best_k = k_values[0]
    else:
        # second_diff[j] = inertias[j] - 2 * inertias[j + 1] + inertias[j + 2]
        # is the curvature at the MIDDLE point, i.e. at k_values[j + 1].
        # (The previous "+ 2" offset assumed a k grid starting at 1 and
        # therefore returned the k just before the elbow.)
        second_diff = np.diff(inertias, n=2)
        best_k = k_values[int(np.argmax(second_diff)) + 1]

    # Final clustering with the selected k.
    km_final = KMeans(n_clusters=best_k, n_init=10, random_state=42)
    labels = km_final.fit_predict(data)

    # Group row indices by the cluster label they received.
    clusters = [[] for _ in range(best_k)]
    for row, label in enumerate(labels):
        clusters[label].append(row)

    return clusters
    

# Function 1: Partition columns, perform KMeans on both sides
def Cluster_Retriever(original_matrix, group_a_idx, group_b_idx, max_k=10):
    """Cluster the samples twice, once per column group.

    Args:
        original_matrix: 2-D array; rows are samples, columns are features.
        group_a_idx: Column indices forming the first feature group.
        group_b_idx: Column indices forming the second feature group.
        max_k: Upper bound on k forwarded to ``elbow_kmeans``.

    Returns:
        ``(cluster_a, cluster_b)`` -- the ``elbow_kmeans`` result for the
        columns of group A and of group B respectively.
    """
    cluster_a, cluster_b = (
        elbow_kmeans(original_matrix[:, columns], max_k)
        for columns in (group_a_idx, group_b_idx)
    )
    return cluster_a, cluster_b

# Function 2: Mutual information-like score H(P) - H(P|Q)
def MI_Calc(partition_P, partition_Q, n_samples):
    """Return H(P) - H(P|Q) for two partitions of the same samples.

    Args:
        partition_P: Iterable of clusters, each a collection of sample ids.
        partition_Q: Iterable of clusters, each a collection of sample ids.
        n_samples: Total number of samples; used to turn counts into
            probabilities.

    Returns:
        The entropy of P minus the conditional entropy of P given Q,
        computed with natural logarithms.
    """
    # Entropy H(P): empty clusters contribute nothing.
    entropy_p = 0.0
    for members in partition_P:
        share = len(members) / n_samples
        if share <= 0:
            continue
        entropy_p += -share * np.log(share)

    # Conditional entropy H(P|Q): weight each Q cluster by its probability
    # and look at how its members spread over the clusters of P.
    entropy_p_given_q = 0.0
    for q_members in partition_Q:
        q_count = len(q_members)
        q_weight = q_count / n_samples
        q_lookup = set(q_members)
        for p_members in partition_P:
            overlap = len(q_lookup.intersection(p_members))
            if overlap == 0:
                continue  # skipping zero overlap also avoids log(0)
            ratio = overlap / q_count
            entropy_p_given_q += -q_weight * ratio * np.log(ratio)

    return entropy_p - entropy_p_given_q

# Function 3: Complete pipeline
def Mutual_Information(Complete_Dataset, group_a_idx, group_b_idx, max_k_to_be_checked=10):
    """Score a split of the columns into two groups.

    Each column group is clustered with ``Cluster_Retriever`` and the two
    resulting sample partitions are compared with ``MI_Calc``.

    Args:
        Complete_Dataset: 2-D array; rows are samples, columns are features.
        group_a_idx: Column indices of the first group.
        group_b_idx: Column indices of the second group.
        max_k_to_be_checked: Upper bound on k tried while clustering.

    Returns:
        The ``MI_Calc`` score of the group-A partition against the
        group-B partition.
    """
    sample_count = Complete_Dataset.shape[0]
    partitions = Cluster_Retriever(
        Complete_Dataset, group_a_idx, group_b_idx, max_k_to_be_checked
    )
    return MI_Calc(*partitions, n_samples=sample_count)

#Initialization Phase
# Pheromone trail for every ordered sensor pair; nothing deposited yet.
# NOTE(review): the 40 is hard-coded -- presumably the number of feature
# columns in df_reduced; confirm.
Pheromone_Matrix = np.zeros((40, 40))

# Uniform weights with a zero diagonal (a sensor is never paired with itself);
# used as the sampling matrix while the initial ants are generated.
Random_Matrix = np.ones((40, 40))
np.fill_diagonal(Random_Matrix, 0)

# Index_Matrix[i][j] is the 1-based, row-major serial number of cell (i, j).
Index_Matrix = np.arange(1, 40 * 40 + 1, dtype=float).reshape(40, 40)

# Nom_dict maps a serial number back to its 1-based (row, column) sensor pair.
Nom_dict = {
    row * 40 + col + 1: (row + 1, col + 1)
    for row in range(40)
    for col in range(40)
}

ants = []
n_ants = 200

#Partitioner to generate 1 random partition Biased/Unbiased
def partitioner(Matrix, Index_Matrix, target=None):
    """Draw one sensor pair from ``Matrix`` and try to add it to a bipartition.

    A cell (row, column) of ``Matrix`` is drawn with probability proportional
    to its value and translated to a 1-based sensor pair through
    ``Index_Matrix`` and the module-level ``Nom_dict``.  The row sensor is
    then placed on side 1 and the column sensor on side 2, unless that would
    put a sensor on both sides, in which case the draw is discarded.  Exactly
    one draw is made per call; callers loop until every sensor is placed.

    Args:
        Matrix: Square NumPy array of non-negative sampling weights.
        Index_Matrix: Array of the same shape whose entries are the keys of
            ``Nom_dict`` for the corresponding cell.
        target: Dict ``{1: list, 2: list}`` to extend in place.  When
            omitted, the module-level ``partition`` dict is used, which is
            what existing two-argument callers rely on.

    Returns:
        None.  The chosen dict is modified in place.

    Raises:
        ValueError: If the weights do not sum to a positive, finite number
            (no cell could be drawn).
    """
    if target is None:
        # Backward-compatible default: fill the module-level dict.
        target = partition

    weights = np.asarray(Matrix, dtype=float)
    n = weights.shape[0]

    # Normalise once; the total used to be recomputed for every cell.
    total = weights.sum()
    if not np.isfinite(total) or total <= 0:
        raise ValueError("partitioner needs weights with a positive, finite sum")
    probabilities = (weights / total).ravel()

    # Cells with zero weight have zero probability and are never drawn.
    flat_index = int(np.random.choice(n * n, p=probabilities))
    row, col = divmod(flat_index, n)
    sensor1, sensor2 = Nom_dict[Index_Matrix[row][col]]

    side1, side2 = target[1], target[2]

    # Reject a draw that would put one sensor on both sides of the split.
    if sensor1 == sensor2 or sensor1 in side2 or sensor2 in side1:
        return

    if sensor1 not in side1:
        side1.append(sensor1)
    if sensor2 not in side2:
        side2.append(sensor2)
            
#Initialization
while True:
    # partitioner() fills the module-level dict named `partition`, so a fresh
    # one has to be bound under exactly this name for every candidate.
    partition = {1: [], 2: []}
    all_sensors_placed = False
    while not all_sensors_placed:
        partitioner(Random_Matrix, Index_Matrix)
        all_sensors_placed = (
            len(partition[1]) + len(partition[2]) == Random_Matrix.shape[0]
        )

    # Canonical form: both sides sorted, and the lexicographically smaller
    # side stored under key 1, so the same split always compares equal.
    part1, part2 = sorted([sorted(partition[1]), sorted(partition[2])])
    normalized_partition = {1: part1, 2: part2}

    # Keep the candidate only if no identical split has been stored yet
    # (stored as a deep copy).
    if normalized_partition not in ants:
        ants.append(copy.deepcopy(normalized_partition))

    if len(ants) == n_ants:
        break

# Number the initial ants from 1 and echo them for inspection.
ants_dict = dict(enumerate(ants, start=1))
print("Initial ants :",'\n')
for ant in ants:
    print(ant, '\n')

# Worker count for joblib: every CPU core reported by the OS, capped at 16.
n_jobs = min(multiprocessing.cpu_count(), 16)

def compute_mi_joblib(partition):
    """Score one ant; written as a standalone function for use with joblib.

    Args:
        partition: Dict ``{1: [...], 2: [...]}`` of 1-based sensor numbers.

    Returns:
        ``(partition, cost)`` where ``cost`` is the ``Mutual_Information``
        score of the two sensor groups on ``Sensor_Dataset`` (k tried up
        to 6).  The partition is returned unchanged.
    """
    # Sensor numbers are 1-based; dataset columns are 0-based.
    columns = {side: [sensor - 1 for sensor in partition[side]] for side in (1, 2)}
    return partition, Mutual_Information(Sensor_Dataset, columns[1], columns[2], 6)


# Run in parallel
mi_results = Parallel(n_jobs=n_jobs)(delayed(compute_mi_joblib)(ant) for ant in ants)

# Deposit each initial ant's score on every cross-partition sensor pair,
# in both directions so the pheromone matrix stays symmetric.
for partition, cost in mi_results:
    side1_rows = np.asarray(partition[1]) - 1  # 1-based sensors -> 0-based cells
    side2_rows = np.asarray(partition[2]) - 1
    Pheromone_Matrix[np.ix_(side1_rows, side2_rows)] += cost
    Pheromone_Matrix[np.ix_(side2_rows, side1_rows)] += cost
print("Pheromone Matrix after initial ant pheromone deposits:",Pheromone_Matrix,'\n')

# Snapshot of the pheromone matrix right after the initial deposits.
init_heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": 'viridis',
    "linewidths": 0.5,
    "linecolor": 'gray',
    "cbar": True,
    "square": True,
    "annot_kws": {"size": 10},
}

plt.figure(figsize=(25, 25))
sns.heatmap(Pheromone_Matrix, **init_heatmap_options)
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.savefig("Pheromone_Matrix_After_Initialization.png", dpi=300)
plt.close()

def create_ant_and_compute_mi(pheromone_matrix, index_matrix, sensor_data):
    """Build one ant from the pheromone trail and score it.

    Sensor pairs are drawn with ``partitioner`` until every sensor has been
    placed on one of the two sides; the resulting split is then scored with
    ``Mutual_Information`` (k tried up to 6).

    Args:
        pheromone_matrix: Square array of sampling weights for sensor pairs.
        index_matrix: Cell-to-serial-number matrix expected by
            ``partitioner``.
        sensor_data: 2-D array; rows are samples, columns are sensors.

    Returns:
        ``(partition, cost)`` -- the dict ``{1: [...], 2: [...]}`` of
        1-based sensor numbers and its score.
    """
    # partitioner() called with two arguments fills the MODULE-LEVEL dict
    # named `partition`.  The previous version created a local dict of the
    # same name here, which partitioner never touched, so the loop below
    # could not terminate.  Rebinding the global makes both functions work
    # on the same dict.
    # NOTE(review): this relies on ants being built one at a time per
    # process -- confirm before switching joblib to a thread backend.
    global partition
    partition = {1: [], 2: []}

    # One entry per sensor is expected once the split is complete.
    n_sensors = pheromone_matrix.shape[0]
    while len(partition[1]) + len(partition[2]) != n_sensors:
        partitioner(pheromone_matrix, index_matrix)

    ant = partition
    # Sensor numbers are 1-based; dataset columns are 0-based.
    cost = Mutual_Information(
        sensor_data,
        [sensor - 1 for sensor in ant[1]],
        [sensor - 1 for sensor in ant[2]],
        6,
    )
    return ant, cost
    
#Running ACO based on the Initialization
iterations = 500
cost_collection = []        # best score seen so far, one entry per iteration
best_ant_ever_found = {}
highest_cost = 0
iteration_to_capture = [50, 100, 150, 200, 250, 300, 350, 400, 450]
max_cost_iteration = []     # best score within each single iteration

# Shared styling for the pheromone snapshots taken inside the loop.
snapshot_heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": 'viridis',
    "linewidths": 0.5,
    "linecolor": 'gray',
    "cbar": True,
    "square": True,
    "annot_kws": {"size": 10},
}

for it in range(1, iterations + 1):
    # Evaporation: every trail keeps 75% of its strength.
    Pheromone_Matrix *= 0.75

    # Build and score n_ants ants in parallel from the current trail.
    results = Parallel(n_jobs=n_jobs)(
        delayed(create_ant_and_compute_mi)(Pheromone_Matrix, Index_Matrix, Sensor_Dataset)
        for _ in range(n_ants)
    )

    ants = [ant for ant, _ in results]
    costs = [score for _, score in results]

    # Deposit: each ant adds its score to every cross-partition sensor pair,
    # in both directions.
    for partition, cost in results:
        side1_rows = np.asarray(partition[1]) - 1
        side2_rows = np.asarray(partition[2]) - 1
        Pheromone_Matrix[np.ix_(side1_rows, side2_rows)] += cost
        Pheromone_Matrix[np.ix_(side2_rows, side1_rows)] += cost

    # Track the best ant of this iteration and the best ant overall.
    max_cost = max(costs)
    if highest_cost < max_cost:
        highest_cost, best_ant_ever_found = max_cost, ants[costs.index(max_cost)]

    max_cost_iteration.append(max_cost)
    cost_collection.append(highest_cost)

    # Both convergence plots are rewritten on every iteration so progress
    # can be checked while the run is still going.
    convergence_plots = (
        (cost_collection, "Best So Far", "Cost Function",
         "Cost Function vs Iteration (Convergence Plot 1)", "Convergence_Plot_1.png"),
        (max_cost_iteration, "Max Cost per Iter", "Max Cost This Iteration",
         "Max Cost per Iteration (Convergence Plot 2)", "Convergence_Plot_2.png"),
    )
    for series, legend_label, y_label, plot_title, file_name in convergence_plots:
        plt.figure(figsize=(30, 30))
        plt.plot(range(1, len(series) + 1), series, label=legend_label)
        plt.xlabel("Iteration")
        plt.ylabel(y_label)
        plt.title(plot_title)
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.savefig(file_name, dpi=150)
        plt.close()

    # Pheromone snapshot at the selected iterations only.
    if it in iteration_to_capture:
        plt.figure(figsize=(25, 25))
        sns.heatmap(Pheromone_Matrix, **snapshot_heatmap_options)
        plt.xticks(fontsize=10, rotation=90)
        plt.yticks(fontsize=10)
        plt.tight_layout()
        plt.savefig(f"pheromone_matrix_iter_{it}.png", dpi=300)
        plt.close()
        
# Final pheromone matrix as a heatmap, followed by the textual summary.
final_heatmap_options = {
    "annot": True,
    "fmt": ".2f",
    "cmap": 'viridis',
    "linewidths": 0.5,
    "linecolor": 'gray',
    "cbar": True,
    "square": True,
    "annot_kws": {"size": 10},
}

plt.figure(figsize=(25, 25))
sns.heatmap(Pheromone_Matrix, **final_heatmap_options)
plt.xticks(fontsize=10, rotation=90)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.savefig("Pheromone_Matrix_Final.png", dpi=300)
plt.close()

print("Latest Pheromone Matrix after all the running :", Pheromone_Matrix)
print(
    "Best Solution: ", '\n',
    best_ant_ever_found, '\n',
    "Corresponding Cost Function:", max(cost_collection), '\n',
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52346 entries, 0 to 52345
Data columns (total 83 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Turbine_ID                   52346 non-null  object 
 1   Timestamp                    52346 non-null  object 
 2   Gen_RPM_Max                  52346 non-null  float64
 3   Gen_RPM_Min                  52346 non-null  float64
 4   Gen_RPM_Avg                  52346 non-null  float64
 5   Gen_RPM_Std                  52346 non-null  float64
 6   Gen_Bear_Temp_Avg            52346 non-null  int64  
 7   Gen_Phase1_Temp_Avg          52346 non-null  int64  
 8   Gen_Phase2_Temp_Avg          52346 non-null  int64  
 9   Gen_Phase3_Temp_Avg          52346 non-null  int64  
 10  Hyd_Oil_Temp_Avg             52346 non-null  int64  
 11  Gear_Oil_Temp_Avg            52346 non-null  int64  
 12  Gear_Bear_Temp_Avg           52346 non-null  int64  
 13  Nac_Temp_Avg    

KeyboardInterrupt: 