In [1]:
import seaborn as sns
import numpy as np
import pandas as pd
from scipy.stats import chi2
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from mlxtend.frequent_patterns import apriori, association_rules

def remove_outliers_chi_squared(df, significance_level=0.05):
    """Drop rows whose squared Mahalanobis distance exceeds a chi-squared cutoff.

    Each row's squared Mahalanobis distance from the column means is compared
    with the ``1 - significance_level`` quantile of a chi-squared distribution
    that has one degree of freedom per column; rows above the cutoff are removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table, one observation per row.
    significance_level : float, default 0.05
        Upper-tail probability used to derive the cutoff.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that are not flagged as outliers, with their
        original index labels and columns.
    """
    # The sample covariance is undefined for fewer than two observations,
    # so there is nothing meaningful to flag.
    if df.shape[0] < 2:
        return df.copy()

    # Calculate the chi-squared critical value (one degree of freedom per column)
    chi_squared_critical_value = chi2.ppf(1 - significance_level, df.shape[1])

    values = df.to_numpy(dtype=float)
    centered = values - df.mean(axis=0).to_numpy(dtype=float)

    # np.cov returns a 0-d array for a single column; force a square matrix.
    cov = np.atleast_2d(np.cov(values, rowvar=False))
    try:
        inv_covmat = np.linalg.inv(cov)
    except np.linalg.LinAlgError:
        # Singular covariance (e.g. duplicated or perfectly collinear columns):
        # fall back to the Moore-Penrose pseudo-inverse instead of crashing.
        inv_covmat = np.linalg.pinv(cov)

    # Row-wise quadratic form (x - mean)^T * inv_cov * (x - mean), computed
    # for every row in a single vectorised call.
    mahalanobis_distances = np.einsum('ij,jk,ik->i', centered, inv_covmat, centered)

    # Identify and remove outliers
    outliers = mahalanobis_distances > chi_squared_critical_value
    cleaned_df = df.loc[~outliers]

    return cleaned_df

In [2]:
def normalize_data(df):
    """Rescale every column of ``df`` to the range [0, 100] with min-max scaling.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to rescale.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same column names and the same index labels as
        ``df``, holding the scaled values.
    """
    scaler = MinMaxScaler(feature_range=(0, 100))
    normalized_data = scaler.fit_transform(df)
    # fit_transform returns a bare array; re-attach the index as well as the
    # columns so rows stay traceable to the input (e.g. after rows were dropped).
    normalized_df = pd.DataFrame(normalized_data, columns=df.columns, index=df.index)
    return normalized_df

# normalize_data is applied to the outlier-free dataset in the pipeline cell further down.


In [3]:
def discretize_data_with_lower_bounds(df, bins=20):
    """Replace every value by the lower bound of the equal-width bin it falls in.

    For each column, ``bins`` equal-width intervals are laid out between the
    column minimum and maximum. Intervals are closed on the right, and the
    first one also includes the minimum.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to discretise.
    bins : int, default 20
        Number of equal-width bins per column.

    Returns
    -------
    pandas.DataFrame
        Float-valued frame with the same columns and index as ``df``.
        A constant column is returned unchanged (as float), since its single
        value is the lower bound of its only possible bin.
    """
    discretized = {}
    for col in df.columns:
        column = df[col]
        lowest, highest = column.min(), column.max()

        if lowest == highest:
            # All edges (and labels) would coincide and pd.cut would raise
            # ValueError; the value itself is the only sensible lower bound.
            discretized[col] = column.astype(float)
            continue

        # Get the bin edges
        bin_edges = np.linspace(lowest, highest, bins + 1)

        # Assign each value to the lower bound of its bin
        bin_labels = bin_edges[:-1]
        binned = pd.cut(column, bins=bin_edges, labels=bin_labels, include_lowest=True)

        # Categorical labels -> plain floats
        discretized[col] = binned.astype(float)

    return pd.DataFrame(discretized)

In [4]:
def binarize_columns(df):
    """One-hot encode every column of ``df`` on its integer-rounded values.

    Each column is rounded and cast to ``int``. For every distinct rounded
    value (in ascending order) an indicator column named ``<column>_<value>``
    is produced, holding 1 where the row has that value and 0 elsewhere.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table to encode.

    Returns
    -------
    pandas.DataFrame
        Integer 0/1 indicator frame, columns grouped by source column.
    """
    # Integer view of every source column, keyed by column name.
    as_integers = {name: df[name].round().astype(int) for name in df.columns}

    # One indicator series per (column, distinct value) pair.
    indicators = {
        f'{name}_{level}': series.eq(level).astype(int)
        for name, series in as_integers.items()
        for level in sorted(series.unique())
    }

    return pd.DataFrame(indicators)

In [5]:
# Load the feature table from data/BatteryFeatures.csv (path is relative to the
# kernel's working directory).
data = pd.read_csv('data/BatteryFeatures.csv')

In [6]:
# Preprocessing pipeline; each stage consumes the previous stage's output.
# 1. Drop rows flagged as multivariate outliers (Mahalanobis distance vs. chi-squared cutoff).
cleaned_data = remove_outliers_chi_squared(data)
# 2. Min-max scale every column to [0, 100].
normalized_data = normalize_data(cleaned_data)
# 3. Snap every value to the lower bound of one of 20 equal-width bins per column.
discretized_data = discretize_data_with_lower_bounds(normalized_data,bins = 20)
# 4. One-hot encode each (column, rounded bin value) pair as a 0/1 indicator column.
binarized_data = binarize_columns(discretized_data)

In [7]:
# Mine association rules from the one-hot encoded feature table.
target_data = binarized_data

# apriori is designed for boolean indicators; 0/1 integers still work but are
# flagged by mlxtend as deprecated and slower. Cast only for the call so that
# target_data itself keeps its integer dtype.
frequent_itemsets = apriori(target_data.astype(bool), min_support=0.05, use_colnames=True)

# Keep every rule whose confidence is at least 0.1.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1)

# Rank by lift and show the ten strongest rules.
top_10_rules = rules.sort_values(by='lift', ascending=False).head(10)
print(top_10_rules)

                                antecedents  \
251                         (deltaQ_var_35)   
246  (capFadeCycle2Slope_25, deltaQ_min_45)   
26                          (deltaQ_min_45)   
250                         (deltaQ_min_45)   
247  (capFadeCycle2Slope_25, deltaQ_var_35)   
27                          (deltaQ_var_35)   
463       (cycle_life_25, avgChargeTime_30)   
460    (capFadeCycle2Slope_25, tempIntT_30)   
286         (capFadeCycle2Slope_25, qd2_65)   
291             (capFadeCycle2Intercept_75)   

                                consequents  antecedent support  \
251  (capFadeCycle2Slope_25, deltaQ_min_45)            0.073394   
246                         (deltaQ_var_35)            0.064220   
26                          (deltaQ_var_35)            0.091743   
250  (capFadeCycle2Slope_25, deltaQ_var_35)            0.091743   
247                         (deltaQ_min_45)            0.064220   
27                          (deltaQ_min_45)            0.073394   
463    (capFa

