In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm
import gc

## UI 1: Drive distribution comparation

In [2]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm

# Set default plot style
sns.set(style='whitegrid')

# Initial date range pickers for data loading
start_date_picker = widgets.DatePicker(
    description='Start Date',
    disabled=False,
    value=datetime(2024, 4, 1)
)

end_date_picker = widgets.DatePicker(
    description='End Date',
    disabled=False,
    value=datetime(2024, 4, 30)
)

# Button to load data
load_data_button = widgets.Button(
    description='Load Data',
    disabled=False,
    button_style='',
    tooltip='Click to load data',
    icon='download'
)

# Two date selectors for comparison
date_selector1 = widgets.DatePicker(
    description='Date 1',
    disabled=True  # Initially disabled until data is loaded
)

date_selector2 = widgets.DatePicker(
    description='Date 2',
    disabled=True  # Initially disabled until data is loaded
)

# Button to compare data for selected dates
compare_button = widgets.Button(
    description='Compare Dates',
    disabled=True,  # Initially disabled until data is loaded
    button_style='',
    tooltip='Click to compare data for the selected dates',
    icon='bar-chart'
)

# Output widgets to display messages, plots, and differences
output_widget = widgets.Output()
plots_output = widgets.Output()
difference_output = widgets.Output()

# Global variable to store loaded data
data_df = pd.DataFrame()

# Define the columns to load
columns_to_load = ['model', 'serial_number', 'failure']

# Function to load data for the initial date range
def load_data(b):
    global data_df
    with output_widget:
        clear_output(wait=True)
        
        # Get the selected start and end dates
        start_date = start_date_picker.value
        end_date = end_date_picker.value
        
        # Input validation
        if None in [start_date, end_date]:
            print("Please select both start and end dates.")
            return
        if start_date > end_date:
            print("Start date must be before or equal to end date.")
            return
        
        # Convert dates to datetime objects
        start_date_dt = pd.to_datetime(start_date)
        end_date_dt = pd.to_datetime(end_date)
        
        # Load data for the date range
        print("Loading data for the selected date range...")
        data_list = []
        date_range = pd.date_range(start_date_dt, end_date_dt)
        for current_date in tqdm(date_range, desc='Loading Data'):
            date_str = current_date.strftime('%Y-%m-%d')
            file_name = f"{date_str}.csv"  
            file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)

            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, usecols=columns_to_load)
                    df['date'] = current_date
                    data_list.append(df)
                except Exception as e:
                    print(f"Error loading {file_name}: {e}")
            else:
                print(f"File not found for date: {date_str}")
        if data_list:
            data_df = pd.concat(data_list, ignore_index=True)
            # Enable date selectors and compare button
            date_selector1.disabled = False
            date_selector2.disabled = False
            compare_button.disabled = False
            # Set the range of date selectors to the loaded data dates
            min_date = data_df['date'].min().date()
            max_date = data_df['date'].max().date()
            date_selector1.min = min_date
            date_selector1.max = max_date
            date_selector2.min = min_date
            date_selector2.max = max_date
            date_selector1.value = min_date
            date_selector2.value = min_date
            print("Data loaded successfully. Please select two dates to compare.")
        else:
            data_df = pd.DataFrame()
            print("No data found for the selected date range.")
            # Disable date selectors and compare button
            date_selector1.disabled = True
            date_selector2.disabled = True
            compare_button.disabled = True

# Function to compare data for the selected dates
def compare_dates(b):
    with output_widget:
        clear_output(wait=True)
        
        if data_df.empty:
            print("No data loaded. Please load data first.")
            return
        
        # Get the selected dates
        selected_date1 = date_selector1.value
        selected_date2 = date_selector2.value
        
        if None in [selected_date1, selected_date2]:
            print("Please select both dates to compare.")
            return
        
        # Convert to datetime
        selected_date1_dt = pd.to_datetime(selected_date1)
        selected_date2_dt = pd.to_datetime(selected_date2)
        
        # Check if selected dates are within the loaded data
        if selected_date1_dt not in data_df['date'].unique() or selected_date2_dt not in data_df['date'].unique():
            print("One or both selected dates are outside the loaded data range.")
            return
        
        # Filter data for the selected dates
        data_date1 = data_df[data_df['date'] == selected_date1_dt]
        data_date2 = data_df[data_df['date'] == selected_date2_dt]
        
        # Process data for Date 1
        drive_counts1 = data_date1.groupby('model')['serial_number'].nunique().reset_index()
        drive_counts1.columns = ['model', 'drive_count']
        
        # Process data for Date 2
        drive_counts2 = data_date2.groupby('model')['serial_number'].nunique().reset_index()
        drive_counts2.columns = ['model', 'drive_count']
        
        # Get the list of all models from both dates
        all_models = pd.Series(list(set(drive_counts1['model']).union(set(drive_counts2['model']))))
        
        # Merge the counts data
        merged_counts = pd.merge(all_models.to_frame('model'), drive_counts1, on='model', how='left')
        merged_counts = pd.merge(merged_counts, drive_counts2, on='model', how='left', suffixes=('_date1', '_date2'))
        merged_counts.fillna(0, inplace=True)
        merged_counts[['drive_count_date1', 'drive_count_date2']] = merged_counts[['drive_count_date1', 'drive_count_date2']].astype(int)
        
        # Calculate differences
        merged_counts['difference'] = merged_counts['drive_count_date2'] - merged_counts['drive_count_date1']
        
        # Sort the models for consistent plotting
        merged_counts.sort_values('model', inplace=True)
        
        # Plot the bar plots
        with plots_output:
            clear_output(wait=True)
            fig, axes = plt.subplots(1, 2, figsize=(16, 6), sharey=True)
            
            sns.barplot(ax=axes[0], x='model', y='drive_count_date1', data=merged_counts)
            axes[0].set_title(f'Number of Drives per Model\n{selected_date1_dt.strftime("%Y-%m-%d")}')
            axes[0].set_xlabel('Model')
            axes[0].set_ylabel('Number of Drives')
            axes[0].tick_params(axis='x', rotation=90)
            # Add numbers above bars
            for index, row in merged_counts.iterrows():
                axes[0].text(index, row['drive_count_date1'], row['drive_count_date1'], color='black', ha="center")
            
            sns.barplot(ax=axes[1], x='model', y='drive_count_date2', data=merged_counts)
            axes[1].set_title(f'Number of Drives per Model\n{selected_date2_dt.strftime("%Y-%m-%d")}')
            axes[1].set_xlabel('Model')
            axes[1].set_ylabel('')
            axes[1].tick_params(axis='x', rotation=90)
            # Add numbers above bars
            for index, row in merged_counts.iterrows():
                axes[1].text(index, row['drive_count_date2'], row['drive_count_date2'], color='black', ha="center")
            
            plt.tight_layout()
            plt.show()
        
        # Display the differences
        with difference_output:
            clear_output(wait=True)
            print("Differences in Number of Drives per Model:")
            for index, row in merged_counts.iterrows():
                difference = row['difference']
                model = row['model']
                if difference > 0:
                    print(f"Model {model}: +{difference} drives")
                elif difference < 0:
                    print(f"Model {model}: {difference} drives")
                # else:
                #     print(f"Model {model}: No change")

# Link the buttons to their functions
load_data_button.on_click(load_data)
compare_button.on_click(compare_dates)

# Arrange the widgets in the UI
ui = widgets.VBox([
    widgets.HTML(value="<b>Select Initial Date Range to Load Data:</b>"),
    widgets.HBox([start_date_picker, end_date_picker]),
    load_data_button,
    output_widget,
    widgets.HTML(value="<b>Select Two Dates to Compare (within the loaded data):</b>"),
    widgets.HBox([date_selector1, date_selector2]),
    compare_button,
    plots_output,
    difference_output
])

# Display the UI
display(ui)


VBox(children=(HTML(value='<b>Select Initial Date Range to Load Data:</b>'), HBox(children=(DatePicker(value=d…

## UI 2: Input distribution difference 

In [3]:
# First date range pickers
start_date_picker1 = widgets.DatePicker(
    description='Start Date 1',
    disabled=False,
    value=datetime(2024, 1, 1)
)

end_date_picker1 = widgets.DatePicker(
    description='End Date 1',
    disabled=False,
    value=datetime(2024, 1, 29)
)

# Second date range pickers
start_date_picker2 = widgets.DatePicker(
    description='Start Date 2',
    disabled=False,
    value=datetime(2024, 2, 1)
)

end_date_picker2 = widgets.DatePicker(
    description='End Date 2',
    disabled=False,
    value=datetime(2024, 2, 29)
)

# Update button (we can keep the same button)
load_data_button = widgets.Button(
    description='Load Data',
    disabled=False,
    button_style='',  
    tooltip='Click to load data',
    icon='download'  # (Optional) FontAwesome icon
)

# Dropdown widget to select a model
model_selector = widgets.Dropdown(
    options=[],
    description='Model:',
    disabled=True,
)

# Button to generate comparison graphs
generate_graphs_button = widgets.Button(
    description='Generate Graphs',
    disabled=True,
    button_style='',
    tooltip='Click to generate comparison graphs',
    icon='bar-chart'  # (Optional) FontAwesome icon
)

# Button to show data used for plotting
show_data_button = widgets.Button(
    description='Show Data',
    disabled=True,
    button_style='',
    tooltip='Click to display data used for plotting',
    icon='table'  # (Optional) FontAwesome icon
)

# Multi-select widget for column selection
column_selector = widgets.SelectMultiple(
    options=[],  # This will be populated once the data is loaded
    description='Columns:',
    disabled=True,
    layout=widgets.Layout(width='50%')
)

In [4]:
# Create an output widget to display the output
output_widget = widgets.Output()

# Output widget to display data
data_output = widgets.Output()

# Output widget to display graphs
graphs_output = widgets.Output()


In [5]:
def load_data(b):
    with output_widget:
        clear_output(wait=True)
        
        # Get the selected dates for both date ranges
        start_date1 = start_date_picker1.value
        end_date1 = end_date_picker1.value
        start_date2 = start_date_picker2.value
        end_date2 = end_date_picker2.value
        
        # Input validation for date ranges
        if None in [start_date1, end_date1, start_date2, end_date2]:
            print("Please select all start and end dates.")
            return
        if start_date1 > end_date1 or start_date2 > end_date2:
            print("Start dates must be before or equal to end dates.")
            return
        
        # Initialize empty lists to store data
        data_list1 = []
        data_list2 = []

        # Define the Columns to Load
        columns_to_load = [
        'date', 'model', 'serial_number',
        'smart_5_normalized', 'smart_5_raw', 'smart_187_normalized', 'smart_187_raw', 'smart_188_normalized', 'smart_188_raw',
        'smart_197_normalized','smart_197_raw', 'smart_198_normalized','smart_198_raw', 'capacity_bytes', 'failure'
        ]
        
        # Define a function to load data for a date range
        def load_data_for_date_range(start_date_dt, end_date_dt, data_list, date_range_label):
            date_range = pd.date_range(start_date_dt, end_date_dt)
            for current_date in tqdm(date_range, desc=f'Loading Data for {date_range_label}'):
                date_str = current_date.strftime('%Y-%m-%d')
                file_name = f"{date_str}.csv"  
                file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)

                if os.path.exists(file_path):
                    try:
                        df = pd.read_csv(file_path, usecols=columns_to_load)
                        df = df[df['failure'] == 1]  # Filter for failed drives
                        if not df.empty:
                            df['date'] = date_str
                            data_list.append(df)
                    except Exception as e:
                        print(f"Error loading {file_name}: {e}")
                else:
                    print(f"File not found for date: {date_str}")
        
        # Convert dates to datetime objects
        start_date_dt1 = pd.to_datetime(start_date1)
        end_date_dt1 = pd.to_datetime(end_date1)
        start_date_dt2 = pd.to_datetime(start_date2)
        end_date_dt2 = pd.to_datetime(end_date2)

        # Load data for both date ranges
        load_data_for_date_range(start_date_dt1, end_date_dt1, data_list1, 'Date Range 1')
        load_data_for_date_range(start_date_dt2, end_date_dt2, data_list2, 'Date Range 2')

        if not data_list1 and not data_list2:
            print("No failed drives found for the selected date ranges.")
            return

        # Concatenate data for both date ranges
        global data_df1, data_df2
        data_df1 = pd.concat(data_list1, ignore_index=True) if data_list1 else pd.DataFrame()
        data_df2 = pd.concat(data_list2, ignore_index=True) if data_list2 else pd.DataFrame()

        # Get the list of models available in both datasets
        models1 = data_df1['model'].unique()
        models2 = data_df2['model'].unique()
        available_models = sorted(list(set(models1) | set(models2)))

        if available_models:
            # Assign data_df1 and data_df2 to all_failed_drives_df1 and all_failed_drives_df2
            all_failed_drives_df1 = data_df1.copy()
            all_failed_drives_df2 = data_df2.copy()
                        
            # Group by model and count the number of failed drives
            failed_drives_by_model_1 = all_failed_drives_df1.groupby('model')['serial_number'].nunique()
            failed_drives_by_model_2 = all_failed_drives_df2.groupby('model')['serial_number'].nunique()
            print("Failed Drives Grouped by Model for date range 1:")
            display(failed_drives_by_model_1)
            print("Failed Drives Grouped by Model for date range 2:")
            display(failed_drives_by_model_2)

            # Update the column selector with available columns from the data
            common_columns = sorted(list(set(data_df1.columns) & set(data_df2.columns)))
            column_selector.options = common_columns  # Set available columns
            column_selector.disabled = False  # Enable column selector
            
            # Update the model selector options
            model_selector.options = available_models
            model_selector.disabled = False
            generate_graphs_button.disabled = False
            show_data_button.disabled = False
            
            print("Data loaded successfully. Please select a model and click 'Generate Graphs'.")
        else:
            print("No models found in the data.")
            model_selector.options = []
            model_selector.disabled = True
            generate_graphs_button.disabled = True
            show_data_button.disabled = True
        


In [6]:
def show_data(b):
    with data_output:
        clear_output(wait=True)
        selected_model = model_selector.value
        selected_columns = list(column_selector.value)  # Get the selected columns
        if not selected_model:
            print("Please select a model to display data.")
            return
        
        if not selected_columns:
            print("Please select at least one column to display.")
            return
        
        # Use the already loaded data
        # Filter data for the selected model
        model_data1 = data_df1[data_df1['model'] == selected_model]
        model_data2 = data_df2[data_df2['model'] == selected_model]
        
        if model_data1.empty and model_data2.empty:
            print(f"No data found for model {selected_model} in both date ranges.")
            return
        
        # Display data
        print(f"Data for model {selected_model} in Date Range 1:")
        display(model_data1[selected_columns] if not model_data1.empty else pd.DataFrame())
        print(f"Data for model {selected_model} in Date Range 2:")
        display(model_data2[selected_columns] if not model_data2.empty else pd.DataFrame())



In [7]:
def generate_kde_plots(b):
    """
    Generates and displays overlaid Kernel Density Estimate (KDE) plots for 
    each SMART attribute for the selected model in the two date ranges.
    """
    with graphs_output:
        clear_output(wait=True)
        selected_model = model_selector.value
        if not selected_model:
            print("Please select a model.")
            return
        
        # Filter data for the selected model
        model_data1 = data_df1[data_df1['model'] == selected_model]
        model_data2 = data_df2[data_df2['model'] == selected_model]

        if model_data1.empty and model_data2.empty:
            print(f"No failed drives found for model {selected_model} in both date ranges.")
            return
        
        # SMART attributes to compare
        attributes = [
            'smart_5_normalized', 'smart_5_raw', 'smart_187_normalized', 'smart_187_raw',
            'smart_188_normalized', 'smart_188_raw', 'smart_197_normalized', 'smart_197_raw', 
            'smart_198_normalized', 'smart_198_raw'
        ]
        
        # Plot overlaid KDE for each attribute
        for attr in attributes:
            # Check if the attribute exists in both datasets and is not completely NaN
            attr_in_data1 = attr in model_data1.columns and not model_data1[attr].isnull().all()
            attr_in_data2 = attr in model_data2.columns and not model_data2[attr].isnull().all()

            if not attr_in_data1 and not attr_in_data2:
                print(f"Attribute {attr} is not available in both date ranges. Skipping.")
                continue  # Skip this attribute if it's not available in both datasets

            # Prepare data for plotting
            data_to_plot = []
            labels = []
            
            if attr_in_data1:
                data_to_plot.append(model_data1[attr].dropna())
                labels.append('Date Range 1')
                
            if attr_in_data2:
                data_to_plot.append(model_data2[attr].dropna())
                labels.append('Date Range 2')

            if data_to_plot:
                # Overlaid KDE plot
                plt.figure(figsize=(10, 6))
                
                for data, label in zip(data_to_plot, labels):
                    sns.kdeplot(data, label=label, fill=True)  # Updated 'fill=True' instead of 'shade=True'
                
                plt.title(f'Overlaid KDE for {attr} ({selected_model})')
                plt.xlabel(attr)
                plt.ylabel('Density')
                plt.legend()
                plt.tight_layout()
                plt.show()


In [8]:
# Link the Show Data button to the function
show_data_button.on_click(show_data)

# Link the button to the function
load_data_button.on_click(load_data)

# Link the KDE plot generation to a new button or the existing graph generation button
generate_graphs_button.on_click(generate_kde_plots)

In [9]:
# UI layout
ui = widgets.VBox([
    widgets.HBox([start_date_picker1, end_date_picker1]),
    widgets.HBox([start_date_picker2, end_date_picker2]),
    load_data_button,
    output_widget,
    model_selector,
    column_selector,  # Added column selector here
    widgets.HBox([generate_graphs_button, show_data_button]),
    graphs_output,
    data_output
])

In [10]:
display(ui)

VBox(children=(HBox(children=(DatePicker(value=datetime.datetime(2024, 1, 1, 0, 0), description='Start Date 1'…

In [11]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from datetime import datetime, timedelta
import ipywidgets as widgets
from IPython.display import display, clear_output
from tqdm.notebook import tqdm
import gc
from sklearn.feature_selection import mutual_info_regression

# Date range pickers
start_date_picker = widgets.DatePicker(
    description='Start Date',
    disabled=False,
    value=datetime(2024, 1, 1)
)

end_date_picker = widgets.DatePicker(
    description='End Date',
    disabled=False,
    value=datetime(2024, 6, 30)
)

# Button to load data
load_data_button = widgets.Button(
    description='Load Data',
    disabled=False,
    button_style='',
    tooltip='Click to load data',
    icon='download'
)

# Output widget to display messages
output_widget = widgets.Output()

# Multi-select widget to select SMART attributes
smart_attributes_list = [
    'smart_5_normalized', 'smart_5_raw',
    'smart_187_normalized', 'smart_187_raw',
    'smart_188_normalized', 'smart_188_raw',
    'smart_197_normalized', 'smart_197_raw',
    'smart_198_normalized', 'smart_198_raw',
    'capacity_bytes'
]

attribute_selector = widgets.SelectMultiple(
    options=smart_attributes_list,
    description='SMART Attributes:',
    disabled=True  # Initially disabled
)

# Button to perform correlation analysis
correlation_button = widgets.Button(
    description='Correlation Analysis',
    disabled=True,
    button_style='',
    tooltip='Click to perform correlation analysis',
    icon='chart-line'
)

# Output widget to display correlation results
correlation_output = widgets.Output()

def load_data(b):
    with output_widget:
        clear_output(wait=True)
        
        # Get the selected dates for the analysis
        start_date = start_date_picker.value
        end_date = end_date_picker.value
        
        # Input validation for date ranges
        if None in [start_date, end_date]:
            print("Please select start and end dates.")
            return
        if start_date > end_date:
            print("Start date must be before or equal to end date.")
            return
        
        # Initialize empty list to store data
        data_list = []

        # Define the Columns to Load
        columns_to_load = [
            'date', 'model', 'serial_number', 'failure',
            'smart_5_normalized', 'smart_5_raw',
            'smart_187_normalized', 'smart_187_raw',
            'smart_188_normalized', 'smart_188_raw',
            'smart_197_normalized', 'smart_197_raw',
            'smart_198_normalized', 'smart_198_raw',
            'capacity_bytes'
        ]
        
        # Load data for the date range
        date_range = pd.date_range(start_date, end_date)
        for current_date in tqdm(date_range, desc='Loading Data'):
            date_str = current_date.strftime('%Y-%m-%d')
            file_name = f"{date_str}.csv"  
            file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)

            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, usecols=columns_to_load)
                    df['date'] = pd.to_datetime(df['date'])
                    data_list.append(df)
                except Exception as e:
                    print(f"Error loading {file_name}: {e}")
            else:
                print(f"File not found for date: {date_str}")
        
        if not data_list:
            print("No data found for the selected date range.")
            return
        
        # Concatenate data
        global data_df
        data_df = pd.concat(data_list, ignore_index=True)
        
        # Clean up to free memory
        del data_list
        gc.collect()
        
        # Identify failed drives and their failure dates
        failure_data = data_df[data_df['failure'] == 1]
        if failure_data.empty:
            print("No failed drives found in the data.")
            return
        
        failed_drives = failure_data[['serial_number', 'date']].drop_duplicates()
        
        # Get data for failed drives
        data_failed_drives = data_df[data_df['serial_number'].isin(failed_drives['serial_number'])]
        data_failed_drives = data_failed_drives.merge(failed_drives, on='serial_number', suffixes=('', '_failure'))
        
        # Ensure 'date' and 'date_failure' are datetime
        data_failed_drives['date'] = pd.to_datetime(data_failed_drives['date'])
        data_failed_drives['date_failure'] = pd.to_datetime(data_failed_drives['date_failure'])
        
        # Calculate 'days till failure'
        data_failed_drives['days_till_failure'] = (data_failed_drives['date_failure'] - data_failed_drives['date']).dt.days
        
        # Filter to last 30 days before failure
        global data_last_30_days
        data_last_30_days = data_failed_drives[(data_failed_drives['days_till_failure'] >= 0) & (data_failed_drives['days_till_failure'] <= 30)]
        
        if data_last_30_days.empty:
            print("No data available for failed drives in the last 30 days before failure.")
            return
        
        # Clean up to free memory
        del data_df, data_failed_drives
        gc.collect()
        
        # Enable the correlation analysis button and attribute selector
        correlation_button.disabled = False
        attribute_selector.disabled = False
        print("Data loaded successfully. Please select SMART attributes and click 'Correlation Analysis'.")

def perform_correlation_analysis(b):
    with correlation_output:
        clear_output(wait=True)
        
        # Get selected SMART attributes
        selected_attributes = list(attribute_selector.value)
        
        if not selected_attributes:
            print("Please select at least one SMART attribute.")
            return
        
        if data_last_30_days.empty:
            print("No data available for failed drives in the last 30 days before failure.")
            return
        
        # Prepare data for correlation
        correlation_columns = ['days_till_failure'] + selected_attributes
        correlation_data = data_last_30_days[correlation_columns].dropna()
        
        if correlation_data.empty:
            print("No data available for correlation analysis after removing missing values.")
            return
        
        # Calculate Spearman correlation
        spearman_corr = correlation_data.corr(method='spearman')['days_till_failure'][selected_attributes]
        
        # Calculate Mutual Information
        X = correlation_data[selected_attributes]
        y = correlation_data['days_till_failure']
        mi = mutual_info_regression(X, y, discrete_features=False)
        mi_series = pd.Series(mi, index=selected_attributes)
        
        # Display correlations
        print("Spearman Correlation between 'Days till Failure' and SMART Attributes:")
        display(spearman_corr)
        
        print("\nMutual Information between 'Days till Failure' and SMART Attributes:")
        display(mi_series)
        
        # Plotting
        plt.figure(figsize=(12, 5))
        plt.subplot(1, 2, 1)
        spearman_corr.plot(kind='bar')
        plt.title("Spearman Correlation")
        plt.ylabel("Correlation Coefficient")
        plt.xlabel("SMART Attributes")
        
        plt.subplot(1, 2, 2)
        mi_series.plot(kind='bar', color='orange')
        plt.title("Mutual Information")
        plt.ylabel("Mutual Information")
        plt.xlabel("SMART Attributes")
        
        plt.tight_layout()
        plt.show()
        
        # Plot scatter plots
        for attr in selected_attributes:
            plt.figure(figsize=(6, 4))
            sns.scatterplot(data=correlation_data, x='days_till_failure', y=attr)
            plt.title(f'{attr} vs Days till Failure')
            plt.xlabel('Days till Failure')
            plt.ylabel(attr)
            plt.show()

# Link the buttons to the functions
load_data_button.on_click(load_data)
correlation_button.on_click(perform_correlation_analysis)

# Arrange the widgets in the UI
ui = widgets.VBox([
    widgets.HBox([start_date_picker, end_date_picker]),
    load_data_button,
    output_widget,
    attribute_selector,
    correlation_button,
    correlation_output
])

# Display the UI
display(ui)


VBox(children=(HBox(children=(DatePicker(value=datetime.datetime(2024, 1, 1, 0, 0), description='Start Date', …

## UI 4 Compare SMART Value changes within a single drive

In [12]:
# Date picker widgets for selecting start and end dates
start_date_picker = widgets.DatePicker(
    description='Start Date',
    disabled=False,
    value=datetime(2024, 4, 1)
)

end_date_picker = widgets.DatePicker(
    description='End Date',
    disabled=False,
    value=datetime(2024, 4, 30)
)

# Dropdown widget to select a model
model_selector = widgets.Dropdown(
    options=[],
    description='Model:',
    disabled=True,
)

# Slider to select duration (10 to 30 days)
duration_slider = widgets.IntSlider(
    value=10,
    min=10,
    max=30,
    step=1,
    description='Duration (days):',
    disabled=True,
)

# Button to load data based on the selected date range
load_data_button = widgets.Button(
    description='Load Data',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to load data',
    icon='download'  # (Optional) FontAwesome icon
)

# Dropdown widget to select a drive (serial number)
drive_selector = widgets.Dropdown(
    options=[],
    description='Drive (SN):',
    disabled=True,
)

# Multi-select widget to select SMART attributes
smart_attributes_list = ['smart_5_normalized', 'smart_187_normalized', 'smart_188_normalized', 'smart_197_normalized', 'smart_198_normalized', 'smart_5_raw', 'smart_187_raw', 'smart_188_raw', 'smart_197_raw', 'smart_198_raw']
smart_attribute_selector = widgets.SelectMultiple(
    options=smart_attributes_list,
    value=[smart_attributes_list[0]],
    description='SMART Attributes:',
    disabled=True,
)

# Button to generate graphs
generate_graphs_button = widgets.Button(
    description='Generate Graphs',
    disabled=False,
    button_style='',
    tooltip='Click to generate graphs',
    icon='line-chart'  # FontAwesome icon 4.7 
)

# Button to check drive info
check_info_button = widgets.Button(
    description='Check Information',
    disabled=False,
    button_style='',
    tooltip='Click to check information',
    icon='info-circle'  # FontAwesome icon
)

# Output widget to display messages and lists
output = widgets.Output()

def load_data(b):
    with output:
        clear_output(wait=True)
        start_date = start_date_picker.value
        end_date = end_date_picker.value
        
        if start_date is None or end_date is None:
            print("Please select both start and end dates.")
            return
        
        if start_date > end_date:
            print("Start date must be before or equal to end date.")
            return
        
        # Initialize an empty list to store failed drives information
        failed_drives_list = []
        
        # Convert dates to datetime objects
        start_date_dt = pd.to_datetime(start_date)
        end_date_dt = pd.to_datetime(end_date)
        
        # Generate a list of dates
        date_range = pd.date_range(start_date_dt, end_date_dt)
        
        # Load data files and collect failed drives
        for current_date in tqdm(date_range, desc='Loading Data'):
            date_str = current_date.strftime('%Y-%m-%d')
            file_name = f"{date_str}.csv"
            file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name) 
            
            if os.path.exists(file_path):
                df = pd.read_csv(file_path)
                failed_drives = df[df['failure'] == 1]
                if not failed_drives.empty:
                    failed_drives = failed_drives.copy()
                    failed_drives['date'] = date_str
                    failed_drives_list.append(failed_drives)
            else:
                print(f"File not found for date: {date_str}")
        
        if failed_drives_list:
            # Concatenate all failed drives dataframes into one
            global all_failed_drives_df
            all_failed_drives_df = pd.concat(failed_drives_list, ignore_index=True)
            
            # Group by model and count the number of failed drives
            failed_drives_by_model = all_failed_drives_df.groupby('model')['serial_number'].nunique()
            print("Failed Drives Grouped by Model:")
            display(failed_drives_by_model)
            
            # Update the model selection options
            model_selector.options = failed_drives_by_model.index.tolist()
            
            # Enable the model selector and duration slider
            model_selector.disabled = False
            duration_slider.disabled = False
        else:
            print("No failed drives found in the selected date range.")
            model_selector.options = []
            model_selector.disabled = True
            duration_slider.disabled = True

load_data_button.on_click(load_data)

def update_drive_list(change):
    with output:
        # Get the selected model
        selected_model = model_selector.value
        if selected_model:
            # Filter the DataFrame for the selected model
            model_failed_drives = all_failed_drives_df[all_failed_drives_df['model'] == selected_model]
            # Get the unique serial numbers
            serial_numbers = model_failed_drives['serial_number'].unique()
            # Update the drive selector options
            drive_selector.options = serial_numbers.tolist()
            drive_selector.disabled = False
            smart_attribute_selector.disabled = False
        else:
            drive_selector.options = []
            drive_selector.disabled = True
            smart_attribute_selector.disabled = True

model_selector.observe(update_drive_list, names='value')

graphs_output = widgets.Output()

infos_output = widgets.Output()

def generate_graphs(b):
    with graphs_output:
        clear_output(wait=True)
        selected_model = model_selector.value
        selected_drive = drive_selector.value
        selected_attributes = list(smart_attribute_selector.value)
        duration_days = duration_slider.value
        
        if not selected_model or not selected_drive or not selected_attributes:
            print("Please select a model, drive, and at least one SMART attribute.")
            return
        
        # Get the failure date of the selected drive
        failure_data = all_failed_drives_df[all_failed_drives_df['serial_number'] == selected_drive]
        if not failure_data.empty:
            end_date_dt = pd.to_datetime(failure_data['date'].max())
        else:
            print("No failure information found for the selected drive.")
            return
        
        # Calculate the date range based on the selected duration
        start_date_dt = end_date_dt - timedelta(days=duration_days - 1)
        
        # Generate date range
        date_range = pd.date_range(start_date_dt, end_date_dt)
        
        # Load data for the selected drive over the date range
        drive_data = []
        for current_date in tqdm(date_range, desc='Loading Data'):
            date_str = current_date.strftime('%Y-%m-%d')
            file_name = f"{date_str}.csv"  
            file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)  
            
            if os.path.exists(file_path):
                df = pd.read_csv(file_path)
                # Filter for the selected drive
                drive_row = df[df['serial_number'] == selected_drive]
                if not drive_row.empty:
                    drive_row = drive_row.copy()
                    drive_row['date'] = date_str
                    drive_data.append(drive_row)
            else:
                print(f"File not found for date: {date_str}")
        
        if drive_data:
            drive_df = pd.concat(drive_data, ignore_index=True)
            drive_df['date'] = pd.to_datetime(drive_df['date'])
            drive_df = drive_df.sort_values('date')
            
            # Plot the selected SMART attributes
            for attr in selected_attributes:
                if attr in drive_df.columns:
                    plt.figure(figsize=(12, 6))
                    plt.plot(drive_df['date'], drive_df[attr], marker='o')
                    plt.xlabel('Date')
                    plt.ylabel(attr)
                    plt.title(f'{attr} Over Time for Drive {selected_drive}')
                    plt.grid(True)
                    plt.show()
                else:
                    print(f"Attribute {attr} not found in data.")
        else:
            print("No data found for the selected drive over the specified duration.")

def check_info(b):
    with infos_output:
        clear_output(wait=True)
        selected_model = model_selector.value
        selected_drive = drive_selector.value
                
        if not selected_model or not selected_drive:
            print("Please select a drive")
            return
        failure_data = all_failed_drives_df[all_failed_drives_df['serial_number'] == selected_drive]
        if not failure_data.empty:
            failure_dates = failure_data['date'].unique()
            print(f"Drive {selected_drive} failed on the following date: {failure_dates}")
        else:
            print("No failure information found for the selected drive.")

generate_graphs_button.on_click(generate_graphs)

check_info_button.on_click(check_info)

# Arrange the widgets in the UI
ui = widgets.VBox([
    widgets.HBox([start_date_picker, end_date_picker, load_data_button]),
    output,
    widgets.HBox([model_selector, duration_slider]),
    widgets.HBox([drive_selector, smart_attribute_selector]),  
    widgets.HBox([check_info_button, generate_graphs_button]),
    infos_output,
    graphs_output
])

display(ui)


VBox(children=(HBox(children=(DatePicker(value=datetime.datetime(2024, 4, 1, 0, 0), description='Start Date', …

In [13]:
# Import necessary libraries
import pandas as pd
import os
from datetime import datetime
import ipywidgets as widgets
from IPython.display import display, clear_output

# Date picker widget for selecting a date
date_picker = widgets.DatePicker(
    description='Select Date',
    disabled=False,
    value=datetime(2024, 1, 1)  # Default date
)

# Button to load data and check missing values
check_missing_button = widgets.Button(
    description='Check Missing Values',
    disabled=False,
    button_style='',
    tooltip='Click to load data and check missing values',
    icon='search'  # Optional: FontAwesome icon
)

# Output widget to display messages and results
output_widget = widgets.Output()

def check_missing_values(b):
    with output_widget:
        clear_output(wait=True)
        
        # Get the selected date
        selected_date = date_picker.value
        
        # Input validation
        if selected_date is None:
            print("Please select a date.")
            return
        
        # Convert date to string format
        date_str = selected_date.strftime('%Y-%m-%d')
        
        # Define the file path
        file_name = f"{date_str}.csv"
        file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)
        
        # Check if file exists
        if not os.path.exists(file_path):
            print(f"File not found for date: {date_str}")
            return
        
        # Load the data
        try:
            df = pd.read_csv(file_path)
        except Exception as e:
            print(f"Error loading {file_name}: {e}")
            return
        
        # Check missing values for each column
        missing_values = df.isnull().sum()
        
        # Filter columns with missing values
        columns_with_missing = missing_values[missing_values > 0]
        
        if columns_with_missing.empty:
            print(f"No missing values found in the data for {date_str}.")
        else:
            # Sort the columns by the number of missing values from low to high
            columns_with_missing_sorted = columns_with_missing.sort_values()
            print(f"Missing values for {date_str} (from low to high):")
            for col, num_missing in columns_with_missing_sorted.items():
                print(f"Column '{col}': {num_missing} missing values")

# Link the button to the function
check_missing_button.on_click(check_missing_values)

# Arrange the widgets in the UI
ui = widgets.VBox([
    date_picker,
    check_missing_button,
    output_widget
])

# Display the UI
display(ui)


VBox(children=(DatePicker(value=datetime.datetime(2024, 1, 1, 0, 0), description='Select Date', step=1), Butto…

In [14]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import os
import ipywidgets as widgets
from IPython.display import display, clear_output
from datetime import datetime, timedelta

# Ensure inline plotting
%matplotlib inline

# Date pickers for selecting start and end dates
start_date_picker = widgets.DatePicker(
    description='Start Date',
    disabled=False,
    value=datetime(2024, 1, 1)
)

end_date_picker = widgets.DatePicker(
    description='End Date',
    disabled=False,
    value=datetime(2024, 1, 10)
)

# Button to load data
load_data_button = widgets.Button(
    description='Load Data',
    disabled=False,
    button_style='',
    tooltip='Click to load data',
    icon='download'
)

# Output widget to display messages
output_widget = widgets.Output()

# Initialize date_slider with a placeholder date
placeholder_date = datetime(2024, 1, 1).date()
date_slider = widgets.SelectionSlider(
    options=[placeholder_date],
    description='Select Date:',
    disabled=True,  # Initially disabled until data is loaded
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    layout=widgets.Layout(width='80%')
)

# Output widget to display the pie chart and statistics
chart_output = widgets.Output()

# Global variable to store loaded data
data_df = pd.DataFrame()

# Function to load data
def load_data(b):
    global data_df
    with output_widget:
        clear_output(wait=True)
        
        # Get the selected start and end dates
        start_date = start_date_picker.value
        end_date = end_date_picker.value
        
        # Input validation
        if None in [start_date, end_date]:
            print("Please select both start and end dates.")
            return
        if start_date > end_date:
            print("Start date must be before or equal to end date.")
            return
        
        # Convert dates to datetime objects
        start_date_dt = pd.to_datetime(start_date)
        end_date_dt = pd.to_datetime(end_date)
        
        # Load data for the date range
        print("Loading data for the selected date range...")
        data_list = []
        date_range = pd.date_range(start_date_dt, end_date_dt)
        for current_date in date_range:
            date_str = current_date.strftime('%Y-%m-%d')
            file_name = f"{date_str}.csv"
            file_path = os.path.join('D:/Backblaze_Data/data_Q2_2024/', file_name)
            
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path, usecols=['date', 'model', 'serial_number'])
                    df['date'] = pd.to_datetime(df['date'])
                    data_list.append(df)
                except Exception as e:
                    print(f"Error loading {file_name}: {e}")
            else:
                print(f"File not found for date: {date_str}")
        if data_list:
            data_df = pd.concat(data_list, ignore_index=True)
            # Enable the date slider
            available_dates = sorted(data_df['date'].dt.date.unique())
            if available_dates:
                date_slider.options = available_dates
                date_slider.value = available_dates[0]
                date_slider.disabled = False
                print(f"Data loaded successfully for {len(available_dates)} dates.")
            else:
                print("No available dates found in the data.")
                date_slider.disabled = True
        else:
            data_df = pd.DataFrame()
            print("No data found for the selected date range.")
            # Disable the date slider
            date_slider.disabled = True

# Function to update the pie chart and statistics
def update_pie_chart(change):
    with chart_output:
        clear_output(wait=True)
        
        if data_df.empty:
            print("No data loaded.")
            return
        
        selected_date = date_slider.value
        if selected_date is None:
            print("Please select a date.")
            return
        
        # Filter data for the selected date
        selected_date_dt = pd.to_datetime(selected_date)
        data_on_date = data_df[data_df['date'].dt.date == selected_date_dt.date()]
        
        if data_on_date.empty:
            print(f"No data available for {selected_date_dt.strftime('%Y-%m-%d')}.")
            return
        
        # Calculate model distribution
        model_counts = data_on_date['model'].value_counts()
        total_counts = model_counts.sum()
        percentages = (model_counts / total_counts) * 100
        
        # Function to conditionally label slices
        def autopct_func(pct):
            return ('%1.1f%%' % pct) if pct >= 3 else ''
        
        # Generate labels for the pie chart
        labels = [model if percentage >= 3 else '' for model, percentage in zip(model_counts.index, percentages)]
        
        # Generate labels for the legend with percentages
        legend_labels = [f'{model}: {percentage:.1f}%' for model, percentage in zip(model_counts.index, percentages)]
        
        # Plot pie chart
        plt.figure(figsize=(8, 6))
        wedges, texts, autotexts = plt.pie(model_counts, labels=labels, autopct=autopct_func, startangle=140)
        plt.title(f"Model Distribution on {selected_date_dt.strftime('%Y-%m-%d')}")
        plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
        plt.legend(wedges, legend_labels, title="Models", loc='center left', bbox_to_anchor=(1.05, 0.5))
        plt.tight_layout()
        plt.show()
        
        # Display total number of drives and models
        total_drives = data_on_date['serial_number'].nunique()
        total_models = data_on_date['model'].nunique()
        print(f"Total number of drives: {total_drives}")
        print(f"Total number of models: {total_models}")


# Link the load data button to its function
load_data_button.on_click(load_data)

# Link the date slider to the update function
date_slider.observe(update_pie_chart, names='value')

# Arrange the widgets in the UI
ui = widgets.VBox([
    widgets.HTML(value="<b>Select Date Range to Load Data:</b>"),
    widgets.HBox([start_date_picker, end_date_picker, load_data_button]),
    output_widget,
    widgets.HTML(value="<b>Select Date to View Model Distribution:</b>"),
    date_slider,
    chart_output
])

# Display the UI
display(ui)


VBox(children=(HTML(value='<b>Select Date Range to Load Data:</b>'), HBox(children=(DatePicker(value=datetime.…