In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the raw CSV (resolved relative to the notebook's working directory) into `data`.
dataset_path = "sample_interview_dataset.csv"
data = pd.read_csv(dataset_path)

In [None]:
# Impute missing rotational-speed readings with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: the chained in-place form acts
# on an intermediate object, triggers a FutureWarning in pandas 2.x and leaves
# `data` untouched once Copy-on-Write is enabled.
median_speed = data["Rotational speed [rpm]"].median()
data["Rotational speed [rpm]"] = data["Rotational speed [rpm]"].fillna(median_speed)

# Persist the imputed frame (without the index) so it can be reused outside the notebook.
data.to_csv('clean.csv', index=False)


In [None]:
# Print the column names, dtypes and non-null counts of `data`.
data.info()

In [None]:
# Distinct labels found in the "Failure Type" column.
unique_failure_types = data["Failure Type"].unique()
print(unique_failure_types)

# failure_type_dataframes maps every label to an independent copy of the rows
# carrying that label.  The loop variables are kept at notebook scope because
# `failure_type_df` is displayed by a later cell.
failure_type_dataframes = {}
for failure_type in unique_failure_types:
    failure_type_df = data.loc[data["Failure Type"] == failure_type].copy()
    failure_type_dataframes[failure_type] = failure_type_df

# Named handles for the individual failure types.  Each lookup raises KeyError
# if the label does not occur in the dataset.
(
    power_failure,
    Error,
    Tool_Wear_Failure,
    Overstrain_Failure,
    Random_Failures,
    Heat_Dissipation_Failure,
) = (
    failure_type_dataframes[label]
    for label in (
        "Power Failure",
        "Error",
        "Tool Wear Failure",
        "Overstrain Failure",
        "Random Failures",
        "Heat Dissipation Failure",
    )
)


In [None]:
import pandas as pd

# Distinct machine types and failure types, in order of first appearance.
unique_machine_types = data['Machine Type'].unique()
unique_failure_types = data['Failure Type'].unique()

# Contingency table: rows = machine types, columns = failure types, values =
# number of observations.  pd.crosstab counts everything in a single pass and
# yields integer counts (the previous cell-by-cell fill produced an
# object-dtype frame).  The reindex restores first-appearance ordering and
# fills combinations that never occur with 0; rename_axis drops the axis names
# crosstab adds so the frame looks the same as before to downstream cells.
failure_table_counts = (
    pd.crosstab(data['Machine Type'], data['Failure Type'])
    .reindex(index=unique_machine_types, columns=unique_failure_types, fill_value=0)
    .rename_axis(index=None, columns=None)
)

# Preview the first row of the table
failure_table_counts.head(1)


In [None]:
import matplotlib.pyplot as plt

# Failure types shown in the pie chart ("No Failure" and "Error" are left out).
labels = ['Power Failure', 'Tool Wear Failure',
          'Overstrain Failure', 'Random Failures', 'Heat Dissipation Failure']

# Row count per failure type, looked up by label in `failure_type_dataframes`.
# Deriving the counts from the labels keeps the two lists aligned, and avoids
# the module-level `power_failure` name, which later cells rebind to a plain
# dict (re-running this cell afterwards would otherwise count dict keys).
counts = [len(failure_type_dataframes[label]) for label in labels]

# Create the pie chart
plt.figure(figsize=(10, 8))
plt.pie(counts, labels=labels, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Proportions of Different Error Types', pad=20)  # Add padding to the title

# Place the legend to the right of the pie and reserve room for it
plt.legend(labels, loc="center left", bbox_to_anchor=(1, 0.5), title="Error Types", title_fontsize='medium')
plt.subplots_adjust(right=0.7)

plt.show()


In [None]:
# Coerce every column of the count table to a numeric dtype; values that cannot
# be parsed become NaN.
failure_table_counts_numeric = failure_table_counts.apply(pd.to_numeric, errors='coerce')

# Report how many entries per column failed the conversion.
missing_per_column = failure_table_counts_numeric.isna().sum()
print(missing_per_column)

# Annotated heatmap of machine type (rows) versus failure type (columns).
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='YlGnBu',
    linewidths=0.5,
    linecolor='black',
    cbar=True,
)
plt.figure(figsize=(10, 6))
heatmap_ax = sns.heatmap(failure_table_counts_numeric, **heatmap_options)
heatmap_ax.set_xlabel('Failure Types', fontsize=12)
heatmap_ax.set_ylabel('Machine Types', fontsize=12)
heatmap_ax.set_title('Distribution of Failure Counts', fontsize=14)
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)
plt.tight_layout()
plt.show()


In [None]:
# Share of each real failure type per machine type.  The "No Failure" and
# "Error" columns are dropped before normalising, so every row sums to 100 %.
# astype(float) guarantees numeric data for the plot even when the count table
# was built with object dtype.
failure_table_counts_without_error = failure_table_counts.drop(columns=['No Failure', 'Error']).astype(float)
failure_table_percentages = failure_table_counts_without_error.div(failure_table_counts_without_error.sum(axis=1), axis=0) * 100

# Percentage stacked bar chart.  DataFrame.plot opens its own figure, so the
# size is passed via `figsize`; a preceding plt.figure() call would only leave
# an empty extra figure behind.
ax = failure_table_percentages.plot(kind='bar', stacked=True, cmap="YlGnBu", figsize=(10, 6))
ax.set_title('Percentage of Failure Types for Each Machine Type (Excluding "No Failure" and "Error")')
ax.set_xlabel('Machine Type')
ax.set_ylabel('Percentage')
ax.legend(title='Failure Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()


***MACHINE AVAILABILITY INDEX***

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Machine availability index per machine type:
#   (total operating time - downtime) / total operating time * 100
# where downtime is approximated as one hour per recorded failure.

# Parse timestamps so time differences can be computed
data['Timestamp'] = pd.to_datetime(data['Timestamp'])

# Time elapsed since the previous observation of the same machine type.
# NOTE(review): this assumes rows are in chronological order within each
# machine type; unsorted data would produce negative deltas -- confirm.
data['Time Delta'] = data.groupby('Machine Type')['Timestamp'].diff()

# The first observation of each machine type has no predecessor; count it as zero elapsed time
data['Time Delta'] = data['Time Delta'].fillna(pd.Timedelta(seconds=0))

# Total operating time per machine type
total_operating_time = data.groupby('Machine Type')['Time Delta'].sum()

# Rows whose failure type is neither 'No Failure' nor 'Error'
failures_df = data[~data['Failure Type'].isin(['No Failure', 'Error'])]

# Failure count per machine type.  Reindexing onto the operating-time index
# gives machine types without any failure an explicit 0 instead of dropping
# them (which would surface as NaN after index alignment below).
failure_counts = (
    failures_df.groupby('Machine Type').size()
    .reindex(total_operating_time.index, fill_value=0)
)

# Convert failure counts to downtime (assuming 1 failure = 1 hour).
# pd.to_timedelta is used because astype('timedelta64[h]') on integers is
# rejected by pandas 2.x (only s/ms/us/ns resolutions are supported).
failure_counts_hours = pd.to_timedelta(failure_counts, unit='h')

# Availability in percent
machine_availability_index = (total_operating_time - failure_counts_hours) / total_operating_time * 100

# 0/0 (no operating time and no failures) yields NaN; treat that as 100 % available
machine_availability_index = machine_availability_index.fillna(100)

# Plot the bar chart
plt.figure(figsize=(10, 6))
bars = plt.bar(machine_availability_index.index, machine_availability_index, color='skyblue')

# Annotate each bar with its value, placed slightly above the bar top
for bar in bars:
    height = bar.get_height()
    plt.gca().text(bar.get_x() + bar.get_width() / 2, height * 1.01, f'{height:.2f}%', ha='center', color='black', fontsize=8)

plt.title('Machine Availability Index by Machine Type')
plt.xlabel('Machine Type')
plt.ylabel('Availability Index (%)')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


In [None]:
# Total operating time per machine type expressed in hours.  This value was
# previously referenced without ever being defined (NameError); it is derived
# here from the timedelta totals in `total_operating_time`.
total_operating_time_hours = total_operating_time.dt.total_seconds() / 3600

# Failure rate (failures per hour) for each machine type.  Machine types with
# no recorded operating time get NaN rather than a division by zero.
failure_rate = failure_counts / total_operating_time_hours.where(total_operating_time_hours > 0)

# Print the failure rate for each machine type
print("Failure Rate (failures per hour) by Machine Type:")
print(failure_rate)


ADDING MORE FEATURES

In [None]:
import pandas as pd

# Work on a copy so the feature columns below do not end up on `data`.
original_data = data.copy()
original_data['Timestamp'] = pd.to_datetime(original_data['Timestamp'])

# Hours elapsed since the previous observation of the same machine type
# (NaN for the first observation of each machine type).
elapsed_since_previous = original_data.groupby('Machine Type')['Timestamp'].diff()
original_data['Total Operating Time'] = elapsed_since_previous.dt.total_seconds() / 3600

# 1 when the observation carries one of the listed failure types, else 0.
specified_failures = ['Power Failure', 'Tool Wear Failure', 'Overstrain Failure', 'Random Failures', 'Heat Dissipation Failure']
is_specified_failure = original_data['Failure Type'].isin(specified_failures)
original_data['Total Failures'] = is_specified_failure.astype(int)

# Per-observation availability index and failure rate, both relative to the
# elapsed hours computed above.
operating_hours = original_data['Total Operating Time']
failure_flags = original_data['Total Failures']
original_data['Machine Availability Index (%)'] = ((operating_hours - failure_flags) / operating_hours) * 100
original_data['Failure Rate (failures per hour)'] = failure_flags / operating_hours

# Keep only the observations flagged as one of the listed failure types and
# preview the first five.
failures_df = original_data[is_specified_failure]
failures_df.head(5)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Make sure the timestamps are datetimes, then split them into calendar parts.
original_data['Timestamp'] = pd.to_datetime(original_data['Timestamp'])
timestamp_parts = original_data['Timestamp'].dt
original_data['Year'] = timestamp_parts.year
original_data['Month'] = timestamp_parts.month
original_data['Day'] = timestamp_parts.day

# Subsets used by the two figures below.
year_2002_data = original_data[original_data['Year'].eq(2002)]
years_2001_2003_data = original_data[original_data['Year'].isin([2001, 2003])]

availability_column = 'Machine Availability Index (%)'

# Figure 1: availability by month for 2002.
plt.figure(figsize=(10, 6))
sns.lineplot(data=year_2002_data, x='Month', y=availability_column, ci=None)
plt.title('Availability Rates Over Time (Year 2002)')
plt.xlabel('Month')
plt.ylabel('Availability Rate (%)')
plt.tight_layout()  # keeps labels from overlapping
plt.show()

# Figure 2: availability by day of month for 2001 and 2003, one line per year.
plt.figure(figsize=(10, 6))
sns.lineplot(data=years_2001_2003_data, x='Day', y=availability_column, hue='Year', ci=None)
plt.title('Availability Rates Over Time (Years 2001 & 2003)')
plt.xlabel('Day')
plt.ylabel('Availability Rate (%)')
plt.legend(title='Year', loc='upper right')
plt.tight_layout()  # keeps labels from overlapping
plt.show()


**VISUALS**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Failure types plotted together, keyed by the figure title suffix
grouped_failure_types = {
    'Tool Wear and Overstrain Failure': ['Tool Wear Failure', 'Overstrain Failure'],
    'Power Failure and Random Failure': ['Power Failure', 'Random Failures'],
    'Heat Dissipation Failure': ['Heat Dissipation Failure']
}

# Filter the data for the year 2002.  The 'Year', 'Month' and
# 'Failure Rate (failures per hour)' columns are created on `original_data`
# (a copy of `data`) by earlier cells, so filtering `data` raised KeyError.
year_2002_data = original_data[original_data['Year'] == 2002]

# One figure per group: monthly failure rate, one line per failure type
for group_name, failure_types in grouped_failure_types.items():
    # Observations belonging to the current group of failure types
    group_data = year_2002_data[year_2002_data['Failure Type'].isin(failure_types)]

    # Plot time series
    plt.figure(figsize=(10, 6))  # Set the figure size
    sns.lineplot(x='Month', y='Failure Rate (failures per hour)', hue='Failure Type', data=group_data, ci=None)

    # Set plot title and labels
    plt.title(f'Failure Rates Over Time - {group_name}')
    plt.xlabel('Month')
    plt.ylabel('Failure Rate (failures per hour)')

    # Add legend
    plt.legend(title='Failure Type', loc='upper right')

    # Show plot
    plt.tight_layout()  # Adjust layout to prevent overlapping labels
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Ensure datetimes and (re)derive the calendar columns.
original_data['Timestamp'] = pd.to_datetime(original_data['Timestamp'])
for calendar_column, datetime_attribute in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    original_data[calendar_column] = getattr(original_data['Timestamp'].dt, datetime_attribute)

# Hours since the previous observation of the same machine type.
hours_since_previous = original_data.groupby('Machine Type')['Timestamp'].diff().dt.total_seconds() / 3600
original_data['Total Operating Time'] = hours_since_previous

# Every failure type other than 'No Failure' counts as one failure here
# (this includes 'Error').  The failure-rate column is overwritten accordingly.
original_data['Number of Failures'] = original_data['Failure Type'].ne('No Failure').astype(int)
original_data['Failure Rate (failures per hour)'] = original_data['Number of Failures'] / original_data['Total Operating Time']

# Per-year subsets.
year_2002_data, year_2001_data, year_2003_data = (
    original_data[original_data['Year'] == year] for year in (2002, 2001, 2003)
)

# One bar chart per year: 2002 by month, 2001 and 2003 by day of month.
rate_column = 'Failure Rate (failures per hour)'
plot_specs = (
    (year_2002_data, 'Month', 'Months', 2002),
    (year_2001_data, 'Day', 'Days', 2001),
    (year_2003_data, 'Day', 'Days', 2003),
)
for year_frame, x_column, x_plural, plotted_year in plot_specs:
    plt.figure(figsize=(10, 6))
    sns.barplot(data=year_frame, x=x_column, y=rate_column, ci=None)
    plt.title(f'Failure Rate Over {x_plural} (Year {plotted_year})')
    plt.xlabel(x_column)
    plt.ylabel(rate_column)
    plt.tight_layout()
    plt.show()


In [None]:
# Count the total occurrences of each unique machine type
# (number of rows in `data` per "Machine Type" value); the bare expression on
# the last line displays the resulting Series.
total_machine_type_counts = data["Machine Type"].value_counts()
total_machine_type_counts

In [None]:


# Number of observations per machine type whose failure type is anything other
# than "No Failure" (note: this includes "Error").  Reindexing onto the full
# machine-type index gives machine types without failures an explicit 0, so
# they report 0 % below instead of NaN from index alignment.
failure_counts = (
    data.loc[data["Failure Type"] != "No Failure", "Machine Type"]
    .value_counts()
    .reindex(total_machine_type_counts.index, fill_value=0)
)

# Share of failing observations per machine type, in percent
failure_percentages = (failure_counts / total_machine_type_counts) * 100

print("Percentage of failures for each unique Machine Type:")
print(failure_percentages)



In [None]:
# Display `failure_type_df`: the leftover loop variable from the cell that
# splits `data` by failure type, i.e. the rows for the last failure type
# visited by that loop.
failure_type_df

In [None]:
# Failure types whose observations are pooled below.
# NOTE(review): this list contains "Error" but not "Power Failure", unlike the
# failure lists used elsewhere in the notebook -- confirm this is intended.
failure_types = ["Error", "Tool Wear Failure", "Overstrain Failure", "Random Failures", "Heat Dissipation Failure"]

# Fetch the per-type frames and stack them row-wise into a single frame.
failure_dfs = [failure_type_dataframes[failure_type] for failure_type in failure_types]
result_df = pd.concat(failure_dfs)

# Machine types that appear in the pooled rows.
unique_machine_types = result_df["Machine Type"].unique()
print("Unique Machine Types:", unique_machine_types, sep="\n")

# How often each machine type appears in the pooled rows.
machine_type_counts = result_df["Machine Type"].value_counts()
print("Number of occurrences for each unique Machine Type:", machine_type_counts, sep="\n")




In [None]:
import pandas as pd

# Sensor / process columns included in the correlation analysis.
selected_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# Restrict `data` to those columns and compute their pairwise correlations
# (DataFrame.corr with its default settings).
selected_data = data.loc[:, selected_columns]
correlation_matrix = selected_data.corr()

print(correlation_matrix)


In [None]:
import pandas as pd

# Columns included in the correlation analysis.
selected_columns = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# Split the observations by the value of the 'Target' column.
target_values = data['Target']
data_target_0 = data[target_values == 0]
data_target_1 = data[target_values == 1]

# Pairwise correlations of the selected columns within each subset.
correlation_matrix_target_0 = data_target_0[selected_columns].corr()
correlation_matrix_target_1 = data_target_1[selected_columns].corr()

print("Correlation Matrix for Target = 0:", correlation_matrix_target_0, sep="\n")
print("\nCorrelation Matrix for Target = 1:", correlation_matrix_target_1, sep="\n")


In [None]:
import pandas as pd

# Mean feature differences between each failure type and the reference rows
# ('No Failure' + 'Error') for Machine Type 'x_1'.
machine_x1_data = data[data['Machine Type'] == 'x_1']

# Group data by 'Failure Type'
grouped_by_failure = machine_x1_data.groupby('Failure Type')

# Stack the rows of the reference groups.  pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.  get_group raises
# KeyError if a reference group is absent for this machine type.
reference_groups = ['No Failure', 'Error']
reference_data = pd.concat([grouped_by_failure.get_group(group) for group in reference_groups])

# Selecting the relevant features
features_of_interest = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# The reference mean does not depend on the failure type, so compute it once
reference_means = reference_data[features_of_interest].mean()

# failure type -> (mean of the failure rows - mean of the reference rows), per feature
failure_differences = {
    failure_type: failure_data[features_of_interest].mean() - reference_means
    for failure_type, failure_data in grouped_by_failure
    if failure_type not in reference_groups
}

# Display the differences for each failure type
for failure_type, differences in failure_differences.items():
    print(f"Differences for {failure_type}:")
    print(differences)
    print()  # Add a blank line for readability


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences per failure type, keyed by machine parameter.
# NOTE(review): these literals are not computed in this cell; presumably they
# were copied from an earlier printed result -- confirm they are up to date.
parameters = ('Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]')
new_mean_diff_by_failure_type1 = {
    name: dict(zip(parameters, values))
    for name, values in (
        ('Heat Dissipation Failure', (-205.548213, 13.796367, -1.282956)),
        ('Overstrain Failure', (-185.759971, 17.399563, 101.382189)),
        ('Power Failure', (244.579661, 12.251734, -6.371468)),
        ('Random Failures', (-55.188365, 29.356973, 33.272600)),
        ('Tool Wear Failure', (26.353454, -7.054300, 107.952600)),
    )
}

# Failure types drawn together on one figure, keyed by the title suffix.
grouped_failure_types = {
    'Tool Wear Failure and Overstrain Failure': ['Tool Wear Failure', 'Overstrain Failure'],
    'Power Failure and Random Failures': ['Power Failure', 'Random Failures'],
    'Heat Dissipation Failure': ['Heat Dissipation Failure'],
}

# One figure per group; one line per failure type across the parameters.
for group_name, failure_types in grouped_failure_types.items():
    plt.figure(figsize=(12, 8))
    for failure_type in failure_types:
        mean_diff_data = new_mean_diff_by_failure_type1[failure_type]
        plt.plot(list(mean_diff_data), list(mean_diff_data.values()), label=failure_type)
    plt.xlabel('Machine Parameter')
    plt.ylabel('Mean Difference')
    plt.title(f'Mean Differences of Machine Parameters for x_1 {group_name}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# Mean feature differences between each failure type and the reference rows
# ('No Failure' + 'Error') for Machine Type 'x1'.
machine_x1_data = data[data['Machine Type'] == 'x1']

# Group data by 'Failure Type'
grouped_by_failure = machine_x1_data.groupby('Failure Type')

# Stack the rows of the reference groups.  pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.  get_group raises
# KeyError if a reference group is absent for this machine type.
reference_groups = ['No Failure', 'Error']
reference_data = pd.concat([grouped_by_failure.get_group(group) for group in reference_groups])

# Selecting the relevant features
features_of_interest = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# The reference mean does not depend on the failure type, so compute it once
reference_means = reference_data[features_of_interest].mean()

# failure type -> (mean of the failure rows - mean of the reference rows), per feature
failure_differences = {
    failure_type: failure_data[features_of_interest].mean() - reference_means
    for failure_type, failure_data in grouped_by_failure
    if failure_type not in reference_groups
}

# Display the differences for each failure type
for failure_type, differences in failure_differences.items():
    print(f"Differences for {failure_type}:")
    print(differences)
    print()  # Add a blank line for readability


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences per failure type, keyed by machine parameter.
# NOTE(review): these literals are not computed in this cell; presumably they
# were copied from an earlier printed result -- confirm they are up to date.
parameters = ('Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]')
new_mean_diff_by_failure_type2 = {
    name: dict(zip(parameters, values))
    for name, values in (
        ('Heat Dissipation Failure', (-191.252161, 13.308665, 6.729819)),
        ('Overstrain Failure', (-234.085494, 19.116998, 93.913152)),
        ('Power Failure', (225.091925, 7.703288, -8.482009)),
        ('Random Failures', (5.414506, -6.608002, 6.663152)),
        ('Tool Wear Failure', (70.843077, -6.815145, 111.234581)),
    )
}

# Failure types drawn together on one figure, keyed by the title suffix.
grouped_failure_types = {
    'Tool Wear Failure and Overstrain Failure': ['Tool Wear Failure', 'Overstrain Failure'],
    'Power Failure and Random Failures': ['Power Failure', 'Random Failures'],
    'Heat Dissipation Failure': ['Heat Dissipation Failure'],
}

# One figure per group; one line per failure type across the parameters.
for group_name, failure_types in grouped_failure_types.items():
    plt.figure(figsize=(12, 8))
    for failure_type in failure_types:
        mean_diff_data = new_mean_diff_by_failure_type2[failure_type]
        plt.plot(list(mean_diff_data), list(mean_diff_data.values()), label=failure_type)
    plt.xlabel('Machine Parameter')
    plt.ylabel('Mean Difference')
    plt.title(f'Mean Differences of Machine Parameters for x1 {group_name}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import pandas as pd

# Mean feature differences between each failure type and the reference rows
# ('No Failure' + 'Error') for Machine Type 'X1'.
machine_X1_data = data[data['Machine Type'] == 'X1']

# Group data by 'Failure Type'
grouped_by_failure = machine_X1_data.groupby('Failure Type')

# Stack the rows of the reference groups.  pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.  get_group raises
# KeyError if a reference group is absent for this machine type.
reference_groups = ['No Failure', 'Error']
reference_data = pd.concat([grouped_by_failure.get_group(group) for group in reference_groups])

# Selecting the relevant features
features_of_interest = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# The reference mean does not depend on the failure type, so compute it once
reference_means = reference_data[features_of_interest].mean()

# failure type -> (mean of the failure rows - mean of the reference rows), per feature
failure_differences = {
    failure_type: failure_data[features_of_interest].mean() - reference_means
    for failure_type, failure_data in grouped_by_failure
    if failure_type not in reference_groups
}

# Display the differences for each failure type
for failure_type, differences in failure_differences.items():
    print(f"Differences for {failure_type}:")
    print(differences)
    print()  # Add a blank line for readability


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences for each failure type (machine type X1, all five
# parameters).
# NOTE(review): `power_failure` is rebound here from the DataFrame assigned by
# the failure-type split cell to a plain dict -- confirm nothing re-run after
# this cell still expects the DataFrame.
heat_dissipation = {
    'Air temperature [K]': 3.791813,
    'Process temperature [K]': 1.031658,
    'Rotational speed [rpm]': -219.116081,
    'Torque [Nm]': 11.569728,
    'Tool wear [min]': -18.902850
}

overstrain_failure = {
    'Air temperature [K]': 3.891813,
    'Process temperature [K]': 2.481658,
    'Rotational speed [rpm]': -130.491081,
    'Torque [Nm]': 10.232228,
    'Tool wear [min]': 139.347150
}

power_failure = {
    'Air temperature [K]': 1.531813,
    'Process temperature [K]': 0.241658,
    'Rotational speed [rpm]': 40.108919,
    'Torque [Nm]': 12.792228,
    'Tool wear [min]': 35.147150
}

random_failures = {
    'Air temperature [K]': 1.916813,
    'Process temperature [K]': 0.731658,
    'Rotational speed [rpm]': -35.741081,
    'Torque [Nm]': -2.292772,
    'Tool wear [min]': -51.402850
}

tool_wear_failure = {
    'Air temperature [K]': 0.891813,
    'Process temperature [K]': -0.601675,
    'Rotational speed [rpm]': -49.324414,
    'Torque [Nm]': -2.484439,
    'Tool wear [min]': 114.180484
}

# Explicit name -> data mapping for the dicts defined above.  The previous
# version looked up 'mean_diff_*' names through globals(); those names are not
# defined at this point of the notebook (and 'mean_diff_overstrain_failure' is
# never defined at all), which raised KeyError.
mean_diff_lookup = {
    'tool_wear_failure': tool_wear_failure,
    'overstrain_failure': overstrain_failure,
    'power_failure': power_failure,
    'random_failures': random_failures,
    'heat_dissipation': heat_dissipation,
}

# Define groups of failure types (values are keys of mean_diff_lookup)
grouped_failure_types = {
    'Tool Wear Failure and Overstrain Failure': ['tool_wear_failure', 'overstrain_failure'],
    'Power Failure and Random Failures': ['power_failure', 'random_failures'],
    'Heat Dissipation Failure': ['heat_dissipation']
}

# Plotting mean differences for each group of failure types
for group_name, failure_types in grouped_failure_types.items():
    plt.figure(figsize=(12, 8))
    for failure_type in failure_types:
        mean_diff_data = mean_diff_lookup[failure_type]
        # Materialise the dict views as lists so matplotlib gets plain sequences
        plt.plot(list(mean_diff_data.keys()), list(mean_diff_data.values()), label=failure_type)
    plt.xlabel('Machine Parameter')
    plt.ylabel('Mean Difference')
    plt.title(f'Mean Differences of Machine Parameters for X1 {group_name}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences per failure type for three machine parameters.
# Each module-level name below is a dict: parameter -> mean difference.
parameters = ('Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]')
heat_dissipation = dict(zip(parameters, (-219.116081, 11.569728, -18.902850)))
overstrain_failure = dict(zip(parameters, (-130.491081, 10.232228, 139.347150)))
power_failure = dict(zip(parameters, (40.108919, 12.792228, 35.147150)))
random_failures = dict(zip(parameters, (-35.741081, -2.292772, -51.402850)))
tool_wear_failure = dict(zip(parameters, (-49.324414, -2.484439, 114.180484)))

# Name -> dict, used instead of a globals() lookup when plotting.
mean_diff_by_name = {
    'tool_wear_failure': tool_wear_failure,
    'overstrain_failure': overstrain_failure,
    'power_failure': power_failure,
    'random_failures': random_failures,
    'heat_dissipation': heat_dissipation,
}

# Failure types drawn together on one figure, keyed by the title suffix.
grouped_failure_types = {
    'Tool Wear Failure and Overstrain Failure': ['tool_wear_failure', 'overstrain_failure'],
    'Power Failure and Random Failures': ['power_failure', 'random_failures'],
    'Heat Dissipation Failure': ['heat_dissipation'],
}

# One figure per group; the legend shows the variable-style names.
for group_name, failure_types in grouped_failure_types.items():
    plt.figure(figsize=(12, 8))
    for failure_type in failure_types:
        mean_diff_data = mean_diff_by_name[failure_type]
        plt.plot(list(mean_diff_data), list(mean_diff_data.values()), label=failure_type)
    plt.xlabel('Machine Parameter')
    plt.ylabel('Mean Difference')
    plt.title(f'Mean Differences of Machine Parameters for X1 {group_name}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences per failure type across five machine parameters.
# Each module-level name below is a dict: parameter -> mean difference.
parameters = (
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
)
mean_diff_heat_dissipation = dict(zip(parameters, (3.791813, 1.031658, -219.116081, 11.569728, -18.902850)))
mean_diff_overstrain = dict(zip(parameters, (3.891813, 2.481658, -130.491081, 10.232228, 139.347150)))
mean_diff_power_failure = dict(zip(parameters, (1.531813, 0.241658, 40.108919, 12.792228, 35.147150)))
mean_diff_random_failures = dict(zip(parameters, (1.916813, 0.731658, -35.741081, -2.292772, -51.402850)))
mean_diff_tool_wear_failure = dict(zip(parameters, (0.891813, -0.601675, -49.324414, -2.484439, 114.180484)))

# A single figure with one line per failure type; the x-axis lists the machine
# parameters.
plt.figure(figsize=(12, 8))
labelled_differences = (
    ('Heat Dissipation Failure', mean_diff_heat_dissipation),
    ('Overstrain Failure', mean_diff_overstrain),
    ('Power Failure', mean_diff_power_failure),
    ('Random Failures', mean_diff_random_failures),
    ('Tool Wear Failure', mean_diff_tool_wear_failure),
)
for line_label, mean_diff in labelled_differences:
    plt.plot(list(mean_diff), list(mean_diff.values()), label=line_label)

plt.xlabel('Machine Parameter')
plt.ylabel('Mean Difference')
plt.title('Mean Differences of Machine Parameters for Different Failure Types for X1')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import pandas as pd

# Mean feature differences between each failure type and the reference rows
# ('No Failure' + 'Error') for Machine Type 'x_2'.
machine_x_2_data = data[data['Machine Type'] == 'x_2']

# Group data by 'Failure Type'
grouped_by_failure = machine_x_2_data.groupby('Failure Type')

# Collect the reference groups that actually exist for this machine type.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.  When
# no reference group exists, fall back to an empty frame that still carries
# the original columns so the feature selection below yields NaN means instead
# of raising KeyError.
reference_groups = ['No Failure', 'Error']
reference_frames = [
    grouped_by_failure.get_group(group)
    for group in reference_groups
    if group in grouped_by_failure.groups
]
reference_data = pd.concat(reference_frames) if reference_frames else machine_x_2_data.iloc[0:0]

# Selecting the relevant features
features_of_interest = ['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']

# The reference mean does not depend on the failure type, so compute it once
reference_means = reference_data[features_of_interest].mean()

# failure type -> (mean of the failure rows - mean of the reference rows), per feature
failure_differences = {
    failure_type: failure_data[features_of_interest].mean() - reference_means
    for failure_type, failure_data in grouped_by_failure
    if failure_type not in reference_groups
}

# Decide on the actual presence of non-reference failure types.  Comparing the
# number of groups with len(reference_groups) misreports the case where a
# reference group is missing but a real failure type is present.
if failure_differences:
    # Display the differences for each failure type
    for failure_type, differences in failure_differences.items():
        print(f"Differences for {failure_type}:")
        print(differences)
        print()  # Add a blank line for readability
else:
    print("There are no failure types other than the reference groups ('No Failure' and 'Error').")


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences (failure mean - reference mean) for machine
# type x_2, one dict per failure type, keyed by machine parameter.
# NOTE(review): `power_failure` is also the name of the "Power Failure"
# DataFrame created near the top of the notebook; this assignment replaces it.
heat_dissipation = {
    'Rotational speed [rpm]': -219.116081,
    'Torque [Nm]': 11.569728,
    'Tool wear [min]': -18.902850
}

overstrain_failure = {
    'Rotational speed [rpm]': -130.491081,
    'Torque [Nm]': 10.232228,
    'Tool wear [min]': 139.347150
}

power_failure = {
    'Rotational speed [rpm]': 40.108919,
    'Torque [Nm]': 12.792228,
    'Tool wear [min]': 35.147150
}

random_failures = {
    'Rotational speed [rpm]': -35.741081,
    'Torque [Nm]': -2.292772,
    'Tool wear [min]': -51.402850
}

tool_wear_failure = {
    'Rotational speed [rpm]': -49.324414,
    'Torque [Nm]': -2.484439,
    'Tool wear [min]': 114.180484
}

# Explicit name -> data lookup. This replaces a globals()[name] lookup, which
# silently depends on whatever happens to be bound to that name in the kernel.
mean_diff_by_name = {
    'heat_dissipation': heat_dissipation,
    'overstrain_failure': overstrain_failure,
    'power_failure': power_failure,
    'random_failures': random_failures,
    'tool_wear_failure': tool_wear_failure,
}

# Define groups of failure types (one figure is drawn per group)
grouped_failure_types = {
    'Tool Wear Failure and Overstrain Failure': ['tool_wear_failure', 'overstrain_failure'],
    'Power Failure and Random Failures': ['power_failure', 'random_failures'],
    'Heat Dissipation Failure': ['heat_dissipation']
}

# Plotting mean differences for each group of failure types
for group_name, failure_types in grouped_failure_types.items():
    plt.figure(figsize=(12, 8))
    for failure_type in failure_types:
        mean_diff_data = mean_diff_by_name[failure_type]
        # Materialize the dict views so matplotlib receives plain sequences.
        plt.plot(list(mean_diff_data.keys()), list(mean_diff_data.values()), label=failure_type)
    plt.xlabel('Machine Parameter')
    plt.ylabel('Mean Difference')
    plt.title(f'Mean Differences of Machine Parameters for x_2 {group_name}')
    plt.xticks(rotation=45)
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt

# Hard-coded mean differences for machine type x_2. Every failure type lists
# its values in the same parameter order, so the dicts are zipped from it.
machine_parameters = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

mean_diff_heat_dissipation = dict(zip(
    machine_parameters, [3.791813, 1.031658, -219.116081, 11.569728, -18.902850]))

mean_diff_overstrain = dict(zip(
    machine_parameters, [3.891813, 2.481658, -130.491081, 10.232228, 139.347150]))

mean_diff_power_failure = dict(zip(
    machine_parameters, [1.531813, 0.241658, 40.108919, 12.792228, 35.147150]))

mean_diff_random_failures = dict(zip(
    machine_parameters, [1.916813, 0.731658, -35.741081, -2.292772, -51.402850]))

mean_diff_tool_wear_failure = dict(zip(
    machine_parameters, [0.891813, -0.601675, -49.324414, -2.484439, 114.180484]))

# Legend label -> series, in the order the lines are drawn.
series_by_label = {
    'Heat Dissipation Failure': mean_diff_heat_dissipation,
    'Overstrain Failure': mean_diff_overstrain,
    'Power Failure': mean_diff_power_failure,
    'Random Failures': mean_diff_random_failures,
    'Tool Wear Failure': mean_diff_tool_wear_failure,
}

# One figure with one line per failure type; the x axis is the machine
# parameter (a categorical axis, not time).
plt.figure(figsize=(12, 8))
for legend_label, mean_diffs in series_by_label.items():
    plt.plot(mean_diffs.keys(), mean_diffs.values(), label=legend_label)

plt.xlabel('Machine Parameter')
plt.ylabel('Mean Difference')
plt.title('Mean Differences of Machine Parameters for Different Failure Types for x_2')
plt.xticks(rotation=45)
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Pairwise correlation between the five sensor readings, drawn as an
# annotated heatmap.
selected_columns = [
    'Air temperature [K]',
    'Process temperature [K]',
    'Rotational speed [rpm]',
    'Torque [Nm]',
    'Tool wear [min]',
]

# Restrict the frame to those columns, then correlate every pair of them.
selected_data = data[selected_columns]
correlation_matrix = selected_data.corr()

# Render the full matrix with each coefficient printed to two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap='coolwarm',
    annot=True,
    fmt=".2f",
    linewidths=.5,
)
plt.title('Correlation Heatmap of Process Temperature with Other Features')
plt.show()


ANOMALY ANALYSIS

In [None]:
import pandas as pd
from sklearn.ensemble import IsolationForest  # Example anomaly detection technique
from sklearn.preprocessing import StandardScaler  # For feature scaling
import numpy as np



# Flag anomalous sensor readings with an Isolation Forest fitted on the five
# standardized sensor columns (the target is deliberately left out).
sensor_data = data[['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']]

# Check for missing values
print("Missing values per column:")
print(sensor_data.isnull().sum())

# Impute any remaining gaps with the column median
if sensor_data.isnull().sum().any():
    sensor_data = sensor_data.fillna(sensor_data.median())

# Feature scaling (standardization)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(sensor_data)

# contamination=0.01 tells the model to treat about 1% of the fitted rows as
# outliers; adjust it to the expected anomaly rate.
clf = IsolationForest(contamination=0.01, random_state=42)
clf.fit(scaled_data)

# decision_function returns LOWER values for more abnormal samples; negative
# values are the rows the fitted model classifies as outliers.
anomaly_scores = clf.decision_function(scaled_data)

# Select the outliers. The previous filter (`anomaly_scores > 0.23`) kept the
# highest scores, i.e. the most normal rows, which is the opposite of what
# the cell is meant to show.
anomaly_indices = np.where(anomaly_scores < 0)[0]

# Look the flagged positions up in the original (unscaled) data so they can be
# reviewed with domain knowledge.
anomaly_data = data.iloc[anomaly_indices]

anomaly_data


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn import svm

# Score every row with two unsupervised detectors (Isolation Forest and
# One-Class SVM) fitted on the same standardized sensor columns.
sensor_data = data[['Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']]

# Handle missing values (impute with the column mean)
print("Missing values per column:")
print(sensor_data.isnull().sum())

if sensor_data.isnull().sum().any():
    sensor_data = sensor_data.fillna(sensor_data.mean())

# Feature scaling (standardization). Fitted once and shared by both models;
# re-fitting the scaler on the same data would only reproduce scaled_data.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(sensor_data)

# --- Isolation Forest ---
clf_isolation_forest = IsolationForest(contamination=0.01, random_state=42)  # Adjust contamination parameter as needed
clf_isolation_forest.fit(scaled_data)

# Lower decision-function values mean more abnormal rows.
anomaly_scores_isolation_forest = clf_isolation_forest.decision_function(scaled_data)

# --- One-Class SVM ---
clf_one_class_svm = svm.OneClassSVM(nu=0.1, kernel='rbf')  # Adjust nu and kernel parameters as needed
clf_one_class_svm.fit(scaled_data)

# Anomaly scores using decision function (lower scores indicate anomalies)
anomaly_scores_one_class_svm = clf_one_class_svm.decision_function(scaled_data)

# --- Anomaly identification (Isolation Forest scores) ---
# Take the 10% of rows with the LOWEST scores. argsort() is ascending, so the
# most anomalous rows come first; slicing the tail instead returned the most
# normal rows (and the whole array when the count rounded down to zero).
n_top_anomalies = int(0.1 * len(anomaly_scores_isolation_forest))  # Adjust anomaly percentage
anomaly_indices_isolation_forest = anomaly_scores_isolation_forest.argsort()[:n_top_anomalies]


In [None]:
# Display the One-Class SVM decision-function values computed by the cell
# above (per that cell, lower scores indicate anomalies). This shows the
# whole array; slice it (e.g. [:10]) if only a preview is needed.
anomaly_scores_one_class_svm

In [None]:
# Display the Isolation Forest decision-function values computed two cells
# above. This shows the whole array; slice it (e.g. [:10]) for a preview.
anomaly_scores_isolation_forest

**FEATURE SELECTION**

In [None]:
import numpy as np

# Bootstrap 95% confidence intervals of the per-feature mean, computed
# separately for 'No Failure' rows and for all remaining rows.
relevant_features = ["Air temperature [K]", "Process temperature [K]", "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]

# Split the rows into the two groups and keep only the feature values.
no_failure_mask = data['Failure Type'] == 'No Failure'
successful_data = data[no_failure_mask][relevant_features].values
other_data = data[~no_failure_mask][relevant_features].values

# Number of bootstrap samples
num_bootstrap_samples = 1000


def _bootstrap_feature_means(samples, n_resamples):
    """Return {feature: [mean of each resample]} for a 2-D array of rows.

    Each resample draws len(samples) row positions with replacement from the
    global NumPy random state and records the mean of every feature column.
    """
    n_rows = len(samples)
    means_by_feature = {feature: [] for feature in relevant_features}
    for _ in range(n_resamples):
        resampled = samples[np.random.choice(n_rows, size=n_rows, replace=True)]
        for column, feature in enumerate(relevant_features):
            means_by_feature[feature].append(np.mean(resampled[:, column]))
    return means_by_feature


# Seed once, then resample the successful runs first and the other rows
# second, so the random draws are reproducible.
np.random.seed(42)  # Replace 42 with any desired seed value
bootstrap_means_successful = _bootstrap_feature_means(successful_data, num_bootstrap_samples)
bootstrap_means_other = _bootstrap_feature_means(other_data, num_bootstrap_samples)

# The 2.5th and 97.5th percentiles of the resampled means bound the 95% CI.
confidence_intervals_successful = {}
confidence_intervals_other = {}
for feature in relevant_features:
    confidence_intervals_successful[feature] = np.percentile(bootstrap_means_successful[feature], [2.5, 97.5])
    confidence_intervals_other[feature] = np.percentile(bootstrap_means_other[feature], [2.5, 97.5])

# Report both intervals feature by feature.
for feature in relevant_features:
    print(f"95% Confidence Interval for Successful Runs ({feature}):", confidence_intervals_successful[feature])
    print(f"95% Confidence Interval for Other Instances ({feature}):", confidence_intervals_other[feature])


In [None]:
from sklearn.inspection import permutation_importance
from sklearn.preprocessing import StandardScaler
from sklearn.dummy import DummyClassifier
import numpy as np
# Permutation importance of three sensor features, evaluated separately on
# the 'No Failure' rows and on all other rows, using a constant classifier.
#
# NOTE(review): DummyClassifier(strategy='constant') always predicts the same
# class and never looks at X, so shuffling a feature column cannot change its
# predictions. Expect every importance printed below to be 0. Informative
# values need a model actually fitted to separate failures from non-failures.
relevant_features = ["Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]

# Feature values of the two groups.
is_no_failure = data['Failure Type'] == 'No Failure'
successful_data = data[is_no_failure][relevant_features].values
other_data = data[~is_no_failure][relevant_features].values

# Constant estimator that always predicts class 0; it is fitted on
# placeholder zeros only so that it counts as a fitted estimator.
n_successful = len(successful_data)
dummy_estimator = DummyClassifier(strategy='constant', constant=0)
dummy_estimator.fit(X=np.zeros((n_successful, 1)), y=np.zeros(n_successful))

# Standardize with statistics learned from the successful runs only.
scaler = StandardScaler()
successful_data_scaled = scaler.fit_transform(successful_data)
other_data_scaled = scaler.transform(other_data)

# Same estimator and repeat settings for both groups; labels are all zeros.
shared_settings = dict(estimator=dummy_estimator, n_repeats=30, random_state=42)
perm_importances_successful = permutation_importance(
    X=successful_data_scaled, y=np.zeros(n_successful), **shared_settings)
perm_importances_other = permutation_importance(
    X=other_data_scaled, y=np.zeros(len(other_data)), **shared_settings)

# Mean importance of each feature across the repeats.
mean_importances_successful = perm_importances_successful.importances_mean
mean_importances_other = perm_importances_other.importances_mean

# Print one section per group, one line per feature.
report_sections = [
    ("Mean Permutation Importances for Successful Runs:", mean_importances_successful),
    ("\nMean Permutation Importances for Other Instances:", mean_importances_other),
]
for heading, importances in report_sections:
    print(heading)
    for feature, importance in zip(relevant_features, importances):
        print(f"{feature}: {importance}")


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation of the three retained sensor features with each other and with
# the Target column, drawn as an annotated heatmap.
necessary_features = ["Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]

# Features plus the target, in that column order.
heatmap_columns = [*necessary_features, "Target"]
selected_data = data[heatmap_columns]

# Pairwise correlation between all selected columns.
correlation_matrix = selected_data.corr()

# Draw the matrix with two-decimal annotations and readable tick labels.
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    annot=True,
    annot_kws={"size": 12},
    fmt=".2f",
)
plt.title("Correlation Matrix Heatmap of Necessary Features vs. Target", fontsize=16)
plt.xlabel("Features", fontsize=14)
plt.ylabel("Features", fontsize=14)
plt.xticks(rotation=45)
plt.yticks(rotation=0)
plt.show()


**ANOMALY DETECTION**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest

# Fit an Isolation Forest on 80% of the numeric data and flag outliers in the
# held-out 20%.
numeric_data = data.select_dtypes(include=['float64', 'int64'])  # Assuming numeric columns are of type float64 or int64

# The label must not be a model input: the flagged rows are later judged by
# their 'Target' value, so fitting on it would make that comparison circular.
# 'Failure' is referenced as a label by a later cell; it is excluded too if
# it exists.
# NOTE(review): an identifier column such as 'UID' (if still present and
# numeric) is used as a feature here - confirm whether it should be dropped.
label_columns = [column for column in ('Target', 'Failure') if column in numeric_data.columns]
feature_columns = [column for column in numeric_data.columns if column not in label_columns]

# Hold out 20% of the rows. The split depends only on the row count and
# random_state, so it selects the same rows as before.
X_train, X_test = train_test_split(numeric_data, test_size=0.2, random_state=42)

# Create and fit the Isolation Forest model on the feature columns only
isolation_forest = IsolationForest(contamination='auto', random_state=42)
isolation_forest.fit(X_train[feature_columns])

# Predict on the held-out rows (-1 marks an outlier, 1 an inlier)
outlier_preds = isolation_forest.predict(X_test[feature_columns])

# Keep every numeric column (including the label) for the flagged rows so
# that later cells can still inspect 'Target'.
outliers = X_test[outlier_preds == -1]
iso_outliers = outliers.copy()

# Display the outliers DataFrame for testing set
print("DataFrame for identified outliers on testing set:")
print(iso_outliers)


In [None]:
# Number of Isolation Forest outliers whose Target equals 1 (note that,
# despite the `_0` suffix, this counts the Target == 1 rows).
iso_target_is_one = iso_outliers["Target"] == 1
iso_0 = len(iso_outliers[iso_target_is_one])
iso_0

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

# Fit a Local Outlier Factor model (novelty mode) on 80% of the numeric data
# and flag outliers in the held-out 20%.
numeric_data = data.select_dtypes(include=['float64', 'int64'])

# The label must not be a model input: the flagged rows are later judged by
# their 'Target' value. 'Failure' is referenced as a label by a later cell;
# it is excluded too if it exists.
# NOTE(review): an identifier column such as 'UID' (if still present and
# numeric) is used as a feature here - confirm whether it should be dropped.
label_columns = [column for column in ('Target', 'Failure') if column in numeric_data.columns]
feature_columns = [column for column in numeric_data.columns if column not in label_columns]

# Hold out 20% of the rows. The split depends only on the row count and
# random_state, so it selects the same rows as before.
X_train, X_test = train_test_split(numeric_data, test_size=0.2, random_state=42)

# LOF is distance based, so features measured in different units (rpm,
# kelvin, minutes) must be standardized or the widest-ranging one dominates.
# The scaler is fitted on the training rows only.
lof_scaler = StandardScaler()
X_train_scaled = lof_scaler.fit_transform(X_train[feature_columns])
X_test_scaled = lof_scaler.transform(X_test[feature_columns])

# novelty=True allows predict() on rows the model has not been fitted on
lof = LocalOutlierFactor(n_neighbors=20, contamination='auto', novelty=True)
lof.fit(X_train_scaled)

# Predict on the held-out rows (-1 marks an outlier, 1 an inlier)
outlier_preds_lof = lof.predict(X_test_scaled)

# Keep the original, unscaled numeric columns (including the label) for the
# flagged rows so that later cells can still inspect 'Target'.
outliers_lof = X_test[outlier_preds_lof == -1]
LOF_outliers = outliers_lof.copy()

# Display the outliers DataFrame for testing set
print("DataFrame for identified outliers on testing set:")
LOF_outliers.info()


In [None]:
LOF_1 = LOF_outliers[LOF_outliers["Target"] == 1].shape[0]
LOF_1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Fit a One-Class SVM on a stratified 80% of the numeric data and flag
# outliers in the held-out 20%.
data_with_target = data.copy()  # Avoid modifying the original data

# Column used to stratify the split (1: failure, 0: normal). Every other cell
# of this notebook uses 'Target'; 'Failure' is kept as first choice only for
# backward compatibility in case that column exists.
label_column = 'Failure' if 'Failure' in data_with_target.columns else 'Target'

# Numeric columns; this still includes the numeric label column(s).
numeric_data = data_with_target.select_dtypes(include=['float64', 'int64'])

# The label must not be a model input: the flagged rows are meant to be
# compared against it afterwards.
# NOTE(review): an identifier column such as 'UID' (if still present and
# numeric) is used as a feature here - confirm whether it should be dropped.
label_columns = [column for column in ('Target', 'Failure') if column in numeric_data.columns]
feature_columns = [column for column in numeric_data.columns if column not in label_columns]

# Stratified split so both parts keep the same share of failures
X_train, X_test, y_train, y_test = train_test_split(
    numeric_data,
    data_with_target[label_column],
    test_size=0.2,
    random_state=42,
    stratify=data_with_target[label_column],
)

# An RBF-kernel SVM is not scale invariant; standardize using statistics from
# the training rows only (the earlier One-Class SVM cell scales as well).
svm_scaler = StandardScaler()
X_train_scaled = svm_scaler.fit_transform(X_train[feature_columns])
X_test_scaled = svm_scaler.transform(X_test[feature_columns])

# One-Class SVM for anomaly detection
clf = OneClassSVM(nu=0.1, kernel='rbf')  # You can experiment with different parameters
clf.fit(X_train_scaled)

# Predict on the held-out rows (-1 marks an outlier, 1 an inlier)
outlier_preds_svm = clf.predict(X_test_scaled)

# Keep the original, unscaled numeric columns (including the label) for the
# flagged rows.
outliers_svm = X_test[outlier_preds_svm == -1]
SVM_outliers = outliers_svm.copy()

# Display the outliers DataFrame for testing set
print("DataFrame for identified outliers using SVM on testing set:")
SVM_outliers.info()



In [None]:
def _target_share(frame):
    """Return the percentage of rows per 'Target' value in `frame`."""
    return frame['Target'].value_counts(normalize=True) * 100


# Share of each target value in the full dataset vs. among the LOF outliers.
main_target_percentage = _target_share(data)
outliers_target_percentage = _target_share(LOF_outliers)

# Each heading is followed by its table on the next line.
print("Percentage of target values in the main dataset:", main_target_percentage, sep="\n")
print("\nPercentage of target values in the outliers dataset:", outliers_target_percentage, sep="\n")

In [None]:
# Share of all failures (Target == 1) that the Isolation Forest flagged.
target_1_main = data[data['Target'] == 1]
print(len(target_1_main))

# Count the occurrences of target value 1 in the main dataset
total_target_1_main = len(target_1_main)

# Count the failures among the Isolation Forest outliers. The boolean mask
# must come from the frame being filtered: the previous code masked
# iso_outliers with LOF_outliers['Target'], whose rows are a different set.
# To evaluate the LOF detector instead, use LOF_outliers on BOTH sides.
total_target_1_outliers = iso_outliers[iso_outliers['Target'] == 1]['Target'].count()

print(total_target_1_outliers)

# NOTE(review): iso_outliers comes from the 20% test split only, while the
# denominator counts failures in the whole dataset, so this percentage is a
# lower bound on the detection rate rather than a recall on the test set.
if total_target_1_main:
    percentage_target_1_outliers = (total_target_1_outliers / total_target_1_main) * 100
else:
    # No failures in the dataset at all: the ratio is undefined.
    percentage_target_1_outliers = float('nan')

# Print the percentage
print("Percentage of target value 1 outliers with respect to total occurrences of target value 1 in the main dataset:", percentage_target_1_outliers)
