In [47]:
# 1
import pandas as pd

# Load the datasets
historical_data = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/historical_data.csv')
equipment_failure = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/equipment_failure.csv')
minor_hazard_rep = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/minor_hazard_rep.csv')

# Display basic information about each dataset
print("Equipment Failure Dataset Info:")
print(equipment_failure.info())
print(equipment_failure.head())

print("\nHistorical Data Dataset Info:")
print(historical_data.info())
print(historical_data.head())

print("\nMiner Hazard Reports Dataset Info:")
print(minor_hazard_rep.info())
print(minor_hazard_rep.head())


Equipment Failure Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Equipment ID      1000 non-null   object
 1   Failure Date      1000 non-null   object
 2   Type              1000 non-null   object
 3   Maintenance Date  1000 non-null   object
 4   Failure Cause     1000 non-null   object
 5   Risk Level        1000 non-null   object
 6   Severity          1000 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 54.8+ KB
None
  Equipment ID Failure Date                Type Maintenance Date  \
0       E-0001   2022-08-06       Conveyor Belt       2023-07-31   
1       E-0002   2023-03-14         Pump System       2023-01-31   
2       E-0003   2024-06-03  Ventilation System       2023-05-04   
3       E-0004   2024-04-24    Drilling Machine       2021-12-02   
4       E-0005   2023-02-13  Ventilation System      

In [49]:
# 2
# Drop rows with missing values in all datasets
equipment_failure_cleaned = equipment_failure.dropna()
historical_data_cleaned = historical_data.dropna()
minor_hazard_rep_cleaned = minor_hazard_rep.dropna()

# Check for missing values after cleaning
print("Equipment Failure Dataset Missing Values:\n", equipment_failure_cleaned.isnull().sum())
print("Historical Data Dataset Missing Values:\n", historical_data_cleaned.isnull().sum())
print("Miner Hazard Reports Dataset Missing Values:\n", minor_hazard_rep_cleaned.isnull().sum())

Equipment Failure Dataset Missing Values:
 Equipment ID        0
Failure Date        0
Type                0
Maintenance Date    0
Failure Cause       0
Risk Level          0
Severity            0
dtype: int64
Historical Data Dataset Missing Values:
 Incident ID           0
Date                  0
Location              0
Hazard Type           0
Severity              0
Cause                 0
Failure Cause         0
Equipment Type        0
Risk Level            0
Recommended Action    0
Outcome               0
dtype: int64
Miner Hazard Reports Dataset Missing Values:
 Report ID      0
Timestamp      0
Miner ID       0
Hazard Type    0
Location       0
Severity       0
Description    0
Risk Level     0
Cause          0
dtype: int64


In [51]:
# 3
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Normalize the Severity columns
equipment_failure_cleaned['Severity'] = scaler.fit_transform(equipment_failure_cleaned[['Severity']])
historical_data_cleaned['Severity'] = scaler.fit_transform(historical_data_cleaned[['Severity']])
minor_hazard_rep_cleaned['Severity'] = scaler.fit_transform(minor_hazard_rep_cleaned[['Severity']])

# Check normalized data
print("Normalized Equipment Failure Dataset:\n", equipment_failure_cleaned.head())
print("\nNormalized Historical Data Dataset:\n", historical_data_cleaned.head())
print("\nNormalized Miner Hazard Reports Dataset:\n", minor_hazard_rep_cleaned.head())


Normalized Equipment Failure Dataset:
   Equipment ID Failure Date                Type Maintenance Date  \
0       E-0001   2022-08-06       Conveyor Belt       2023-07-31   
1       E-0002   2023-03-14         Pump System       2023-01-31   
2       E-0003   2024-06-03  Ventilation System       2023-05-04   
3       E-0004   2024-04-24    Drilling Machine       2021-12-02   
4       E-0005   2023-02-13  Ventilation System       2021-07-03   

      Failure Cause Risk Level  Severity  
0      Improper Use       High  0.475344  
1       Wear & Tear       High  0.475344  
2  Electrical Fault     Medium -0.289490  
3  Electrical Fault        Low -1.436740  
4           Unknown       High  0.475344  

Normalized Historical Data Dataset:
   Incident ID        Date  Location          Hazard Type  Severity  \
0      I-0001  2024-03-30  Sector A        Gas Explosion  0.531053   
1      I-0002  2021-12-10  Sector A        Gas Explosion -0.517768   
2      I-0003  2024-06-16  Sector A        Gas

In [53]:
#4 
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# Remove outliers from the Severity columns
equipment_failure_cleaned = remove_outliers(equipment_failure_cleaned, 'Severity')
historical_data_cleaned = remove_outliers(historical_data_cleaned, 'Severity')
minor_hazard_rep_cleaned = remove_outliers(minor_hazard_rep_cleaned, 'Severity')

# Check datasets after outlier removal
print("Equipment Failure Dataset After Outlier Removal:\n", equipment_failure_cleaned.describe())
print("\nHistorical Data Dataset After Outlier Removal:\n", historical_data_cleaned.describe())
print("\nMiner Hazard Reports Dataset After Outlier Removal:\n", minor_hazard_rep_cleaned.describe())

Equipment Failure Dataset After Outlier Removal:
            Severity
count  1.000000e+03
mean   1.065814e-16
std    1.000500e+00
min   -1.436740e+00
25%   -1.436740e+00
50%    4.753441e-01
75%    1.240178e+00
max    1.240178e+00

Historical Data Dataset After Outlier Removal:
            Severity
count  1.000000e+03
mean   3.286260e-17
std    1.000500e+00
min   -1.566588e+00
25%   -8.673744e-01
50%   -1.681609e-01
75%    8.806595e-01
max    1.579873e+00

Miner Hazard Reports Dataset After Outlier Removal:
            Severity
count  1.000000e+03
mean  -1.412204e-16
std    1.000500e+00
min   -1.537065e+00
25%   -8.324743e-01
50%   -1.278833e-01
75%    9.290032e-01
max    1.633594e+00


In [55]:
# 5
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load the miner hazard report dataset
miner_hazard_rep = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/minor_hazard_rep.csv')

# Clean the dataset: drop missing values and normalize Severity
miner_hazard_rep_cleaned = miner_hazard_rep.dropna()

# Normalize the Severity column
scaler = StandardScaler()
miner_hazard_rep_cleaned['Severity'] = scaler.fit_transform(miner_hazard_rep_cleaned[['Severity']])

# Check the cleaned dataset
print("Cleaned Miner Hazard Reports Dataset:\n", miner_hazard_rep_cleaned.head())


Cleaned Miner Hazard Reports Dataset:
   Report ID            Timestamp Miner ID             Hazard Type  Location  \
0    R-0001  2024-04-01 02:50:20   M-0405               Fire Risk  Sector D   
1    R-0002  2024-04-23 15:55:31   M-0018  Structural Instability  Sector D   
2    R-0003  2024-11-17 03:56:36   M-0141               Fire Risk  Sector C   
3    R-0004  2024-03-15 15:28:12   M-0444   Equipment Malfunction  Sector A   
4    R-0005  2024-10-31 17:38:12   M-0042  Structural Instability  Sector D   

   Severity                       Description Risk Level                Cause  
0  0.576708     Equipment making loud noises.       High    Equipment Failure  
1  0.929003  High temperature near equipment.       High    Equipment Failure  
2  1.281299       Detected unusual gas smell.   Critical        Gas Explosion  
3 -0.127883  Noticed cracks in the structure.     Medium  Structural Collapse  
4  1.633594     Equipment making loud noises.   Critical    Equipment Failure  


In [57]:
import joblib
from sklearn.preprocessing import LabelEncoder

# Features for miner reports
miner_features = miner_hazard_rep_cleaned[['Severity', 'Cause', 'Location', 'Hazard Type']]
miner_target_hazard = miner_hazard_rep_cleaned['Hazard Type']
miner_target_cause = miner_hazard_rep_cleaned['Cause']

# Encode targets
label_encoder_hazard = LabelEncoder()
label_encoder_cause = LabelEncoder()

miner_target_hazard_encoded = label_encoder_hazard.fit_transform(miner_target_hazard)
miner_target_cause_encoded = label_encoder_cause.fit_transform(miner_target_cause)

# Save the label encoders for reuse
joblib.dump(label_encoder_hazard, '/Users/deepu/Desktop/Mini_Project/label_encoder_hazard.pkl')
joblib.dump(label_encoder_cause, '/Users/deepu/Desktop/Mini_Project/label_encoder_cause.pkl')
print("Label encoders saved successfully!")

# Display Encoded Targets
print("Miner Dataset Targets (Hazard Types):\n", miner_target_hazard_encoded[:5])
print("\nMiner Dataset Targets (Causes):\n", miner_target_cause_encoded[:5])


Label encoders saved successfully!
Miner Dataset Targets (Hazard Types):
 [1 3 1 0 3]

Miner Dataset Targets (Causes):
 [0 0 1 2 0]


In [59]:
# Equipment Failure Data
equipment_features = equipment_failure_cleaned[['Severity', 'Risk Level']]

# Use 'Failure Cause' as the target column
if 'Failure Cause' in equipment_failure_cleaned.columns:
    equipment_target_cause = equipment_failure_cleaned['Failure Cause']
    # Encode the target
    equipment_target_cause_encoded = label_encoder_cause.fit_transform(equipment_target_cause)
else:
    print("Warning: 'Failure Cause' column not found in equipment_failure_cleaned. Assigning placeholders.")
    equipment_target_cause_encoded = np.zeros(len(equipment_features), dtype=int)  # Placeholder for missing targets

# Verify encoded target
print("Encoded 'Failure Cause':", equipment_target_cause_encoded[:5])

Encoded 'Failure Cause': [2 5 1 1 4]


In [61]:
print("Columns in Miner Hazard Reports Dataset:\n", miner_hazard_rep_cleaned.columns)

Columns in Miner Hazard Reports Dataset:
 Index(['Report ID', 'Timestamp', 'Miner ID', 'Hazard Type', 'Location',
       'Severity', 'Description', 'Risk Level', 'Cause'],
      dtype='object')


In [63]:
# Load and clean the equipment failure dataset
equipment_failure = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/equipment_failure.csv')

# Drop missing values
equipment_failure_cleaned = equipment_failure.dropna()

# Select features for training
equipment_features = equipment_failure_cleaned[['Risk Level', 'Severity']]

In [65]:
# Extract features from each cleaned dataset
historical_features = historical_data_cleaned[['Severity', 'Location']].rename(columns={'Severity': 'Severity'})
miner_features = miner_hazard_rep_cleaned[['Severity', 'Location']]
equipment_features = equipment_failure_cleaned[['Severity', 'Risk Level']].rename(columns={'Risk Level': 'Location'})

# Combine aligned features into a single DataFrame
try:
    features = pd.concat([historical_features, miner_features, equipment_features], axis=0).reset_index(drop=True)
    print("Features DataFrame created successfully.")
except Exception as e:
    print("Error while creating features DataFrame:", e)

# Check the combined features
print(features.head())
print(features.dtypes)

Features DataFrame created successfully.
   Severity  Location
0  0.531053  Sector A
1 -0.517768  Sector A
2  0.880659  Sector A
3 -0.867374  Sector D
4  0.880659  Sector A
Severity    float64
Location     object
dtype: object


In [67]:
# Check the data types of features
print(features.dtypes)

Severity    float64
Location     object
dtype: object


In [69]:
# Ensure features are properly combined
try:
    features = pd.concat([
        historical_features[['Severity (1-10)', 'Location']].rename(columns={'Severity (1-10)': 'Severity'}),
        miner_features[['Severity', 'Location']],
        equipment_features[['Severity', 'Risk Level']]
    ], axis=0).reset_index(drop=True)

    print("Features DataFrame created successfully.")
except Exception as e:
    print("Error while creating features DataFrame:", e)

# Check the data types of features
if 'features' in locals():
    print(features.dtypes)
else:
    print("Features DataFrame is not defined.")

Error while creating features DataFrame: "['Severity (1-10)'] not in index"
Severity    float64
Location     object
dtype: object


In [71]:
# Step 1: Check columns in `equipment_features` to verify if 'Risk Level' exists
if 'Risk Level' not in equipment_features.columns:
    print("Error: 'Risk Level' column not found in equipment_features.")
    print("Columns available in equipment_features:", equipment_features.columns)
    
    # Step 2: If 'Risk Level' is missing, you may either rename or add a placeholder column
    # Option 1: Rename if there is a different column name or typo
    # equipment_features = equipment_features.rename(columns={'incorrect_column_name': 'Risk Level'})
    
    # Option 2: Add a placeholder if 'Risk Level' should exist but is missing
    equipment_features['Risk Level'] = 'Unknown'  # or some default value

# Step 3: Verify column names after possible renaming or adding the 'Risk Level' column
print("Columns after handling 'Risk Level':", equipment_features.columns)

# Step 4: Concatenate the DataFrames (historical_features, miner_features, and equipment_features)
features = pd.concat([
    historical_features[['Severity', 'Location']].rename(columns={'Severity (1-10)': 'Severity'}),
    miner_features[['Severity', 'Location']],
    equipment_features[['Severity', 'Risk Level']]  # Ensure 'Risk Level' exists
], axis=0).reset_index(drop=True)

# Step 5: Check the concatenated DataFrame
print(features.head())

# Step 6: Optionally, inspect for any missing values or inconsistencies
print(features.isnull().sum())


Error: 'Risk Level' column not found in equipment_features.
Columns available in equipment_features: Index(['Severity', 'Location'], dtype='object')
Columns after handling 'Risk Level': Index(['Severity', 'Location', 'Risk Level'], dtype='object')
   Severity  Location Risk Level
0  0.531053  Sector A        NaN
1 -0.517768  Sector A        NaN
2  0.880659  Sector A        NaN
3 -0.867374  Sector D        NaN
4  0.880659  Sector A        NaN
Severity         0
Location      1000
Risk Level    2000
dtype: int64


In [73]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the 'Risk Level' column
features['Risk Level'] = label_encoder.fit_transform(features['Risk Level'])

# Encode the 'Location' column
features['Location'] = label_encoder.fit_transform(features['Location'])

# Verify the output
print("Encoded 'Risk Level' and 'Location':\n", features[['Risk Level', 'Location']].head())


Encoded 'Risk Level' and 'Location':
    Risk Level  Location
0           1         0
1           1         0
2           1         0
3           1         3
4           1         0


In [75]:
print(features.dtypes)

Severity      float64
Location        int64
Risk Level      int64
dtype: object


In [77]:
print("Features DataFrame Columns:", features.columns)

Features DataFrame Columns: Index(['Severity', 'Location', 'Risk Level'], dtype='object')


In [79]:
import pandas as pd

# Load datasets
historical_data = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/historical_data.csv')
equipment_failure = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/equipment_failure.csv')
minor_hazard_rep = pd.read_csv('/Users/deepu/Desktop/Mini_Project/Datasets/minor_hazard_rep.csv')


# Combine data (example approach, replace with your actual logic)
combined_data = pd.concat([equipment_failure, historical_data, minor_hazard_rep], axis=0)


In [81]:
print(combined_data.head())
print(combined_data.columns)

  Equipment ID Failure Date                Type Maintenance Date  \
0       E-0001   2022-08-06       Conveyor Belt       2023-07-31   
1       E-0002   2023-03-14         Pump System       2023-01-31   
2       E-0003   2024-06-03  Ventilation System       2023-05-04   
3       E-0004   2024-04-24    Drilling Machine       2021-12-02   
4       E-0005   2023-02-13  Ventilation System       2021-07-03   

      Failure Cause Risk Level  Severity Incident ID Date Location  \
0      Improper Use       High         8         NaN  NaN      NaN   
1       Wear & Tear       High         8         NaN  NaN      NaN   
2  Electrical Fault     Medium         6         NaN  NaN      NaN   
3  Electrical Fault        Low         3         NaN  NaN      NaN   
4           Unknown       High         8         NaN  NaN      NaN   

  Hazard Type Cause Equipment Type Recommended Action Outcome Report ID  \
0         NaN   NaN            NaN                NaN     NaN       NaN   
1         NaN   NaN 

In [83]:
from sklearn.model_selection import train_test_split

# Step 1: Check the dataset columns
print("Dataset Columns:", combined_data.columns)

# Step 2: Extract features and targets
# Replace 'Hazard' with 'Hazard Type' based on your dataset structure
hazard_target = combined_data['Hazard Type']  # Target variable for hazard prediction
cause_target = combined_data['Cause']         # Target variable for cause prediction
features = combined_data[['Risk Level', 'Location']]  # Input feature columns

# Step 3: Verify shapes
print("Features shape:", features.shape)
print("Hazard Target shape:", hazard_target.shape)
print("Cause Target shape:", cause_target.shape)

# Step 4: Train-test split
# Split for hazard prediction
X_train_hazard, X_test_hazard, y_train_hazard, y_test_hazard = train_test_split(
    features, hazard_target, test_size=0.2, random_state=42
)

# Split for cause prediction
X_train_cause, X_test_cause, y_train_cause, y_test_cause = train_test_split(
    features, cause_target, test_size=0.2, random_state=42
)

# Step 5: Verify split sizes
print("X_train_hazard shape:", X_train_hazard.shape)
print("X_test_hazard shape:", X_test_hazard.shape)
print("y_train_hazard shape:", y_train_hazard.shape)
print("y_test_hazard shape:", y_test_hazard.shape)

print("X_train_cause shape:", X_train_cause.shape)
print("X_test_cause shape:", X_test_cause.shape)
print("y_train_cause shape:", y_train_cause.shape)
print("y_test_cause shape:", y_test_cause.shape)


Dataset Columns: Index(['Equipment ID', 'Failure Date', 'Type', 'Maintenance Date',
       'Failure Cause', 'Risk Level', 'Severity', 'Incident ID', 'Date',
       'Location', 'Hazard Type', 'Cause', 'Equipment Type',
       'Recommended Action', 'Outcome', 'Report ID', 'Timestamp', 'Miner ID',
       'Description'],
      dtype='object')
Features shape: (3000, 2)
Hazard Target shape: (3000,)
Cause Target shape: (3000,)
X_train_hazard shape: (2400, 2)
X_test_hazard shape: (600, 2)
y_train_hazard shape: (2400,)
y_test_hazard shape: (600,)
X_train_cause shape: (2400, 2)
X_test_cause shape: (600, 2)
y_train_cause shape: (2400,)
y_test_cause shape: (600,)


In [85]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest models for both tasks
model_hazard = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model_cause = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')


In [87]:
# Check unique values in 'Risk Level' and 'Location' to understand the mix of data types
print("Unique values in 'Risk Level':", features['Risk Level'].unique())
print("Unique values in 'Location':", features['Location'].unique())

Unique values in 'Risk Level': ['High' 'Medium' 'Low' 'Critical']
Unique values in 'Location': [nan 'Sector A' 'Sector D' 'Sector C' 'Sector B']


In [89]:

print("NaN in features:")
print(features.isnull().sum())  # Check features for NaN

print("NaN in hazard_target:")
print(hazard_target.isnull().sum())  # Check hazard_target for NaN

print("NaN in cause_target:")
print(cause_target.isnull().sum())  # Check cause_target for NaN


NaN in features:
Risk Level       0
Location      1000
dtype: int64
NaN in hazard_target:
1000
NaN in cause_target:
1000


In [91]:
# Remove rows with NaN in hazard_target
mask_hazard = ~hazard_target.isnull()
features_hazard = features[mask_hazard]
hazard_target_cleaned = hazard_target[mask_hazard]

# Remove rows with NaN in cause_target
mask_cause = ~cause_target.isnull()
features_cause = features[mask_cause]
cause_target_cleaned = cause_target[mask_cause]

# Ensure no NaN values remain
print("NaN in hazard_target_cleaned:", hazard_target_cleaned.isnull().sum())
print("NaN in cause_target_cleaned:", cause_target_cleaned.isnull().sum())

NaN in hazard_target_cleaned: 0
NaN in cause_target_cleaned: 0


In [93]:
import joblib

# Assuming model_hazard and model_cause are your trained models
  # Save hazard prediction model
joblib.dump(model_cause, 'cause_prediction_model.pkl')    # Save cause prediction model


['cause_prediction_model.pkl']

In [95]:
joblib.dump(model_hazard, 'hazard_prediction_model.pkl')

['hazard_prediction_model.pkl']

In [97]:
joblib.dump(model_hazard, '/Users/deepu/Desktop/Mini_Project/hazard_prediction_model.pkl')
joblib.dump(model_cause, '/Users/deepu/Desktop/Mini_Project/cause_prediction_model.pkl')

print("Models trained and saved successfully!")

Models trained and saved successfully!


In [99]:
import pandas as pd
import numpy as np

# Example DataFrame (replace with your dataset)
data = {
    'Risk Level': ['High', 'High', 'Medium', 'Low', 'High'],
    'Location': [np.nan, np.nan, np.nan, np.nan, np.nan]  # Simulating missing values
}

# Convert to DataFrame
features = pd.DataFrame(data)

# Define the categorical columns
categorical_columns = ['Risk Level', 'Location']

# Check for missing values in these columns
print("Missing values per column:")
print(features[categorical_columns].isnull().sum())

# Handle missing values in 'Location'
if 'Location' in features.columns:
    # Convert 'Location' column to string to prevent dtype mismatch warnings
    features['Location'] = features['Location'].astype(str)

    # Identify rows with missing values (originally NaN, now 'nan')
    missing_indices = features['Location'] == 'nan'

    # Count missing values
    missing_count = missing_indices.sum()

    if missing_count > 0:
        # Generate alphabetic sector labels
        sector_replacements = [f"Sector {chr(65 + i)}" for i in range(missing_count)]

        # Replace missing values dynamically
        features.loc[missing_indices, 'Location'] = sector_replacements

    print("Updated 'Location' Column:")
    print(features['Location'].head())
else:
    print("'Location' column not found in the dataset!")

# Display the first few rows of categorical columns
print("Categorical Columns Data:")
print(features[categorical_columns].head())


Missing values per column:
Risk Level    0
Location      5
dtype: int64
Updated 'Location' Column:
0    Sector A
1    Sector B
2    Sector C
3    Sector D
4    Sector E
Name: Location, dtype: object
Categorical Columns Data:
  Risk Level  Location
0       High  Sector A
1       High  Sector B
2     Medium  Sector C
3        Low  Sector D
4       High  Sector E
