In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV

In [15]:

# Identify common columns between the two dataframes
common_columns = installer_df.columns.intersection(involver_df.columns).tolist()

# Attempt to merge using a subset of key columns
key_columns = ['Site', 'Vessel_Name', 'Wo_No']

# Merge the dataframes using the selected key columns
merged_df = pd.merge(installer_df, involver_df, on=key_columns, how='inner')

# Select the columns for features and target
feature_columns = ['Group', 'Symptom', 'Error_Cause', 'Cause_Details', 'Error_Class', 'Discovery', 'Completion_Note', 'Action_Taken', 'Work_Description', 'Directive']
target_column = 'EBS1'


In [16]:

# Filter the dataframe for the selected columns
installer_df_filtered = installer_df[feature_columns + [target_column]].dropna()

# Encode categorical variables
label_encoders = {}
for column in feature_columns:
    le = LabelEncoder()
    installer_df_filtered[column] = le.fit_transform(installer_df_filtered[column].astype(str))
    label_encoders[column] = le

    

In [17]:

# Encode the target column
target_le = LabelEncoder()
installer_df_filtered[target_column] = target_le.fit_transform(installer_df_filtered[target_column].astype(str))

# Split the data into training and testing sets
X = installer_df_filtered[feature_columns]
y = installer_df_filtered[target_column]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Define the parameter grid for GridSearchCV
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

In [18]:

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and the best model
best_params = grid_search.best_params_
best_rf_model = grid_search.best_estimator_

# Make predictions with the best model
best_y_pred = best_rf_model.predict(X_test)

# Define common_classes based on unique classes in y_train and y_test
common_classes = list(set(y_train.unique()).intersection(set(y_test.unique())))
common_classes.sort()  # Ensure the classes are sorted for consistency


Fitting 3 folds for each of 216 candidates, totalling 648 fits




In [19]:

# Evaluate the best model with the filtered common classes
best_accuracy = accuracy_score(y_test, best_y_pred)
best_report = classification_report(y_test, best_y_pred, labels=common_classes, target_names=target_le.inverse_transform(common_classes))

print("Best Parameters:", best_params)
print("Best Accuracy:", best_accuracy)
print("Classification Report:\n", best_report)

Best Parameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Accuracy: 0.6617647058823529
Classification Report:
                                                      precision    recall  f1-score   support

                                 Cargo Gear System        0.73      1.00      0.84         8
                                    Cooling System        0.60      0.67      0.63         9
                        Electric Propulsion System        1.00      0.91      0.95        11
               Emergency Power Distribution System        1.00      0.67      0.80         3
               Fire Detection and Fire Alarm System       0.00      0.00      0.00         1
                                  Fire Main System        1.00      0.50      0.67         2
                                 Fresh Water System       0.00      0.00      0.00         2
                                Fresh Water System        0.73      1.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
