In [9]:
import pandas as pd
df = pd.read_csv("Honda Engine Dataset.csv")

# Define columns
input_cols = [
    "Product Type", "Shaft Rotation", "Fuel Type",
    "Charging System", "Engine Type",'Engine Displacement (cc)','Maximum Power (HP)'

]
output_class = "Series"
output_regs = [
    "Bore (mm)", "Stroke (mm)",
    "Oil Capacity (L)", "Cylinders (qty.)",
    "Dry Weight (kg)", "Shaft Diameter (mm)",
]

X = df[input_cols]
y_class = df[output_class]
y_regs = df[output_regs] 


In [10]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
# Encode target for classification
le = LabelEncoder()
y_class_encoded = le.fit_transform(y_class)
# Preprocessing
categorical_cols = X.select_dtypes(include="object").columns.tolist()
preprocessor = ColumnTransformer([
    ("onehot", OneHotEncoder(handle_unknown="ignore"), categorical_cols)
], remainder="passthrough")


In [11]:
from sklearn.model_selection import train_test_split
# Train-test split
X_train, X_test, y_train_cls, y_test_cls, y_train_regs, y_test_regs = train_test_split(X, y_class_encoded, y_regs, test_size=0.2, random_state=42)

In [12]:
#classifier mechanism for series
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
classifiers = {
    "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=50, use_label_encoder=False, eval_metric="mlogloss"),
    "AdaBoost": AdaBoostClassifier(n_estimators=50),
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(),
    "Gradient Boost": GradientBoostingClassifier(n_estimators=50)
}



In [13]:
best_cls_name = None
best_cls_score = 0
best_cls_pipeline = None

for name, clf in classifiers.items():
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("classifier", clf)
    ])
    pipe.fit(X_train, y_train_cls)
    preds = pipe.predict(X_test)
    acc = accuracy_score(y_test_cls, preds)
    print(f"{name} Accuracy: {acc * 100:.2f}%")
    if acc > best_cls_score:
        best_cls_score = acc
        best_cls_name = name
        best_cls_pipeline = pipe
import joblib
joblib.dump(best_cls_pipeline, "series_classifier_model.joblib")
joblib.dump(le, "series_label_encoder.joblib")
print(f"\nBest Classifier: {best_cls_name}")

Random Forest Accuracy: 100.00%


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 100.00%
AdaBoost Accuracy: 78.27%


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 100.00%
Decision Tree Accuracy: 100.00%
Gradient Boost Accuracy: 100.00%

Best Classifier: Random Forest


In [14]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
regressors = {
    "Ridge": Ridge(),
    "Lasso": Lasso(),
    "Random Forest": RandomForestRegressor(n_estimators=50, random_state=42),
    "AdaBoost": AdaBoostRegressor(n_estimators=50, random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
     "XGBoost": XGBRegressor(n_estimators=50, random_state=42)
}

In [15]:
best_reg_name = None
best_reg_score = 0
best_reg_pipeline = None
for name, model in regressors.items():
    pipe = Pipeline([
        ("preprocess", preprocessor),
        ("regressor", MultiOutputRegressor(model))
    ])
    pipe.fit(X_train, y_train_regs)
    pred_regs = pipe.predict(X_test)
    r2 = r2_score(y_test_regs, pred_regs, multioutput="uniform_average")
    print(f"{name} R² Score: {r2 * 100:.2f}%")
    if r2 > best_reg_score:
        best_reg_score = r2
        best_reg_name = name
        best_reg_pipeline = pipe

joblib.dump(best_reg_pipeline, "best_regressor_model.joblib")
print(f"\nBest Regressor: {best_reg_name}")


Ridge R² Score: -0.04%
Lasso R² Score: -0.06%
Random Forest R² Score: 88.80%
AdaBoost R² Score: 0.19%
Decision Tree R² Score: 91.98%
XGBoost R² Score: 34.22%

Best Regressor: Decision Tree


In [16]:

import pandas as pd
import joblib
from sklearn.metrics.pairwise import euclidean_distances
regressor = joblib.load("best_regressor_model.joblib")
classifier = joblib.load("series_classifier_model.joblib")
label_enc = joblib.load("series_label_encoder.joblib")
df = pd.read_csv("Honda Engine Dataset.csv")

#test 
sample_input = pd.DataFrame([{
    "Product Type": "Lawn Mowers",
    "Shaft Rotation": "Vertical",
    "Fuel Type": "Petrol",
    "Charging System": "12V DC",
    "Engine Type": "OHV",
    "Maximum Power (HP)": 5,
    "Engine Displacement (cc)": 97
}])

input_cols = [
    "Product Type", "Shaft Rotation", "Fuel Type",
    "Charging System", "Engine Type",
    "Maximum Power (HP)", "Engine Displacement (cc)"
]
output_regs = [
    "Bore (mm)", "Stroke (mm)", "Oil Capacity (L)",
    "Cylinders (qty.)", "Dry Weight (kg)", "Shaft Diameter (mm)"
]
spec_cols = output_regs.copy()
# hard filters on categorical fields
filtered = df.copy()
for col in ["Product Type", "Shaft Rotation", "Fuel Type", "Charging System", "Engine Type"]:
    val = sample_input.at[0, col]
    filtered = filtered[filtered[col] == val]

print("Available categorical options in dataset:")
for col in ["Product Type", "Shaft Rotation", "Fuel Type", "Charging System", "Engine Type"]:
    print(f"{col}: {df[col].dropna().unique().tolist()}")

print(f"\nRows after hard categorical filters: {len(filtered)}")
# Recommendation if Engine remain
if filtered.empty:
    print(" No engine matches the hard filters.")
else:
    # Predict specifications
    predicted_specs = regressor.predict(sample_input[input_cols])[0]

    # Compute distances on the 6 predicted numeric outputs
    distances = euclidean_distances(filtered[spec_cols], [predicted_specs])
    filtered["Distance"] = distances

    # Recommend top 3 engines
    top3 = filtered.sort_values("Distance").head(3).reset_index(drop=True)

    # Predict Series
    predicted_series = classifier.predict(sample_input[input_cols])[0]
    series_name = label_enc.inverse_transform([predicted_series])[0]

    # Display result
    print(f"\nPredicted Series: {series_name}")
    print("\nTop 3 Engine Recommendations:\n")
    display_cols = input_cols + ["Series"] + output_regs
    print(top3[display_cols])




Available categorical options in dataset:
Product Type: ['Tractors', 'Tillers/Cultivators', 'Lawn Mowers', 'Commercial Lawn and Garden']
Shaft Rotation: ['Vertical', 'Horizontal']
Fuel Type: ['Petrol', 'Diesel']
Charging System: ['12V DC', 'Alternator']
Engine Type: ['OHV', '4-stroke', 'V-twin', '2-stroke']

Rows after hard categorical filters: 45

Predicted Series: GX Commercial

Top 3 Engine Recommendations:

  Product Type Shaft Rotation Fuel Type Charging System Engine Type  \
0  Lawn Mowers       Vertical    Petrol          12V DC         OHV   
1  Lawn Mowers       Vertical    Petrol          12V DC         OHV   
2  Lawn Mowers       Vertical    Petrol          12V DC         OHV   

   Maximum Power (HP)  Engine Displacement (cc)         Series  Bore (mm)  \
0                 5.0                        97  GX Commercial     57.523   
1                 5.0                        97  GX Commercial     57.538   
2                 5.0                        97  GX Commercial     57