In [1]:
import pandas as pd
import numpy as np
import pickle
import re
from tensorflow import keras
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix

In [5]:
import pickle
from tensorflow import keras

# Artifact locations (absolute paths; adjust them when running on another machine).
MODEL_PATH = r"C:\Users\ASUS\Desktop\cars_transmission\notebooks\final_rmsprop_model.keras"
PIPELINE_PATH = r"C:\Users\ASUS\Desktop\cars_transmission\notebooks\preprocessing_pipeline.pkl"

# Restore the saved Keras model.
model = keras.models.load_model(MODEL_PATH)

# Restore the pickled preprocessing objects.
# NOTE: pickle.load can execute arbitrary code stored in the file, so only
# load artifacts that come from a trusted source.
with open(PIPELINE_PATH, "rb") as f:
    transformers = pickle.load(f)

# Show which objects the pickle contains.
print("Available keys in loaded transformers:", transformers.keys())

# Pull the individual preprocessing objects out of the loaded dict.
encoders, imputer, scaler = (
    transformers["encoders"],
    transformers["imputer"],
    transformers["scaler"],
)

# Report the type of each loaded object as a quick sanity check.
for caption, loaded in (
    ("Imputer type:", imputer),
    ("Scaler type:", scaler),
    ("Encoders type:", encoders),
):
    print(caption, type(loaded))


Available keys in loaded transformers: dict_keys(['encoders', 'imputer', 'scaler'])
Imputer type: <class 'sklearn.compose._column_transformer.ColumnTransformer'>
Scaler type: <class 'sklearn.compose._column_transformer.ColumnTransformer'>
Encoders type: <class 'dict'>


In [6]:

# 2. Load test data (absolute path; adjust it when running on another machine)
unseen_df = pd.read_csv("C:\\Users\\ASUS\\Desktop\\cars_transmission\\notebooks\\cars_unseen_data.csv")

# 3. Clean and prepare target
# Normalise the label text, then map it to 0 (manual) / 1 (automatic).
# Labels outside the mapping become NaN and are dropped on the next line.
unseen_df['transmissionType'] = (
    unseen_df['transmissionType']
    .str.strip()
    .str.lower()
    .map({'manual': 0, 'automatic': 1})
)
unseen_df = unseen_df.dropna(subset=['transmissionType'])

# Split the frame into the target series and the remaining columns.
y_test = unseen_df['transmissionType'].copy()
X_test_raw = unseen_df.drop(columns=['transmissionType'])

# 4. Feature selection
def wanted_features(df):
    """Return only the feature columns used downstream, in a fixed order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns listed below.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the listed columns, in the listed order.

    Raises
    ------
    KeyError
        If any listed column is missing from ``df``.
    """
    selected = (
        'fuel_type_new',
        'No of Cylinder',
        'Displacement',
        'Max Torque',
        'Gear Box',
        'Drive Type',
        'Seating Capacity',
        'bt',
        'dynx_totalvalue_x',
        'model_year_new',
        'engine_cc',
        'Top Speed',
        'Max Power',
        'mileage_new',
        'km',
    )
    return df[list(selected)]

# Restrict the raw test frame to the columns listed in wanted_features.
X_test = wanted_features(X_test_raw)

# 5. Feature engineering
def feature_engineering(df):
    """Convert text-valued feature columns to numbers on a copy of ``df``.

    * ``km``: commas are removed, then the first run of digits is taken as
      a float.
    * ``mileage_new``, ``Max Power``, ``Top Speed``, ``Max Torque``: the first
      decimal number found in the text is taken as a float.
    * ``engine_cc``: if exactly two digit runs are found, their mean is used;
      otherwise the first digit run; NaN when there are no digits.
    * ``No of Cylinder``: forced to 0 for rows whose ``fuel_type_new`` is
      'electric' (case-insensitive), then coerced to int with
      unparseable/missing values set to 0.
    * ``Seating Capacity``: coerced to int with unparseable/missing values
      set to 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    out = df.copy()

    # Odometer reading: strip the thousands separators before extracting digits.
    km_text = out['km'].astype(str).str.replace(',', '')
    out['km'] = km_text.str.extract(r'(\d+)')[0].astype(float)

    # Columns that carry a leading decimal number followed by a unit.
    for measured in ('mileage_new', 'Max Power', 'Top Speed', 'Max Torque'):
        as_text = out[measured].astype(str)
        out[measured] = as_text.str.extract(r'(\d+\.?\d*)')[0].astype(float)

    def parse_engine(cc_str):
        """Return the engine size: mean of two digit runs, else the first, else NaN."""
        digit_runs = re.findall(r'(\d+)', str(cc_str))
        if len(digit_runs) == 2:
            return (int(digit_runs[0]) + int(digit_runs[1])) / 2
        if digit_runs:
            return int(digit_runs[0])
        return np.nan

    out['engine_cc'] = out['engine_cc'].apply(parse_engine)

    # Electric rows get a cylinder count of 0 before the numeric coercion.
    is_electric = out['fuel_type_new'].str.lower() == 'electric'
    out.loc[is_electric, 'No of Cylinder'] = 0

    # Integer-valued columns: anything unparseable or missing becomes 0.
    for counted in ('No of Cylinder', 'Seating Capacity'):
        out[counted] = pd.to_numeric(out[counted], errors='coerce').fillna(0).astype(int)

    return out

# Parse the text-valued feature columns into numbers (returns a new frame).
X_test_fe = feature_engineering(X_test)

# 6. Cleaning
def clean_impute(df):
    """Normalise the categorical columns and median-fill numeric gaps.

    Works on a copy of ``df``:

    * ``Gear Box``: reduced to its first run of digits (as text); entries
      without digits become 'Unknown'.
    * ``Drive Type``: mapped to 'FWD', 'RWD', 'AWD' or 'Unknown' by
      case-insensitive substring matching.
    * ``bt``: missing values become 'Unknown'.
    * Every numeric column (any width, e.g. float32 as well as float64) that
      contains missing values is filled with that column's own median.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Gear Box``, ``Drive Type`` and ``bt`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, cleaned frame; the input is not modified.
    """
    df = df.copy()

    # Keep the gear count only; text without any digit falls back to 'Unknown'.
    df['Gear Box'] = df['Gear Box'].astype(str).str.extract(r'(\d+)')[0].fillna('Unknown')

    def normalize_drive_type(dt):
        """Collapse free-text drive descriptions into FWD / RWD / AWD / Unknown."""
        dt = str(dt).strip().lower()
        if 'fwd' in dt or 'front' in dt:
            return 'FWD'
        elif 'rwd' in dt or 'rear' in dt:
            return 'RWD'
        elif 'awd' in dt or '4wd' in dt or 'all' in dt:
            return 'AWD'
        return 'Unknown'

    # normalize_drive_type stringifies its input and always returns a label,
    # so missing values already come back as 'Unknown' without a fillna.
    df['Drive Type'] = df['Drive Type'].apply(normalize_drive_type)
    df['bt'] = df['bt'].fillna('Unknown')

    # Select numeric columns by kind rather than by exact width: listing only
    # float64/int64 would silently skip float32/int32 columns.
    numeric_cols = df.select_dtypes(include='number').columns
    for col in numeric_cols:
        if df[col].isnull().any():
            # The median is computed on the frame passed in, not on training data.
            df[col] = df[col].fillna(df[col].median())
    return df

# Normalise the categorical columns and fill numeric gaps (returns a new frame).
X_test_cleaned = clean_impute(X_test_fe)

# 7. One-hot encoding support
def one_hot_encoding(data, column, transformer_one_hot_encoder=None):
    """Replace ``column`` in ``data`` with its one-hot encoded columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the categorical column to encode.
    transformer_one_hot_encoder : object
        An already-fitted encoder exposing ``transform`` and
        ``get_feature_names_out``. Required despite the ``None`` default.

    Returns
    -------
    pandas.DataFrame
        ``data`` without ``column``, followed by the encoded columns, on the
        same row index as ``data``.

    Raises
    ------
    ValueError
        If no encoder is supplied.
    """
    if transformer_one_hot_encoder is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"A fitted encoder is required to one-hot encode column {column!r}."
        )

    encoded = transformer_one_hot_encoder.transform(data[[column]])
    # Encoders may return a sparse matrix; densify it so the DataFrame
    # constructor receives a regular 2-D array.
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    encoded_df = pd.DataFrame(
        encoded,
        columns=transformer_one_hot_encoder.get_feature_names_out([column]),
        index=data.index,
    )
    remaining = data.drop(columns=[column])
    return pd.concat([remaining, encoded_df], axis=1)

# 8. Final preprocessing
def preprocess_test_data(df, encoders, imputer, scaler):
    """Apply the fitted imputer, scaler and one-hot encoders to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned feature frame containing the four categorical columns listed
        in ``cat_cols`` plus the remaining (numeric) columns.
    encoders : dict
        Maps each categorical column name to its fitted encoder.
    imputer, scaler : objects with a ``transform`` method
        Fitted preprocessing steps, applied in that order.

    Returns
    -------
    pandas.DataFrame
        Transformed frame with the categorical columns replaced by their
        one-hot columns, on the same row index as ``df``.
    """
    cat_cols = ['fuel_type_new', 'Gear Box', 'Drive Type', 'bt']
    num_cols = [col for col in df.columns if col not in cat_cols]

    # NOTE(review): assumes both transformers emit the numeric columns first
    # (in input order) followed by the categorical ones -- confirm against
    # how the imputer/scaler were fitted.
    ordered_cols = num_cols + cat_cols

    # Keep the caller's row labels: the transformers return bare arrays, and
    # rebuilding frames without an index would discard the link to the
    # original rows (and to the target series split off earlier).
    row_index = df.index

    imputed = pd.DataFrame(imputer.transform(df), columns=ordered_cols, index=row_index)
    result = pd.DataFrame(scaler.transform(imputed), columns=ordered_cols, index=row_index)

    for col in cat_cols:
        result = one_hot_encoding(result, col, transformer_one_hot_encoder=encoders[col])
    return result

# Run the fitted preprocessing steps, then cast every column to float64.
X_test_final = preprocess_test_data(X_test_cleaned, encoders, imputer, scaler).astype('float64')

# 9. Predict and evaluate
# Scores at or above 0.5 are labelled 1 (automatic), the rest 0 (manual).
y_pred_probs = model.predict(X_test_final)
above_threshold = y_pred_probs >= 0.5
y_pred = above_threshold.astype(int)

# 10. Evaluation
print("\n📋 Classification Report on Unseen Data:\n")
report_text = classification_report(y_test, y_pred, target_names=['Manual', 'Automatic'])
print(report_text)

# Optional: Save predictions (actual vs. predicted label per row)
prediction_columns = {"Actual": y_test, "Predicted": y_pred.flatten()}
df_output = pd.DataFrame(prediction_columns)
df_output.to_csv("unseen_predictions.csv", index=False)


[1m119/119[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step

📋 Classification Report on Unseen Data:

              precision    recall  f1-score   support

      Manual       0.92      0.97      0.94      2861
   Automatic       0.89      0.72      0.80       920

    accuracy                           0.91      3781
   macro avg       0.90      0.85      0.87      3781
weighted avg       0.91      0.91      0.91      3781

