In [6]:
import csv
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest, f_classif
from collections import Counter

# -----------------------------
# Dynamic dataset loader
# -----------------------------
def load_dataset_auto_sep(path):
    with open(path, 'r', encoding='utf-8') as f:
        sample = f.read(2048)
        f.seek(0)
        dialect = csv.Sniffer().sniff(sample)
        sep = dialect.delimiter
        print(f"ℹ️ Detected separator: '{sep}'")
    return pd.read_csv(path, sep=sep)

# -----------------------------
# Load dataset
# -----------------------------
data_path = 'Datasets/car_sales_data.csv'  # Change this dynamically if needed
df = load_dataset_auto_sep(data_path)
df = df.fillna(np.nan)

print(f"✅ Loaded dataset shape: {df.shape}")
print(f"✅ Columns: {list(df.columns)}")

# -----------------------------
# Automatically set features and target
# -----------------------------
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1].copy()
print(f"\n✅ Selected Features (X): {list(X.columns)}")
print(f"✅ Selected Target (y): {y.name}")

# -----------------------------
# Safely drop likely ID columns
# -----------------------------
id_cols = [col for col in X.columns if X[col].nunique() == len(X) and X[col].dtype in [np.int64, object]]
if id_cols:
    remaining_cols = [col for col in X.columns if col not in id_cols]
    if len(remaining_cols) == 0:
        print(f"⚠️ Skipping drop: Dropping {id_cols} would leave no features.")
    else:
        print(f"🚨 Dropping likely ID columns: {id_cols}")
        X = X.drop(columns=id_cols)

# -----------------------------
# Drop leakage columns (Regression only)
# -----------------------------
# if y.dtype in [np.float64, np.int64]:
#     corr = X.corrwith(y).abs()
#     leak_cols = corr[corr > 0.98].index.tolist()
#     remaining_cols = [col for col in X.columns if col not in leak_cols]
#     if leak_cols:
#         if len(remaining_cols) == 0:
#             print(f"⚠️ Skipping drop: Dropping {leak_cols} would leave no features.")
#         else:
#             print(f"🚨 Dropping leakage columns: {leak_cols}")
#             X = X.drop(columns=leak_cols)

# -----------------------------
# Check for empty feature set
# -----------------------------
if X.shape[1] == 0:
    print("❌ No features left after preprocessing steps. Restoring all columns except the target.")
    X = df.iloc[:, :-1].copy()

# Dynamically split numeric pair columns
for col in X.select_dtypes(include='object').columns:
    if X[col].str.match(r'^\d+/\d+$').all():
        print(f"🔄 Splitting numeric pair column: {col}")
        split_df = X[col].str.split('/', expand=True)
        X[f"{col}_1"] = pd.to_numeric(split_df[0], errors='coerce')
        X[f"{col}_2"] = pd.to_numeric(split_df[1], errors='coerce')
        X = X.drop(columns=[col])

if X.shape[1] == 0:
    print("❌ No features left after numeric pair splitting. Restoring all columns except the target.")
    X = df.iloc[:, :-1].copy()

# -----------------------------
# Detect numeric and categorical columns
# -----------------------------
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X.select_dtypes(include=[object, 'category', 'bool']).columns.tolist()

# -----------------------------
# Check target distribution
# -----------------------------
print("\n🔍 Target Class Distribution:")
print(y.value_counts(normalize=True))

# -----------------------------
# Detect high-cardinality categorical columns
# -----------------------------
high_card_cols = [col for col in cat_cols if X[col].nunique() > 50]
low_card_cols = [col for col in cat_cols if col not in high_card_cols]
if high_card_cols:
    print(f"⚡ High cardinality columns detected: {high_card_cols}. Using OrdinalEncoder for these.")

# -----------------------------
# Dynamic scaling choice
# -----------------------------
scaler_choice = 'StandardScaler' if (num_cols and X[num_cols].var().max() > 1) else 'MinMaxScaler'

# -----------------------------
# Train/test split
# -----------------------------
stratify_arg = y if (y.nunique() <= 20 and y.dtype != float) else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_arg
)

if len(X.columns) == 0:
    raise ValueError("❌ No features available after preprocessing steps.")

print("✅ Features available for modeling:", list(X.columns))

# -----------------------------
# Build preprocessing pipeline
# -----------------------------
transformers = []
if num_cols:
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler() if scaler_choice == 'StandardScaler' else MinMaxScaler())
    ])
    transformers.append(('num', num_pipeline, num_cols))

if low_card_cols:
    cat_low_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=True))  # Sparse for memory savings
    ])
    transformers.append(('cat_low', cat_low_pipeline, low_card_cols))

if high_card_cols:
    cat_high_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OrdinalEncoder())
    ])
    transformers.append(('cat_high', cat_high_pipeline, high_card_cols))

preprocessor = ColumnTransformer(transformers, remainder='drop')

# -----------------------------
# Fit and transform
# -----------------------------
X_train_proc = preprocessor.fit_transform(X_train)
X_test_proc = preprocessor.transform(X_test)

# Get feature names for the processed data
feature_names = []
if num_cols:
    feature_names.extend(num_cols)
if low_card_cols:
    # Get feature names from OneHotEncoder
    encoder = preprocessor.named_transformers_['cat_low'].named_steps['encoder']
    feature_names.extend(encoder.get_feature_names_out(low_card_cols))
if high_card_cols:
    feature_names.extend(high_card_cols)

# Convert to numpy array for modeling
X_train_proc = np.asarray(X_train_proc)
X_test_proc = np.asarray(X_test_proc)

# Apply feature selection only if feature space is large
if X_train_proc.shape[1] > 100:
    feature_selector = SelectKBest(score_func=f_classif, k=20)
    X_train_proc = feature_selector.fit_transform(X_train_proc, y_train)
    X_test_proc = feature_selector.transform(X_test_proc)
    print("Applied feature selection: Top 20 features kept.")
else:
    print("Feature selection skipped (small feature space).")

print("\nProcessed Features Sample:")
display(pd.DataFrame(X_train_proc, columns=feature_names).head())

# -----------------------------
# Save processed datasets
# -----------------------------
os.makedirs('output', exist_ok=True)
train_df = pd.DataFrame(X_train_proc, columns=feature_names)
train_df[y.name] = y_train.reset_index(drop=True)
train_df.to_csv('output/train_processed.csv', index=False)

test_df = pd.DataFrame(X_test_proc, columns=feature_names)
test_df[y.name] = y_test.reset_index(drop=True)
test_df.to_csv('output/test_processed.csv', index=False)

print("\n✅ Processed train/test datasets saved to /content/ (no automatic download).")

# -----------------------------
# Model selection & training
# -----------------------------
models_classification = {
    "RandomForestClassifier": RandomForestClassifier(
        n_estimators=100, max_depth=5, class_weight='balanced', random_state=42),
    "GradientBoostingClassifier": GradientBoostingClassifier(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "LogisticRegression": LogisticRegression(
        max_iter=1000, solver='liblinear', random_state=42)
}

models_regression = {
    "RandomForestRegressor": RandomForestRegressor(
        n_estimators=100, max_depth=5, random_state=42),
    "GradientBoostingRegressor": GradientBoostingRegressor(
        n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
    "LinearRegression": LinearRegression()
}

task_type = 'classification' if y.dtype == object or (y.nunique() <= 20 and y.dtype != float) else 'regression'
models_to_train = models_classification if task_type == 'classification' else models_regression
print(f"\nDetected model type: {task_type}")
print("Training all models to recommend the best one...")

cv_strategy = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) if task_type == 'classification' else 3

performance = {}
for name, model in models_to_train.items():
    scores = cross_val_score(model, X_train_proc, y_train, cv=cv_strategy,
                             scoring='accuracy' if task_type == 'classification' else 'r2')
    performance[name] = scores.mean()

best_model_name = max(performance, key=performance.get)
print("\nModel performance comparison:")
for k, v in performance.items():
    print(f"{k}: {v:.4f}")

print(f"\n✅ Recommended model: {best_model_name} (best balance of speed and accuracy)")


ℹ️ Detected separator: ','
✅ Loaded dataset shape: (50000, 7)
✅ Columns: ['Manufacturer', 'Model', 'Engine size', 'Fuel type', 'Year of manufacture', 'Mileage', 'Price']

✅ Selected Features (X): ['Manufacturer', 'Model', 'Engine size', 'Fuel type', 'Year of manufacture', 'Mileage']
✅ Selected Target (y): Price

🔍 Target Class Distribution:
Price
1610     0.00028
805      0.00026
1384     0.00026
2033     0.00024
1559     0.00024
          ...   
36387    0.00002
8197     0.00002
35058    0.00002
8182     0.00002
31112    0.00002
Name: proportion, Length: 25045, dtype: float64
✅ Features available for modeling: ['Manufacturer', 'Model', 'Engine size', 'Fuel type', 'Year of manufacture', 'Mileage']
✅ Features available for modeling: ['Manufacturer', 'Model', 'Engine size', 'Fuel type', 'Year of manufacture', 'Mileage']


IndexError: tuple index out of range