In [19]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
import joblib


In [12]:
def load_data(filepath: str) -> pd.DataFrame:
    """Read the CSV file at ``filepath`` into a DataFrame.

    The file is decoded as Latin-1, so bytes that are not valid UTF-8
    (e.g. accented characters in names) do not abort the read.

    Parameters
    ----------
    filepath : str
        Location of the CSV file to read.

    Returns
    -------
    pd.DataFrame
        The parsed, otherwise unmodified table.
    """
    raw_frame = pd.read_csv(filepath, encoding='latin1')
    return raw_frame

def inspect_data(df: pd.DataFrame):
    """Print a quick textual overview of ``df`` to standard output.

    Three sections are written in order: the first rows (``head()``),
    the column/dtype summary (``info()``), and the descriptive statistics
    (``describe()``). Each section is followed by blank lines so the
    sections stay visually separated. Nothing is returned.
    """
    preview = df.head()
    print(preview, end=" \n\n")

    # info() writes straight to stdout itself; only the spacer is printed here.
    df.info()
    print(end="\n\n")

    summary = df.describe()
    print(summary, end=" \n\n")

def plot_eda(df: pd.DataFrame):
    """Show two exploratory figures: features vs. target, and correlations.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns 'annual Salary', 'credit card debt',
        'net worth' and 'car purchase amount'. Other columns, including
        non-numeric ones, are allowed.

    Notes
    -----
    Figure 1 is a single row of scatter plots with each feature on the
    x-axis and the target on the y-axis. Figure 2 is an annotated heatmap
    of the pairwise correlations between the numeric columns of ``df``.
    """
    feature_cols = ['annual Salary', 'credit card debt', 'net worth']
    target_col = 'car purchase amount'

    # Use x_vars/y_vars rather than vars: when `vars` is given seaborn uses it
    # for both axes and ignores y_vars, so the target would never be plotted.
    sns.pairplot(df, x_vars=feature_cols, y_vars=[target_col])
    plt.suptitle("Feature vs. Target")
    plt.show()

    plt.figure(figsize=(6, 5))
    # Correlate numeric columns only: text columns make DataFrame.corr()
    # raise on pandas >= 2.0.
    numeric_corr = df.select_dtypes(include='number').corr()
    sns.heatmap(numeric_corr, annot=True)
    plt.title("Correlation Matrix")
    plt.show()

def preprocess_and_engineer(df: pd.DataFrame) -> pd.DataFrame:
    """Impute, scale, and add polynomial & one-hot encoded features.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the numeric columns 'age', 'annual Salary',
        'credit card debt', 'net worth' and the categorical columns
        'country', 'gender'. Any other column is dropped.

    Returns
    -------
    pd.DataFrame
        Dense feature matrix indexed like ``df``; column names are the ones
        reported by the fitted transformer.

    Notes
    -----
    The transformer is fitted on the whole of ``df``. If the result is split
    into train and test sets afterwards, the imputation and scaling statistics
    have already seen the test rows.
    """
    # Column definitions
    num_cols = ['age', 'annual Salary', 'credit card debt', 'net worth']
    cat_cols = ['country', 'gender']

    numeric_steps = Pipeline([
        ('impute', SimpleImputer(strategy='median')),
        ('scale', StandardScaler()),
        ('poly', PolynomialFeatures(degree=2, include_bias=False))
    ])
    categorical_steps = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        # Encode the categories; this step was previously a second imputer,
        # which left raw strings in the output.
        # handle_unknown='ignore': unseen categories become all-zero rows.
        # drop='first': avoids perfectly collinear dummy columns.
        ('onehot', OneHotEncoder(handle_unknown='ignore', drop='first'))
    ])

    # sparse_threshold=0 forces a dense result, so it can be wrapped in a
    # DataFrame however many one-hot columns there are.
    preprocessor = ColumnTransformer([
        ('num', numeric_steps, num_cols),
        ('cat', categorical_steps, cat_cols)
    ], remainder='drop', sparse_threshold=0)

    features = preprocessor.fit_transform(df)
    return pd.DataFrame(
        features,
        columns=preprocessor.get_feature_names_out(),
        index=df.index,
    )

def train_and_evaluate(X, y, model_path="sales_model.pkl"):
    """Split data, tune and train a stacking regressor, print test metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Target values aligned with the rows of ``X``.
    model_path : str, default "sales_model.pkl"
        File the best estimator is written to with ``joblib.dump``.

    Returns
    -------
    The best estimator found by the search (a one-step ``Pipeline`` wrapping
    the stacking regressor). Only the model is saved and returned; any
    preprocessing applied to ``X`` beforehand is not part of the artifact.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Ensemble of Linear + Gradient Boosting
    stack = StackingRegressor(
        estimators=[('lr', LinearRegression()),
                    ('gb', GradientBoostingRegressor(random_state=42))],
        final_estimator=GradientBoostingRegressor(n_estimators=50, random_state=42),
        cv=5
    )
    # The single 'model' step gives the tuned parameters their 'model__' prefix.
    pipeline = Pipeline([('model', stack)])

    # Hyperparameter tuning: 2 x 2 candidates, and n_iter=4 samples them all.
    param_dist = {
        'model__final_estimator__learning_rate': [0.01, 0.1],
        'model__final_estimator__n_estimators': [100, 200]
    }
    search = RandomizedSearchCV(pipeline, param_dist, n_iter=4, cv=3,
                                scoring='neg_root_mean_squared_error',
                                random_state=42)
    search.fit(X_train, y_train)

    best = search.best_estimator_
    preds = best.predict(X_test)
    # Take the root explicitly: the `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    rmse = mean_squared_error(y_test, preds) ** 0.5
    r2 = r2_score(y_test, preds)
    print(f"Test RMSE: {rmse:.2f}, R²: {r2:.3f}")

    # Saving the model
    joblib.dump(best, model_path)
    return best


In [20]:
def main(filepath: str = "car_purchasing.csv"):
    """Fit a scaled linear-regression baseline and print its test metrics.

    Parameters
    ----------
    filepath : str, default "car_purchasing.csv"
        CSV file to load. It must contain the columns 'country',
        'customer name', 'customer e-mail' and 'car purchase amount'; every
        remaining column is used as a feature and must be numeric.

    Prints the mean squared error and the R² score on the 20% hold-out split.
    """
    # 1. Loading (reuse the notebook's loader instead of a second read_csv call)
    raw = load_data(filepath)

    # 2. Preparing X, y
    # One-hot encode country; name and e-mail are identifiers, not features.
    encoded = pd.get_dummies(raw, columns=['country'], drop_first=True)
    excluded = {'customer name', 'customer e-mail', 'car purchase amount'}
    feature_cols = [c for c in encoded.columns if c not in excluded]
    # Cast explicitly: get_dummies can emit bool columns, and mixing them with
    # numeric columns would otherwise give an object-dtype array.
    X = encoded[feature_cols].to_numpy(dtype=float)
    y = encoded['car purchase amount'].to_numpy()

    # 3. Train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # 4. Build & fit model (the scaler is fitted on the training rows only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    model = LinearRegression()
    model.fit(X_train_scaled, y_train)

    # 5. Evaluate
    X_test_scaled = scaler.transform(X_test)
    preds = model.predict(X_test_scaled)
    # The value is a mean *squared* error; it was previously labelled "MAE".
    print("MSE:", mean_squared_error(y_test, preds))
    print("R²:", r2_score(y_test, preds))

# Run the baseline when executed as a script or as a notebook cell.
if __name__ == "__main__":
    main()


MAE: 2.8176137271814223
R²: 0.9999999739044987
