In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
import warnings
import tkinter as tk
from tkinter import ttk
import pickle
import os

# Silence every warning category for the rest of the session so library
# notices do not clutter the printed report.
warnings.simplefilter('ignore')

# One seed shared by every randomised step, so repeated runs match
RANDOM_STATE = 42

def load_data(file_path, encodings=('utf-8', 'latin-1')):
    """Read a CSV file into a DataFrame, retrying with fallback encodings.

    Each encoding is tried in order and the first one that decodes the file
    wins.  Only decoding failures trigger a retry: any other problem (missing
    file, permission error, malformed CSV) is raised immediately, because
    switching the encoding cannot fix it and retrying would only bury the
    real cause.

    Args:
        file_path: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        encodings: Encoding names to try, in order.  The default ends with
            ``latin-1``, which maps every possible byte to a character and
            therefore acts as a guaranteed last resort for decoding.

    Returns:
        The DataFrame produced by ``pd.read_csv``.

    Raises:
        ValueError: If ``encodings`` is empty, or if none of the encodings
            could decode the file (the last decode error is chained).
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    if not encodings:
        raise ValueError("At least one encoding must be supplied")

    last_error = None
    for encoding in encodings:
        print(f"Trying {encoding} encoding...")
        try:
            df = pd.read_csv(file_path, encoding=encoding)
        except UnicodeError as e:
            # Wrong encoding for this file: remember why and try the next one
            print(f"Failed with encoding {encoding}: {str(e)}")
            last_error = e
        else:
            print(f"Success! File loaded with {encoding} encoding")
            return df

    raise ValueError(
        "Failed to load file with any of the attempted encodings"
    ) from last_error

# Obtain the dataset: read the CSV when possible, otherwise synthesise one
try:
    df = load_data('Downloads/car_purchasing.csv')
except Exception as load_error:
    print(f"Error loading data: {str(load_error)}")
    print("Using sample data instead...")
    # Seed NumPy so the synthetic rows are the same on every run
    np.random.seed(RANDOM_STATE)
    n_samples = 500

    # Identity columns are deterministic; the remaining columns are random
    # draws.  The draw order below fixes the generated values, so it must
    # stay: country, gender, age, salary, debt, net worth, then noise.
    customer_ids = range(1, n_samples + 1)
    customer_names = [f'Customer_{i}' for i in customer_ids]
    customer_emails = [f'customer_{i}@example.com' for i in customer_ids]
    countries = np.random.choice(['United States', 'Canada', 'UK', 'Germany', 'France'], n_samples)
    genders = np.random.choice(['Male', 'Female'], n_samples)
    ages = np.random.randint(18, 70, n_samples)
    salaries = np.random.normal(60000, 20000, n_samples)
    card_debts = np.random.normal(5000, 3000, n_samples)
    net_worths = np.random.normal(500000, 200000, n_samples)

    df = pd.DataFrame({
        'customer name': customer_names,
        'customer e-mail': customer_emails,
        'country': countries,
        'gender': genders,
        'age': ages,
        'annual Salary': salaries,
        'credit card debt': card_debts,
        'net worth': net_worths,
    })

    # Target = weighted combination of the numeric features plus Gaussian
    # noise, scaled by 1000
    target_noise = np.random.normal(0, 5, n_samples)
    weighted_features = (
        0.1 * df['age'] +
        0.4 * df['annual Salary'] / 10000 +
        0.1 * df['credit card debt'] / 1000 +
        0.3 * df['net worth'] / 100000
    )
    df['car purchase amount'] = (weighted_features + target_noise) * 1000

# Display basic information about the dataset
print("\n===== DATASET OVERVIEW =====")
print("\nFirst 5 rows:")
print(df.head())

print("\nDataset info:")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()

print("\nDescriptive statistics:")
print(df.describe())

print("\nMissing values check:")
# Count once and reuse for both the report and the decision below
missing_per_column = df.isnull().sum()
print(missing_per_column)

# Handle missing values if any
if missing_per_column.sum() > 0:
    print("Handling missing values...")
    df = df.dropna()  # Or use imputation strategies

# Derive extra features from the raw columns
print("\n===== FEATURE ENGINEERING =====")

# Both ratios share the same denominator; the +1 keeps a zero salary from
# causing a division by zero.
salary_plus_one = df['annual Salary'] + 1
df['debt_to_income_ratio'] = df['credit card debt'] / salary_plus_one
df['net_worth_to_salary_ratio'] = df['net worth'] / salary_plus_one

# Bucket ages into labelled groups
age_bin_edges = [0, 25, 35, 45, 55, 65, 100]
age_bin_labels = ['Under 25', '25-35', '35-45', '45-55', '55-65', 'Over 65']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

# Preview the new columns
engineered_columns = ['debt_to_income_ratio', 'net_worth_to_salary_ratio', 'age_group']
print("\nEngineered features (first 5 rows):")
print(df[engineered_columns].head())

# Build the feature matrix, target vector, preprocessing and train/test split
print("\n===== MODEL PREPARATION =====")

# Target column, and everything that is neither an identifier, the target
# itself, nor the display-only age bucket
y = df['car purchase amount']
X = df.drop(columns=['customer name', 'customer e-mail', 'car purchase amount', 'age_group'])

# Column groups routed to the two preprocessing branches
categorical_features = ['country', 'gender']
numerical_features = ['age', 'annual Salary', 'credit card debt', 'net worth',
                      'debt_to_income_ratio', 'net_worth_to_salary_ratio']

# Numeric branch: median imputation followed by standardisation
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation followed by one-hot encoding;
# categories unseen during fitting are ignored instead of raising
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_steps, numerical_features),
        ('cat', categorical_steps, categorical_features),
    ])

# Hold out 20% of the rows for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

# Candidate regressors, each configured with regularisation / capacity limits
models = {
    "Ridge Regression": Ridge(alpha=1.0, random_state=RANDOM_STATE),
    "Lasso Regression": Lasso(alpha=0.1, random_state=RANDOM_STATE),
    "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=RANDOM_STATE),
    "Random Forest": RandomForestRegressor(
        n_estimators=100, max_depth=10, min_samples_leaf=5, random_state=RANDOM_STATE
    ),
    "XGBoost": xgb.XGBRegressor(
        n_estimators=100, max_depth=5, learning_rate=0.05,
        subsample=0.8, colsample_bytree=0.8, random_state=RANDOM_STATE
    ),
}

# Per-model metrics and fitted pipelines, keyed by model name
results = {}

print("\n===== MODEL TRAINING WITH CROSS-VALIDATION =====")
kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

for name, model in models.items():
    print(f"\nEvaluating {name} with cross-validation...")

    # Preprocessing and estimator chained so both are fitted per CV fold
    pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # 5-fold cross-validated R² on the training portion only
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='r2')
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    print(f"Cross-validation R² scores: {cv_scores}")
    print(f"Mean CV R² score: {cv_mean:.4f}")
    print(f"Standard deviation: {cv_std:.4f}")

    # Refit on the whole training set, then score the held-out test set
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {
        'RMSE': rmse,
        'MAE': mae,
        'R2': r2,
        'CV_R2_mean': cv_mean,
        'CV_R2_std': cv_std,
        'pipeline': pipeline,
    }

    print(f"Test set - RMSE: {rmse:.2f}, MAE: {mae:.2f}, R2: {r2:.4f}")

    # A training score well above the test score hints at overfitting
    train_r2 = r2_score(y_train, pipeline.predict(X_train))
    r2_gap = train_r2 - r2
    print(f"Training R²: {train_r2:.4f}, Test R²: {r2:.4f}")
    print(f"Difference (Train - Test): {r2_gap:.4f}")

    if r2_gap > 0.1:
        print("WARNING: Possible overfitting detected!")

# Pick the winner by its mean cross-validation R².  Choosing on the test-set
# score would make the test set part of model selection, so the test metrics
# reported below would no longer be an unbiased estimate of performance.
best_model_name = max(results, key=lambda x: results[x]['CV_R2_mean'])
print(f"\nBest performing model: {best_model_name}")
print(f"R² on test set: {results[best_model_name]['R2']:.4f}")
print(f"Cross-validation R²: {results[best_model_name]['CV_R2_mean']:.4f} ± {results[best_model_name]['CV_R2_std']:.4f}")

# Fitted pipeline (preprocessing + estimator) of the selected model
final_model = results[best_model_name]['pipeline']

# Save the model for use in the GUI
model_filename = 'car_purchase_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(final_model, file)
print(f"\nModel saved as {model_filename}")

# Evaluate the final model on the held-out test set, which played no part in
# choosing it
y_pred_final = final_model.predict(X_test)
final_rmse = np.sqrt(mean_squared_error(y_test, y_pred_final))
final_mae = mean_absolute_error(y_test, y_pred_final)
final_r2 = r2_score(y_test, y_pred_final)

print("\n===== FINAL MODEL PERFORMANCE =====")
print(f"Final model - RMSE: {final_rmse:.2f}, MAE: {final_mae:.2f}, R2: {final_r2:.4f}")

# Create a GUI application for prediction
print("\n===== CREATING GUI INTERFACE =====")

def create_gui_application():
    """Open a Tkinter window that predicts a car purchase amount.

    The form collects age, annual salary, credit card debt, net worth,
    gender and country.  "Predict" unpickles the pipeline stored at the
    module-level ``model_filename``, derives the two ratio features from the
    entered values and shows the prediction, or an error message, in the
    result label.  "Reset" restores the default form values.  The call blocks
    in ``root.mainloop()`` until the window is closed.
    """
    # --- window ---------------------------------------------------------
    root = tk.Tk()
    root.title("Car Purchase Amount Predictor")
    root.geometry("600x500")
    root.configure(bg="#f0f0f0")

    heading = tk.Label(root, text="Car Purchase Amount Prediction",
                       font=("Arial", 16, "bold"), bg="#f0f0f0")
    heading.pack(pady=10)

    # --- form state -----------------------------------------------------
    age_var = tk.IntVar(value=35)
    salary_var = tk.DoubleVar(value=60000)
    debt_var = tk.DoubleVar(value=5000)
    net_worth_var = tk.DoubleVar(value=500000)
    # NOTE(review): if the training data stores gender numerically (the
    # loaded CSV appears to use 0/1), these strings would be unseen
    # categories for the encoder and have no effect - confirm.
    gender_var = tk.StringVar(value="Male")
    country_var = tk.StringVar(value="United States")
    result_var = tk.StringVar(value="")

    # --- input widgets --------------------------------------------------
    input_frame = ttk.Frame(root, padding="10")
    input_frame.pack(fill="both", expand=True, padx=20, pady=10)

    def add_field(row, caption, widget_class, **widget_options):
        # One form row: caption in column 0, input widget in column 1
        ttk.Label(input_frame, text=caption).grid(column=0, row=row, sticky=tk.W, pady=5)
        field = widget_class(input_frame, **widget_options)
        field.grid(column=1, row=row, sticky=(tk.W, tk.E), pady=5)
        return field

    add_field(0, "Age:", ttk.Entry, textvariable=age_var)
    add_field(1, "Annual Salary ($):", ttk.Entry, textvariable=salary_var)
    add_field(2, "Credit Card Debt ($):", ttk.Entry, textvariable=debt_var)
    add_field(3, "Net Worth ($):", ttk.Entry, textvariable=net_worth_var)
    add_field(4, "Gender:", ttk.Combobox, textvariable=gender_var,
              values=["Male", "Female"])
    add_field(5, "Country:", ttk.Combobox, textvariable=country_var,
              values=["United States", "Canada", "UK", "Germany", "France"])

    # --- output area ----------------------------------------------------
    output_frame = ttk.Frame(root, padding="10")
    output_frame.pack(fill="both", expand=True, padx=20, pady=10)

    result_label = ttk.Label(output_frame, textvariable=result_var, font=("Arial", 12))
    result_label.pack(pady=10)

    def predict():
        # Any failure (bad input, unreadable model, prediction error) is
        # reported in the result label instead of raising.
        try:
            if not os.path.exists(model_filename):
                result_var.set("Error: Model file not found")
                return

            # The pickle is the file this script saved earlier
            with open(model_filename, 'rb') as file:
                model = pickle.load(file)

            # Read the form
            age = age_var.get()
            salary = salary_var.get()
            debt = debt_var.get()
            net_worth = net_worth_var.get()
            gender = gender_var.get()
            country = country_var.get()

            # Same derived ratios as used during training
            salary_plus_one = salary + 1
            debt_to_income = debt / salary_plus_one
            net_worth_to_salary = net_worth / salary_plus_one

            # Single-row frame with the column names the pipeline expects
            input_data = pd.DataFrame({
                'age': [age],
                'annual Salary': [salary],
                'credit card debt': [debt],
                'net worth': [net_worth],
                'gender': [gender],
                'country': [country],
                'debt_to_income_ratio': [debt_to_income],
                'net_worth_to_salary_ratio': [net_worth_to_salary]
            })

            prediction = model.predict(input_data)[0]
            result_var.set(f"Predicted Car Purchase Amount: ${prediction:,.2f}")
        except Exception as e:
            result_var.set(f"Error in prediction: {str(e)}")

    def reset():
        # Put every field back to its start-up value and clear the result
        age_var.set(35)
        salary_var.set(60000)
        debt_var.set(5000)
        net_worth_var.set(500000)
        gender_var.set("Male")
        country_var.set("United States")
        result_var.set("")

    # --- buttons --------------------------------------------------------
    predict_button = ttk.Button(output_frame, text="Predict", command=predict)
    predict_button.pack(pady=5)

    reset_button = ttk.Button(output_frame, text="Reset", command=reset)
    reset_button.pack(pady=5)

    # Hand control to Tk until the window is closed
    root.mainloop()

# Source of a standalone GUI script, written to disk below so the predictor
# can be run without re-executing the training code.  The generated script
# looks for the model next to itself first (matching the instructions printed
# further down) and falls back to the current working directory.
gui_code = """
import tkinter as tk
from tkinter import ttk
import pandas as pd
import pickle
import os

MODEL_NAME = 'car_purchase_model.pkl'
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

def find_model_file():
    # Prefer the model stored beside this script, then the working directory
    for candidate in (os.path.join(SCRIPT_DIR, MODEL_NAME), MODEL_NAME):
        if os.path.exists(candidate):
            return candidate
    return None

def main():
    # Create the main window
    root = tk.Tk()
    root.title("Car Purchase Amount Predictor")
    root.geometry("600x500")
    root.configure(bg="#f0f0f0")

    # Add a title label
    title_label = tk.Label(root, text="Car Purchase Amount Prediction", font=("Arial", 16, "bold"), bg="#f0f0f0")
    title_label.pack(pady=10)

    # Create a frame for input fields
    input_frame = ttk.Frame(root, padding="10")
    input_frame.pack(fill="both", expand=True, padx=20, pady=10)

    # Create input fields with labels
    ttk.Label(input_frame, text="Age:").grid(column=0, row=0, sticky=tk.W, pady=5)
    age_var = tk.IntVar(value=35)
    age_entry = ttk.Entry(input_frame, textvariable=age_var)
    age_entry.grid(column=1, row=0, sticky=(tk.W, tk.E), pady=5)

    ttk.Label(input_frame, text="Annual Salary ($):").grid(column=0, row=1, sticky=tk.W, pady=5)
    salary_var = tk.DoubleVar(value=60000)
    salary_entry = ttk.Entry(input_frame, textvariable=salary_var)
    salary_entry.grid(column=1, row=1, sticky=(tk.W, tk.E), pady=5)

    ttk.Label(input_frame, text="Credit Card Debt ($):").grid(column=0, row=2, sticky=tk.W, pady=5)
    debt_var = tk.DoubleVar(value=5000)
    debt_entry = ttk.Entry(input_frame, textvariable=debt_var)
    debt_entry.grid(column=1, row=2, sticky=(tk.W, tk.E), pady=5)

    ttk.Label(input_frame, text="Net Worth ($):").grid(column=0, row=3, sticky=tk.W, pady=5)
    net_worth_var = tk.DoubleVar(value=500000)
    net_worth_entry = ttk.Entry(input_frame, textvariable=net_worth_var)
    net_worth_entry.grid(column=1, row=3, sticky=(tk.W, tk.E), pady=5)

    ttk.Label(input_frame, text="Gender:").grid(column=0, row=4, sticky=tk.W, pady=5)
    gender_var = tk.StringVar(value="Male")
    gender_combobox = ttk.Combobox(input_frame, textvariable=gender_var, values=["Male", "Female"])
    gender_combobox.grid(column=1, row=4, sticky=(tk.W, tk.E), pady=5)

    ttk.Label(input_frame, text="Country:").grid(column=0, row=5, sticky=tk.W, pady=5)
    country_var = tk.StringVar(value="United States")
    country_combobox = ttk.Combobox(input_frame, textvariable=country_var,
                                   values=["United States", "Canada", "UK", "Germany", "France"])
    country_combobox.grid(column=1, row=5, sticky=(tk.W, tk.E), pady=5)

    # Create a frame for the output
    output_frame = ttk.Frame(root, padding="10")
    output_frame.pack(fill="both", expand=True, padx=20, pady=10)

    # Create output label
    result_var = tk.StringVar(value="")
    result_label = ttk.Label(output_frame, textvariable=result_var, font=("Arial", 12))
    result_label.pack(pady=10)

    # Create a prediction function
    def predict():
        try:
            # Load the model
            model_path = find_model_file()
            if model_path is not None:
                with open(model_path, 'rb') as file:
                    model = pickle.load(file)

                # Get input values
                age = age_var.get()
                salary = salary_var.get()
                debt = debt_var.get()
                net_worth = net_worth_var.get()
                gender = gender_var.get()
                country = country_var.get()

                # Calculate derived features
                debt_to_income = debt / (salary + 1)
                net_worth_to_salary = net_worth / (salary + 1)

                # Create input dataframe
                input_data = pd.DataFrame({
                    'age': [age],
                    'annual Salary': [salary],
                    'credit card debt': [debt],
                    'net worth': [net_worth],
                    'gender': [gender],
                    'country': [country],
                    'debt_to_income_ratio': [debt_to_income],
                    'net_worth_to_salary_ratio': [net_worth_to_salary]
                })

                # Make prediction
                prediction = model.predict(input_data)[0]

                # Update result
                result_var.set(f"Predicted Car Purchase Amount: ${prediction:,.2f}")
            else:
                result_var.set("Error: Model file not found")
        except Exception as e:
            result_var.set(f"Error in prediction: {str(e)}")

    # Create predict button
    predict_button = ttk.Button(output_frame, text="Predict", command=predict)
    predict_button.pack(pady=5)

    # Create reset button
    def reset():
        age_var.set(35)
        salary_var.set(60000)
        debt_var.set(5000)
        net_worth_var.set(500000)
        gender_var.set("Male")
        country_var.set("United States")
        result_var.set("")

    reset_button = ttk.Button(output_frame, text="Reset", command=reset)
    reset_button.pack(pady=5)

    # Run the application
    root.mainloop()

if __name__ == "__main__":
    main()
"""

# Save the GUI application as a separate file.  The encoding is given
# explicitly so the written file does not depend on the platform default.
with open('car_purchase_predictor_gui.py', 'w', encoding='utf-8') as file:
    file.write(gui_code)
print("GUI application saved as car_purchase_predictor_gui.py")

# Notify the user how to run the GUI
print("\nTo run the GUI application:")
print("1. Make sure the model file 'car_purchase_model.pkl' exists in the same directory")
print("2. Run 'python car_purchase_predictor_gui.py'")

# Launch the GUI directly when executed as the main program (which includes
# a notebook cell).  Starting it stays best-effort - a failure must not
# abort the analysis - but the reason is reported instead of being silently
# discarded, and KeyboardInterrupt / SystemExit are no longer swallowed.
if __name__ == "__main__":
    try:
        create_gui_application()
    except Exception as gui_error:
        print(f"GUI could not be launched: {gui_error}")

print("\n===== ANALYSIS COMPLETE =====")

Trying utf-8 encoding...
Failed with encoding utf-8: 'utf-8' codec can't decode byte 0xc5 in position 0: invalid continuation byte
Trying latin-1 encoding...
Success! File loaded with latin-1 encoding

===== DATASET OVERVIEW =====

First 5 rows:
     customer name                                    customer e-mail  \
0    Martina Avila  cubilia.Curae.Phasellus@quisaccumsanconvallis.edu   
1    Harlan Barnes                                eu.dolor@diam.co.uk   
2  Naomi Rodriquez  vulputate.mauris.sagittis@ametconsectetueradip...   
3  Jade Cunningham                            malesuada@dignissim.com   
4     Cedric Leach     felis.ullamcorper.viverra@egetmollislectus.net   

        country  gender        age  annual Salary  credit card debt  \
0      Bulgaria       0  41.851720    62812.09301      11609.380910   
1        Belize       0  40.870623    66646.89292       9572.957136   
2       Algeria       1  43.152897    53798.55112      11160.355060   
3  Cook Islands       1  58.271