<a href="https://colab.research.google.com/github/AnalyzrXs/Sales-Predictions/blob/main/sales.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every Python warning for the rest of the session; note that this
# also hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

 Loading the Dataset

In [None]:
# Location and text encoding of the raw customer CSV.
DATA_PATH = "/content/car.csv"
DATA_ENCODING = 'latin-1'

data = pd.read_csv(DATA_PATH, encoding=DATA_ENCODING)

 Basic Data Inspection

In [None]:
# Preview the first rows of the raw table.
data.head()

Unnamed: 0,customer name,customer e-mail,country,gender,age,annual Salary,credit card debt,net worth,car purchase amount
0,Martina Avila,cubilia.Curae.Phasellus@quisaccumsanconvallis.edu,Bulgaria,0,41.85172,62812.09301,11609.38091,238961.2505,35321.45877
1,Harlan Barnes,eu.dolor@diam.co.uk,Belize,0,40.870623,66646.89292,9572.957136,530973.9078,45115.52566
2,Naomi Rodriquez,vulputate.mauris.sagittis@ametconsectetueradip...,Algeria,1,43.152897,53798.55112,11160.35506,638467.1773,42925.70921
3,Jade Cunningham,malesuada@dignissim.com,Cook Islands,1,58.271369,79370.03798,14426.16485,548599.0524,67422.36313
4,Cedric Leach,felis.ullamcorper.viverra@egetmollislectus.net,Brazil,1,57.313749,59729.1513,5358.712177,560304.0671,55915.46248


In [None]:
# (rows, columns) of the raw table.
data.shape

(500, 9)

In [None]:
# Column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer name        500 non-null    object 
 1   customer e-mail      500 non-null    object 
 2   country              500 non-null    object 
 3   gender               500 non-null    int64  
 4   age                  500 non-null    float64
 5   annual Salary        500 non-null    float64
 6   credit card debt     500 non-null    float64
 7   net worth            500 non-null    float64
 8   car purchase amount  500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB


In [None]:
# Number of missing values in each column.
data.isnull().sum()

Unnamed: 0,0
customer name,0
customer e-mail,0
country,0
gender,0
age,0
annual Salary,0
credit card debt,0
net worth,0
car purchase amount,0


 Preprocessing Function

In [None]:
from sklearn.preprocessing import StandardScaler

def preprocess_customer_data(data):
    """Turn the raw customer table into standardized features and a target.

    Steps: drop the identifier columns, make sure ``gender`` is numeric,
    one-hot encode ``country`` (first level dropped), split off the
    ``car purchase amount`` target and standardize every feature column.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'customer name', 'customer e-mail', 'country',
        'gender' and 'car purchase amount'. Every other column is passed
        to the scaler and must therefore be numeric. The input frame is
        not modified.

    Returns
    -------
    X_scaled : pandas.DataFrame
        Standardized features with the same column labels and index as
        the unscaled feature frame.
    y : pandas.Series
        The 'car purchase amount' column.
    scaler : sklearn.preprocessing.StandardScaler
        The scaler fitted on the feature columns.

    Notes
    -----
    The scaler is fitted on every row passed in. If the result is split
    into train/test afterwards, the test rows have influenced the scaling
    statistics.
    """
    # drop() returns a new frame, so the caller's data is left untouched.
    df = data.drop(columns=['customer name', 'customer e-mail'])

    # Only translate text labels. When gender is already numeric (0/1),
    # mapping string keys matches nothing and would turn the whole column
    # into NaN, silently destroying the feature.
    if not pd.api.types.is_numeric_dtype(df['gender']):
        df['gender'] = df['gender'].map({'Female': 0, 'Male': 1})

    df = pd.get_dummies(df, columns=['country'], drop_first=True)

    X = df.drop(columns='car purchase amount')
    y = df['car purchase amount']

    scaler = StandardScaler()
    # Reuse X's index so that features and target stay row-aligned even
    # when the input frame does not carry a default RangeIndex.
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X), columns=X.columns, index=X.index
    )

    return X_scaled, y, scaler


Impute Missing Values

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer # Import SimpleImputer for handling NaNs

# Build the scaled feature frame, the target series and the fitted scaler.
X_scaled, y, scaler = preprocess_customer_data(data)

# Impute missing values using SimpleImputer
# NOTE: by default fit_transform returns a NumPy array, so X_scaled loses its
# column labels from here on, and SimpleImputer discards any column that
# contains no observed values at all.
# NOTE(review): the imputer is fitted on all rows before the train/test
# split, so test rows contribute to the column means.
imputer = SimpleImputer(strategy='mean') # Create an imputer instance
X_scaled = imputer.fit_transform(X_scaled) # Fit and transform to impute NaNs



Split the Data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)


Define and Evaluate Models

 Performance Metrics Calculated:
For each model, the script computes:

MAE (Mean Absolute Error) – average of absolute differences between predicted and actual values.

RMSE (Root Mean Squared Error) – square root of the average of squared errors, penalizing large errors more.

R² Score (Coefficient of Determination) – the proportion of the variance in the actual values that the predictions explain (closer to 1 is better).



In [None]:
# Candidate regressors, keyed by the label shown in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42),
    "LightGBM": LGBMRegressor(n_estimators=100, random_state=42)
}


def _score_model(label, estimator):
    """Fit `estimator` on the training split and score it on the test split.

    Returns one row for the results table: the model label plus MAE, RMSE
    and R² computed from the test-set predictions.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    return {
        "Model": label,
        "MAE": mean_absolute_error(y_test, predictions),
        "RMSE": np.sqrt(mean_squared_error(y_test, predictions)),
        "R² Score": r2_score(y_test, predictions),
    }


# Evaluate every candidate in dictionary order.
results = [_score_model(label, estimator) for label, estimator in models.items()]

# Best (lowest RMSE) model first.
results_df = pd.DataFrame(results).sort_values(by='RMSE')
print(results_df)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000310 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 536
[LightGBM] [Info] Number of data points in the train set: 400, number of used features: 4
[LightGBM] [Info] Start training from score 43946.070740
               Model          MAE         RMSE  R² Score
0  Linear Regression     1.271805     1.675880  1.000000
3           LightGBM  1160.065760  1669.549458  0.974184
2            XGBoost  1570.350422  2178.132498  0.956061
1      Random Forest  1654.303877  2364.139861  0.948236


Save the Best Model

In [None]:
import os
import joblib
from sklearn.base import clone

# Pick the winner from the evaluation table instead of hard-coding an
# estimator: the row with the lowest test RMSE identifies the best model.
best_name = results_df.loc[results_df['RMSE'].idxmin(), 'Model']
print(f"Selected model: {best_name}")

# clone() gives an unfitted copy with the same hyper-parameters, so the final
# fit on all rows does not disturb the estimator that was evaluated above.
best_model = clone(models[best_name])
best_model.fit(X_scaled, y)

# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs("models", exist_ok=True)

joblib.dump(best_model, "models/final_model.pkl")
joblib.dump(scaler, "models/scaler.pkl")
# The model was trained on the imputer's output, so the fitted imputer is
# saved too; inference needs it to reproduce the same feature matrix.
joblib.dump(imputer, "models/imputer.pkl")
print("Model and scaler saved.")

Model and scaler saved.
