In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import logging

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import joblib

In [2]:
# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(message)s')


In [3]:
# Step 1: Load the Dataset
def load_dataset(file_path):
    logging.info("Loading dataset...")
    data = pd.read_csv(file_path)
    logging.info(f"Dataset loaded with {data.shape[0]} rows and {data.shape[1]} columns.")
    return data

In [4]:
# Step 2: Data Cleaning
def clean_data(data):
    logging.info("Cleaning data...")
    
    # Remove duplicates
    data.drop_duplicates(inplace=True)
    
    # Check for and handle missing values
    logging.info("Missing values before cleaning:")
    logging.info(data.isnull().sum())
    
    # Remove outliers using IQR method
    def remove_outliers(df, columns):
        for col in columns:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
        return df
    
    numerical_columns = ['TV', 'Radio', 'Newspaper', 'Sales']
    data = remove_outliers(data, numerical_columns)
    
    logging.info(f"Data cleaned. Remaining rows: {data.shape[0]}.")
    return data

In [5]:
# Step 3: Exploratory Data Analysis (EDA)
def perform_eda(data):
    logging.info("Performing EDA...")
    
    # Correlation Heatmap
    plt.figure(figsize=(10, 8))
    sns.heatmap(data.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png')
    plt.close()
    
    # Pairplot
    plt.figure(figsize=(12, 10))
    sns.pairplot(data, diag_kind='kde')
    plt.suptitle('Pairwise Relationships', y=1.02)
    plt.tight_layout()
    plt.savefig('pairplot.png')
    plt.close()
    
    # Scatter plots of features vs Sales
    plt.figure(figsize=(15, 5))
    for i, col in enumerate(['TV', 'Radio', 'Newspaper'], 1):
        plt.subplot(1, 3, i)
        sns.scatterplot(x=col, y='Sales', data=data)
        plt.title(f'{col} vs Sales')
    plt.tight_layout()
    plt.savefig('feature_vs_sales.png')
    plt.close()
    
    # Print correlation with Sales
    logging.info("\nCorrelation with Sales:")
    logging.info(data.corr()['Sales'].sort_values(ascending=False))

In [6]:
# Step 4: Feature Engineering
def feature_engineering(data):
    logging.info("Performing feature engineering...")
    
    X = data.drop('Sales', axis=1)
    y = data['Sales']
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Add polynomial features
    poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
    X_poly = poly.fit_transform(X_scaled)
    
    return X_poly, y, scaler, poly

In [7]:
# Step 5: Model Training and Evaluation
def train_and_evaluate_models(X, y):
    logging.info("Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    logging.info("Training models...")
    models = {
        'Random Forest': RandomForestRegressor(random_state=42),
        'Gradient Boosting': GradientBoostingRegressor(random_state=42),
        'Linear Regression': LinearRegression()
    }
    
    predictions = {}
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        predictions[name] = y_pred
        r2 = r2_score(y_test, y_pred)
        logging.info(f"{name} R2 Score: {r2:.2f}")
    
    # Hyperparameter Tuning for Random Forest
    param_grid_rf = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    
    grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), 
                                  param_grid_rf, cv=3, scoring='r2', n_jobs=-1)
    grid_search_rf.fit(X_train, y_train)
    best_rf = grid_search_rf.best_estimator_
    
    # Evaluate Best Model
    y_pred = best_rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    
    logging.info("Evaluating best model...")
    print("\nBest Model Evaluation:")
    print(f"Mean Squared Error: {mse:.2f}")
    print(f"R-Squared Score: {r2:.2f}")
    print(f"Mean Absolute Error: {mae:.2f}")
    
    # Residual Plot
    residuals = y_test - y_pred
    plt.figure(figsize=(10, 6))
    sns.histplot(residuals, kde=True, bins=30, color='blue')
    plt.title('Residuals Distribution')
    plt.xlabel('Residuals')
    plt.ylabel('Frequency')
    plt.savefig('residuals_distribution.png')
    plt.close()
    
    # Actual vs Predicted Plot
    plt.figure(figsize=(12, 8))
    for name, y_pred in predictions.items():
        plt.scatter(y_test, y_pred, alpha=0.6, label=name)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', color='red', label='Perfect Prediction')
    plt.title('Actual vs Predicted Sales for All Models')
    plt.xlabel('Actual Sales')
    plt.ylabel('Predicted Sales')
    plt.legend()
    plt.savefig('actual_vs_predicted.png')
    plt.close()
    
    return best_rf, grid_search_rf.best_params_

In [8]:
# Main Execution
def main(file_path=r'F:\CodSoft\SalesPrediction\advertising.csv'):
    # Full Pipeline
    data = load_dataset(file_path)
    cleaned_data = clean_data(data)
    perform_eda(cleaned_data)
    
    X, y, scaler, poly = feature_engineering(cleaned_data)
    best_model, best_params = train_and_evaluate_models(X, y)
    
    logging.info("Saving the best model...")
    joblib.dump({
        'model': best_model,
        'scaler': scaler,
        'poly_features': poly
    }, 'sales_prediction_model.pkl')
    
    logging.info("Best Hyperparameters:")
    print(best_params)
    logging.info("Model saved as sales_prediction_model.pkl")

if __name__ == "__main__":
    main()

2024-12-20 21:45:25,106 - Loading dataset...
2024-12-20 21:45:25,128 - Dataset loaded with 200 rows and 4 columns.
2024-12-20 21:45:25,129 - Cleaning data...
2024-12-20 21:45:25,131 - Missing values before cleaning:
2024-12-20 21:45:25,132 - TV           0
Radio        0
Newspaper    0
Sales        0
dtype: int64
2024-12-20 21:45:25,137 - Data cleaned. Remaining rows: 198.
2024-12-20 21:45:25,137 - Performing EDA...
2024-12-20 21:45:27,792 - 
Correlation with Sales:
2024-12-20 21:45:27,792 - Sales        1.000000
TV           0.899974
Radio        0.348566
Newspaper    0.151764
Name: Sales, dtype: float64
2024-12-20 21:45:27,792 - Performing feature engineering...
2024-12-20 21:45:27,801 - Splitting data...
2024-12-20 21:45:27,801 - Training models...
2024-12-20 21:45:27,927 - Random Forest R2 Score: 0.96
2024-12-20 21:45:27,986 - Gradient Boosting R2 Score: 0.95
2024-12-20 21:45:27,994 - Linear Regression R2 Score: 0.93
2024-12-20 21:45:32,812 - Evaluating best model...
  plt.plot([y_


Best Model Evaluation:
Mean Squared Error: 1.14
R-Squared Score: 0.96
Mean Absolute Error: 0.84


2024-12-20 21:45:33,141 - Saving the best model...
2024-12-20 21:45:33,187 - Best Hyperparameters:
2024-12-20 21:45:33,187 - Model saved as sales_prediction_model.pkl


{'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}


<Figure size 1200x1000 with 0 Axes>