#FNN Training

In [16]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Dense, Dropout
import pickle
from google.colab import drive

class FNNModel:
    def __init__(self):
        self.model = None
        self.scaler = StandardScaler()
        self.label_encoders = {}

    def preprocess_data(self, df, is_training=True):
        # Create a copy of the dataframe to avoid modifying the original
        df_processed = df.copy()

        # Encode categorical columns
        categorical_cols = ['Mode_of_Shipment', 'Product_Importance']
        for col in categorical_cols:
            if is_training:
                le = LabelEncoder()
                df_processed[col] = le.fit_transform(df_processed[col])
                self.label_encoders[col] = le
            else:
                df_processed[col] = self.label_encoders[col].transform(df_processed[col])

        # Separate features and target
        X = df_processed.drop('Reached_on_Time', axis=1)
        y = df_processed['Reached_on_Time']

        # Scale numerical features
        if is_training:
            X = self.scaler.fit_transform(X)
        else:
            X = self.scaler.transform(X)

        return X, y

    def build_model(self, input_dim):
        self.model = Sequential([
            Dense(64, activation='relu', input_dim=input_dim),
            Dropout(0.3),
            Dense(32, activation='relu'),
            Dropout(0.2),
            Dense(1, activation='sigmoid')
        ])
        self.model.compile(optimizer='adam',
                          loss='binary_crossentropy',
                          metrics=['accuracy'])

    def train(self, train_path, val_path, save_path, epochs=20, batch_size=32):
        # Load and preprocess data
        train_df = pd.read_csv(train_path)
        val_df = pd.read_csv(val_path)

        X_train, y_train = self.preprocess_data(train_df, is_training=True)
        X_val, y_val = self.preprocess_data(val_df, is_training=False)

        # Build model
        self.build_model(input_dim=X_train.shape[1])

        # Train model
        history = self.model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size
        )

        # Save the model and preprocessing objects
        self.save_model(save_path)

        return history

    def save_model(self, save_path):
        """Save model and all preprocessing objects"""
        # Save Keras model
        self.model.save(save_path)

        # Save preprocessing objects
        preprocessors = {
            'scaler': self.scaler,
            'label_encoders': self.label_encoders
        }
        with open(save_path + '_preprocessors.pkl', 'wb') as f:
            pickle.dump(preprocessors, f)

    def load_model(self, model_path):
        """Load model and all preprocessing objects"""
        # Load Keras model
        self.model = load_model(model_path)

        # Load preprocessing objects
        with open(model_path + '_preprocessors.pkl', 'rb') as f:
            preprocessors = pickle.load(f)
            self.scaler = preprocessors['scaler']
            self.label_encoders = preprocessors['label_encoders']

    def predict(self, data_path, output_path=None):
        # Load and preprocess data
        df = pd.read_csv(data_path)
        X, _ = self.preprocess_data(df, is_training=False)

        # Make predictions
        predictions = self.model.predict(X)
        df['Predictions'] = (predictions > 0.5).astype(int)

        # Save predictions
        if output_path:
            df.to_csv(output_path, index=False)
            print(f"Predictions saved to {output_path}")

        return df

if __name__ == "__main__":
    # Mount Google Drive
    drive.mount('/content/gdrive')
    absolute_path = '/content/gdrive/My Drive/Projects/SupplyChainPredictiveAnalytics/'

    fnn = FNNModel()

    # For training:
    # fnn.train(
    #     train_path=absolute_path+'train_set.csv',
    #     val_path=absolute_path+'validation_set.csv',
    #     save_path=absolute_path+'fnn_model.h5',
    #     epochs=50
    # )

    # For prediction:
    # fnn.load_model(absolute_path+'fnn_model.h5')
    # predictions = fnn.predict(
    #     data_path=absolute_path+'test_set.csv',
    #     output_path=absolute_path+'predictions.csv'
    # )
    # print(predictions)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).




[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Predictions saved to /content/gdrive/My Drive/Projects/SupplyChainPredictiveAnalytics/predictions.csv
     Mode_of_Shipment  Customer_Care_Calls  Cost_of_the_Product  \
0                Ship                    3                  166   
1                Ship                    3                  242   
2                Ship                    5                  167   
3                Ship                    4                  219   
4                Ship                    5                  245   
...               ...                  ...                  ...   
1755           Flight                    3                  153   
1756             Ship                    2                  149   
1757             Ship                    3                  214   
1758           Flight                    4                  154   
1759             Ship                    3                  255   

     Product_Importa

#Feature Engineering

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.metrics import mutual_info_score
from google.colab import drive

class FeatureAnalyzer:
    def __init__(self, df, target_col='Reached_on_Time'):
        self.df = df.copy()
        self.target_col = target_col
        self.numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
        self.categorical_cols = df.select_dtypes(include=['object']).columns

    def generate_basic_stats(self):
        """Generate basic statistics for all columns"""
        stats_dict = {
            'total_records': len(self.df),
            'on_time_rate': (self.df[self.target_col].mean() * 100).round(2),
            'numeric_stats': self.df[self.numeric_cols].describe(),
            'categorical_counts': {col: self.df[col].value_counts(normalize=True) * 100
                                 for col in self.categorical_cols}
        }
        return stats_dict

    def analyze_numeric_features(self):
        """Analyze relationship between numeric features and target"""
        numeric_insights = {}

        for col in self.numeric_cols:
            if col != self.target_col:
                # Calculate statistics for on-time vs delayed shipments
                on_time_stats = self.df[self.df[self.target_col] == 1][col].describe()
                delayed_stats = self.df[self.df[self.target_col] == 0][col].describe()

                # Perform t-test
                t_stat, p_value = stats.ttest_ind(
                    self.df[self.df[self.target_col] == 1][col],
                    self.df[self.df[self.target_col] == 0][col]
                )

                # Calculate correlation
                correlation = self.df[col].corr(self.df[self.target_col])

                numeric_insights[col] = {
                    'on_time_stats': on_time_stats,
                    'delayed_stats': delayed_stats,
                    't_statistic': t_stat,
                    'p_value': p_value,
                    'correlation': correlation
                }

                # Create visualization
                plt.figure(figsize=(10, 6))
                sns.boxplot(x=self.target_col, y=col, data=self.df)
                plt.title(f'Distribution of {col} by Delivery Status')
                plt.savefig(absolute_path+ f'boxplot_{col}.png')
                plt.close()

        return numeric_insights

    def analyze_categorical_features(self):
        """Analyze relationship between categorical features and target"""
        categorical_insights = {}

        for col in self.categorical_cols:
            # Calculate delivery success rate by category
            success_rates = self.df.groupby(col)[self.target_col].mean() * 100

            # Calculate chi-square test
            contingency_table = pd.crosstab(self.df[col], self.df[self.target_col])
            chi2, p_value = stats.chi2_contingency(contingency_table)[:2]

            # Calculate mutual information score
            mi_score = mutual_info_score(self.df[col], self.df[self.target_col])

            categorical_insights[col] = {
                'success_rates': success_rates,
                'chi2_statistic': chi2,
                'p_value': p_value,
                'mutual_info_score': mi_score
            }

            # Create visualization
            plt.figure(figsize=(12, 6))
            success_rates.plot(kind='bar')
            plt.title(f'Delivery Success Rate by {col}')
            plt.ylabel('Success Rate (%)')
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(absolute_path+ f'success_rate_{col}.png')
            plt.close()

        return categorical_insights

    def generate_correlation_matrix(self):
        """Generate correlation matrix for numeric features"""
        plt.figure(figsize=(12, 8))
        correlation_matrix = self.df[self.numeric_cols].corr()
        sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
        plt.title('Feature Correlation Matrix')
        plt.tight_layout()
        plt.savefig(absolute_path+'correlation_matrix.png')
        plt.close()
        return correlation_matrix

    def generate_summary_report(self):
        """Generate a comprehensive summary report"""
        basic_stats = self.generate_basic_stats()
        numeric_insights = self.analyze_numeric_features()
        categorical_insights = self.analyze_categorical_features()
        correlation_matrix = self.generate_correlation_matrix()

        report = {
            'basic_stats': basic_stats,
            'numeric_insights': numeric_insights,
            'categorical_insights': categorical_insights,
            'correlation_matrix': correlation_matrix
        }

        return report

if __name__ == "__main__":
    drive.mount('/content/gdrive')
    absolute_path = '/content/gdrive/My Drive/Projects/SupplyChainPredictiveAnalytics/'

    # Load your dataset
    df = pd.read_csv(absolute_path+'test_set.csv')

    # Create analyzer instance
    analyzer = FeatureAnalyzer(df)

    # Generate comprehensive report
    report = analyzer.generate_summary_report()

    # Print key insights
    print(f"Dataset size: {report['basic_stats']['total_records']} records")
    print(f"Overall on-time delivery rate: {report['basic_stats']['on_time_rate']}%")

    # Print insights for each numeric feature
    for feature, insights in report['numeric_insights'].items():
        print(f"\nFeature: {feature}")
        print(f"Correlation with on-time delivery: {insights['correlation']:.3f}")
        print(f"Statistical significance (p-value): {insights['p_value']:.3f}")

    # Print insights for each categorical feature
    for feature, insights in report['categorical_insights'].items():
        print(f"\nFeature: {feature}")
        print("Success rates by category:")
        print(insights['success_rates'])
        print(f"Chi-square p-value: {insights['p_value']:.3f}")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
Dataset size: 1760 records
Overall on-time delivery rate: 50.0%

Feature: Customer_Care_Calls
Correlation with on-time delivery: -0.091
Statistical significance (p-value): 0.000

Feature: Cost_of_the_Product
Correlation with on-time delivery: -0.073
Statistical significance (p-value): 0.002

Feature: Weight_in_Grams
Correlation with on-time delivery: -0.252
Statistical significance (p-value): 0.000

Feature: Mode_of_Shipment
Success rates by category:
Mode_of_Shipment
Flight    52.671756
Road      50.175439
Ship      49.381698
Name: Reached_on_Time, dtype: float64
Chi-square p-value: 0.626

Feature: Product_Importance
Success rates by category:
Product_Importance
high      52.666667
low       50.602410
medium    48.846154
Name: Reached_on_Time, dtype: float64
Chi-square p-value: 0.618
