In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the data
file_path = 'C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\Amazon Sale Report with Clusters.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
print(data.head())

# Select relevant features
features = data[['Amount', 'Qty', 'ship-country', 'ship-state', 'ship-city', 'Category', 'Cluster']]

# Create dummy features for categorical variables
features_encoded = pd.get_dummies(features, drop_first=True)

# Display the first few rows of the encoded data
print(features_encoded.head())


   index             Order ID      Date                        Status  \
0      0  405-8078784-5731545  04-30-22                     Cancelled   
1      1  171-9198151-1101146  04-30-22  Shipped - Delivered to Buyer   
2      2  404-0687676-7273146  04-30-22                       Shipped   
3      3  403-9615377-8133951  04-30-22                     Cancelled   
4      4  407-1069790-7240320  04-30-22                       Shipped   

  Fulfilment Sales Channel  ship-service-level    Style              SKU  \
0   Merchant      Amazon.in           Standard   SET389   SET389-KR-NP-S   
1   Merchant      Amazon.in           Standard  JNE3781  JNE3781-KR-XXXL   
2     Amazon      Amazon.in          Expedited  JNE3371    JNE3371-KR-XL   
3   Merchant      Amazon.in           Standard    J0341       J0341-DR-L   
4     Amazon      Amazon.in          Expedited  JNE3671  JNE3671-TU-XXXL   

        Category  ...  Amount    ship-city   ship-state  ship-postal-code  \
0            Set  ...  647.

In [7]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Load the data
file_path = 'C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\Amazon Sale Report with Clusters.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
print(data.head())

# Select relevant features
features = data[['Amount', 'Qty', 'ship-country', 'ship-state', 'ship-city', 'Category', 'Cluster']]

# Create dummy features for categorical variables
features_encoded = pd.get_dummies(features, drop_first=True)

# Display the first few rows of the encoded data
print(features_encoded.head())

# Standardize the numeric features
scaler = StandardScaler()
numeric_cols = ['Amount', 'Qty']
features_encoded[numeric_cols] = scaler.fit_transform(features_encoded[numeric_cols])

# Display the first few rows of the scaled data
print(features_encoded.head())

# Split the data into training and testing datasets
X = features_encoded.drop(columns=['Cluster'])
y = features_encoded['Cluster']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the preprocessed training and testing datasets to CSV files
X_train.to_csv('C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\X_train.csv', index=False)
X_test.to_csv('C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\X_test.csv', index=False)
y_train.to_csv('C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\y_train.csv', index=False)
y_test.to_csv('C:\\Users\\user\\Desktop\\GitHub Projects\\Project Files\\Projects\\Capstone 3\\y_test.csv', index=False)


   index             Order ID      Date                        Status  \
0      0  405-8078784-5731545  04-30-22                     Cancelled   
1      1  171-9198151-1101146  04-30-22  Shipped - Delivered to Buyer   
2      2  404-0687676-7273146  04-30-22                       Shipped   
3      3  403-9615377-8133951  04-30-22                     Cancelled   
4      4  407-1069790-7240320  04-30-22                       Shipped   

  Fulfilment Sales Channel  ship-service-level    Style              SKU  \
0   Merchant      Amazon.in           Standard   SET389   SET389-KR-NP-S   
1   Merchant      Amazon.in           Standard  JNE3781  JNE3781-KR-XXXL   
2     Amazon      Amazon.in          Expedited  JNE3371    JNE3371-KR-XL   
3   Merchant      Amazon.in           Standard    J0341       J0341-DR-L   
4     Amazon      Amazon.in          Expedited  JNE3671  JNE3671-TU-XXXL   

        Category  ...  Amount    ship-city   ship-state  ship-postal-code  \
0            Set  ...  647.

In [9]:
import dask.dataframe as dd
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the data with Dask in smaller batches
file_path = 'C:/Users/user/Desktop/GitHub Projects/Project Files/Projects/Capstone 3/Amazon Sale Report with Clusters.csv'
try:
    data = dd.read_csv(file_path, blocksize="100MB")  # Adjust blocksize as needed
except Exception as e:
    print(f"Error loading data: {e}")
    raise

# Select relevant features
selected_features = ['Amount', 'Qty', 'Category', 'Cluster']
features = data[selected_features]

# Categorize the relevant columns before creating dummy variables
features['Category'] = features['Category'].astype('category')
features['Cluster'] = features['Cluster'].astype('category')

# Ensure known categories
features = features.categorize(columns=['Category', 'Cluster'])

# Create dummy features for categorical variables
features_encoded = dd.get_dummies(features, drop_first=True)

# Verify column names before and after transformation
print("Columns before transformation:", features.columns)
print("Columns after transformation:", features_encoded.columns)

# Handle missing values and standardize the features
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()

def preprocess_partition(df):
    df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)
    df_scaled = pd.DataFrame(scaler.fit_transform(df_imputed), columns=df.columns)
    return df_scaled

features_scaled = features_encoded.map_partitions(preprocess_partition)

# Convert to pandas dataframe for splitting and training
features_scaled = features_scaled.compute()

# Verify column names after preprocessing
print("Columns after preprocessing:", features_scaled.columns)

# Combine cluster dummy columns back into a single column
features_scaled['Cluster'] = features_scaled[['Cluster_1', 'Cluster_2', 'Cluster_3']].idxmax(axis=1)

# Drop the dummy cluster columns from the feature set
X = features_scaled.drop(columns=['Cluster_1', 'Cluster_2', 'Cluster_3', 'Cluster'])
y = features_scaled['Cluster']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the models
log_reg = LogisticRegression(random_state=42, max_iter=100)
random_forest = RandomForestClassifier(random_state=42)
svc = SVC(random_state=42)

models = {
    "Logistic Regression": log_reg,
    "Random Forest": random_forest,
    "SVC": svc
}

# Train and evaluate the models
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    
    print(f"{name} Test Accuracy: {accuracy:.4f}")
    print(f"{name} Test Precision: {precision:.4f}")
    print(f"{name} Test Recall: {recall:.4f}")
    print(f"{name} Test F1 Score: {f1:.4f}")
    print("-" * 30)


Columns before transformation: Index(['Amount', 'Qty', 'Category', 'Cluster'], dtype='object')
Columns after transformation: Index(['Amount', 'Qty', 'Category_Bottom', 'Category_Dupatta',
       'Category_Ethnic Dress', 'Category_Saree', 'Category_Set',
       'Category_Top', 'Category_Western Dress', 'Category_kurta', 'Cluster_1',
       'Cluster_2', 'Cluster_3'],
      dtype='object')
Columns after preprocessing: Index(['Amount', 'Qty', 'Category_Bottom', 'Category_Dupatta',
       'Category_Ethnic Dress', 'Category_Saree', 'Category_Set',
       'Category_Top', 'Category_Western Dress', 'Category_kurta', 'Cluster_1',
       'Cluster_2', 'Cluster_3'],
      dtype='object')


  _warn_prf(average, modifier, msg_start, len(result))


Logistic Regression Test Accuracy: 0.9852
Logistic Regression Test Precision: 0.9705
Logistic Regression Test Recall: 0.9852
Logistic Regression Test F1 Score: 0.9778
------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


Random Forest Test Accuracy: 0.9851
Random Forest Test Precision: 0.9705
Random Forest Test Recall: 0.9851
Random Forest Test F1 Score: 0.9777
------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


SVC Test Accuracy: 0.9852
SVC Test Precision: 0.9705
SVC Test Recall: 0.9852
SVC Test F1 Score: 0.9778
------------------------------


## Feature Importance Analysis

In [11]:
importances = random_forest.feature_importances_
feature_names = X.columns
feature_importances = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importances = feature_importances.sort_values(by='importance', ascending=False)
print(feature_importances)


                  feature  importance
0                  Amount    0.968393
1                     Qty    0.011805
6            Category_Set    0.004894
7            Category_Top    0.004054
9          Category_kurta    0.003070
4   Category_Ethnic Dress    0.003063
8  Category_Western Dress    0.002975
2         Category_Bottom    0.001233
5          Category_Saree    0.000491
3        Category_Dupatta    0.000022


## Confusion Matrix and Classification Report

In [13]:
from sklearn.metrics import confusion_matrix, classification_report

for name, model in models.items():
    y_pred = model.predict(X_test)
    print(f"Confusion Matrix for {name}:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Classification Report for {name}:")
    print(classification_report(y_test, y_pred))
    print("-" * 30)


Confusion Matrix for Logistic Regression:
[[    0     0   356]
 [    0     0    27]
 [    0     0 25412]]
Classification Report for Logistic Regression:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   Cluster_1       0.00      0.00      0.00       356
   Cluster_2       0.00      0.00      0.00        27
   Cluster_3       0.99      1.00      0.99     25412

    accuracy                           0.99     25795
   macro avg       0.33      0.33      0.33     25795
weighted avg       0.97      0.99      0.98     25795

------------------------------
Confusion Matrix for Random Forest:
[[    0     0   356]
 [    0     0    27]
 [    2     0 25410]]
Classification Report for Random Forest:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   Cluster_1       0.00      0.00      0.00       356
   Cluster_2       0.00      0.00      0.00        27
   Cluster_3       0.99      1.00      0.99     25412

    accuracy                           0.99     25795
   macro avg       0.33      0.33      0.33     25795
weighted avg       0.97      0.99      0.98     25795

------------------------------
Confusion Matrix for SVC:
[[    0     0   356]
 [    0     0    27]
 [    0     0 25412]]
Classification Report for SVC:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

   Cluster_1       0.00      0.00      0.00       356
   Cluster_2       0.00      0.00      0.00        27
   Cluster_3       0.99      1.00      0.99     25412

    accuracy                           0.99     25795
   macro avg       0.33      0.33      0.33     25795
weighted avg       0.97      0.99      0.98     25795

------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


## Hyperparameter Tuning

In [15]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'Logistic Regression': {'C': [0.1, 1, 10]},
    'Random Forest': {'n_estimators': [100, 200], 'max_depth': [10, 20]},
    'SVC': {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
}

for name, model in models.items():
    grid_search = GridSearchCV(model, param_grid[name], cv=3, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best score for {name}: {grid_search.best_score_}")
    print("-" * 30)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters for Logistic Regression: {'C': 0.1}
Best score for Logistic Regression: 0.9859081218203
------------------------------
Best parameters for Random Forest: {'max_depth': 10, 'n_estimators': 200}
Best score for Random Forest: 0.9859081221020903
------------------------------
Best parameters for SVC: {'C': 0.1, 'kernel': 'linear'}
Best score for SVC: 0.9859081218203
------------------------------


## Cross Validation

In [17]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    print(f"Cross-Validation Scores for {name}: {scores}")
    print(f"Mean CV Score for {name}: {scores.mean()}")
    print("-" * 30)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-Validation Scores for Logistic Regression: [0.98577244 0.98577244 0.98577244 0.98573367 0.98573367]
Mean CV Score for Logistic Regression: 0.9857569296375267
------------------------------
Cross-Validation Scores for Random Forest: [0.9858112  0.94150029 0.9854623  0.9855786  0.98565613]
Mean CV Score for Random Forest: 0.9768017057569296
------------------------------
Cross-Validation Scores for SVC: [0.98577244 0.98577244 0.98577244 0.98573367 0.98573367]
Mean CV Score for SVC: 0.9857569296375267
------------------------------


In [18]:
import joblib

joblib.dump(log_reg, 'logistic_regression_model.pkl')
joblib.dump(random_forest, 'random_forest_model.pkl')
joblib.dump(svc, 'svc_model.pkl')


['svc_model.pkl']

## Key Insights from the Model Results
### Feature Importance Analysis:

The most important feature is Amount, contributing significantly to the model's predictions.
Other features like Qty, Category_Set, and Category_Top have minor importance, indicating they also play a role but are less critical compared to Amount.

### Model Performance:

All three models (Logistic Regression, Random Forest, and SVC) show high accuracy, precision, recall, and F1 scores.
However, there is an imbalance in the prediction of clusters. Cluster 3 is predicted with high accuracy, while Clusters 1 and 2 are not predicted correctly at all.

### Hyperparameter Tuning:

The best hyperparameters for Logistic Regression include C=0.1.
For Random Forest, the best parameters are max_depth=10 and n_estimators=200.
Cross-Validation:

Cross-validation confirms the stability of the Logistic Regression model with a mean CV score of approximately 0.9858.
The Random Forest model also shows stability with a mean CV score of approximately 0.9768.

## Analysis of the Results
The analysis of the sales data using clustering and machine learning models has provided valuable insights into the key factors driving different sales patterns. The primary feature influencing cluster predictions is the sales Amount, with other features like Qty and various product categories playing lesser roles. The high accuracy, precision, recall, and F1 scores of the models—Logistic Regression, Random Forest, and SVC—indicate that the models perform well overall, with an accuracy of approximately 98.5%. However, the confusion matrices reveal that Clusters 1 and 2 are not predicted correctly, suggesting a significant imbalance in the dataset. This imbalance is reflected in the classification reports, where precision and recall for Clusters 1 and 2 are set to zero. The hyperparameter tuning results highlight the optimal settings for each model, further improving their performance. The cross-validation scores confirm the stability of the Logistic Regression and Random Forest models, with mean CV scores of approximately 0.9858 and 0.9768, respectively.

### Extended Modeling Plan
1. Data Collection:

Continuously collect and integrate additional sales data to expand the dataset. This can include new product categories, additional geographical regions, and more granular time-based data to enhance model performance.

2. Feature Engineering:

Explore additional features that could provide more insights into sales patterns. This includes customer demographics, seasonal trends, and external economic indicators. Utilize advanced feature engineering techniques to create meaningful features that can improve model accuracy.

3. Addressing Class Imbalance:

Implement techniques to handle class imbalance, such as Synthetic Minority Over-sampling Technique (SMOTE) or Adaptive Synthetic (ADASYN) sampling, to generate synthetic samples for underrepresented clusters. This will help the models learn to predict these clusters more accurately.

4. Advanced Modeling Techniques:

Experiment with more complex models, such as Gradient Boosting Machines (GBM), XGBoost, or neural networks, to capture intricate patterns in the data. Evaluate these models against the current ones to determine if they offer significant improvements in performance.

5. Model Interpretability:

Utilize model interpretability tools like SHAP (SHapley Additive exPlanations) to understand the contribution of each feature to the model’s predictions. This will provide deeper insights into the driving factors behind each cluster and help refine the model further.

6. Hyperparameter Tuning and Optimization:

Continuously refine hyperparameter tuning using techniques like Bayesian Optimization or Random Search to find the best model configurations. This ensures that the models are performing at their optimal settings.

7. Continuous Monitoring and Evaluation:

Implement a system for continuous monitoring of model performance. This includes setting up pipelines for real-time data updates and periodic re-evaluation of models to ensure they adapt to changing sales patterns. Regularly review model metrics and update models as needed.

8. Visualization and Reporting:

Enhance the visualization and reporting of results. Develop interactive dashboards in Tableau that allow stakeholders to explore data by clusters, visualize feature importance, and understand model predictions. Regularly update these dashboards to reflect the latest insights and trends.
By following this extended modeling plan, we can ensure that the sales prediction models are robust, accurate, and adaptable to new data, providing actionable insights for strategic decision-making. This comprehensive approach will help in understanding the key factors driving sales, enabling targeted marketing strategies, optimized inventory management, and ultimately improving overall sales performance.