# Superstore Dataset Analysis

In [1]:
# Import dependencies

import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
from collections import Counter

# Python SQL toolkit and Object Relational Mapper
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func
import psycopg2
# Database credentials
import config


The config file includes the credentials to get access to the database

# Extract data from database

- Set up connection with server where the database is stored (AWS).
- Create engine for the connection.
- Choose the table that contains the required information for the analysis.

In [2]:
# SQLAlchemy URL for the PostgreSQL database (psycopg2 driver).
# Only the user name and password come from config.py; host, port and
# database name are fixed here.
con_string = (
    'postgresql+psycopg2://'
    f'{config.username}:{config.password}'
    '@superstoredatabase.c7uct1bmfzis.us-east-2.rds.amazonaws.com'
    ':5432/superstoredatabase'
)

In [4]:
# Create engine
# Builds the SQLAlchemy Engine from the connection string defined above.
# NOTE(review): presumably no connection is opened until the first query — confirm.
engine = create_engine(con_string)

AttributeError: 'DataFrame' object has no attribute '_instantiate_plugins'

In [None]:
# Create session
# NOTE(review): `session` is not used by any cell visible in this notebook
# (the table is read through `engine` directly) — confirm whether it is needed.
session = Session(engine)

In [None]:
# Load the whole superstore_complete table into a dataframe and preview
# its first ten rows
superstore_df = pd.read_sql(sql="SELECT * FROM superstore_complete", con=engine)
superstore_df.head(n=10)


# Preprocessing

- Check for null values.
- Look at the data types of the dataframe and transform into the correct data types (ship_date, order_date).
- Look at summary statistics for the whole dataframe.
- Add columns for the analysis and Machine Learning model (week_day, Month_number, profit_classification).

In [None]:
# Check for null values: number of missing entries in each column
# (a column showing 0 has no nulls).
superstore_df.isnull().sum()


In [None]:
# Look at general statistics and size
# shape is a (rows, columns) tuple; describe() returns the summary statistics table
print(f'Data size: {superstore_df.shape}')
superstore_df.describe()


In [None]:
# Look at the data types of the df
# (used to spot columns that need converting, e.g. the date columns below)
superstore_df.dtypes

In [None]:
# Convert the order and ship date columns to datetime so that date parts
# (week day, month) can be extracted from them, then re-check the dtypes.
for date_col in ('order_date', 'ship_date'):
    superstore_df[date_col] = pd.to_datetime(superstore_df[date_col])
superstore_df.dtypes

In [None]:
# Day of the week of each order, taken from order_date (0 = Monday ... 6 = Sunday).
# order_date was converted to datetime in the previous cell.
superstore_df['week_day'] = superstore_df['order_date'].dt.dayofweek
superstore_df.head()

In [None]:
# Month of each order, taken from order_date (1 = January ... 12 = December).
# order_date was converted to datetime earlier in the notebook.
superstore_df['Month_number'] = superstore_df['order_date'].dt.month
superstore_df.head()

In [None]:
# Binary target for the first model: 0 when profit is zero or negative,
# 1 otherwise (profitable).
superstore_df['profit_classification'] = np.where(superstore_df['profit'].le(0), 0, 1)
superstore_df

In [None]:
# Put the rows in chronological order (oldest order first).
# sort_values keeps each row's original index label, so after this cell the
# index is generally not in positional order.
superstore_df = superstore_df.sort_values(by='order_date')
superstore_df.head()

# ***Machine Learning Model - Profitable and not profitable orders***

Using a supervised machine learning model we are going to find out through classification when an order will be profitable or not profitable using as outcome the column **profit_classification**

# Transform data to fit into Machine Learning Model

- Eliminate unnecessary columns.
- Encode classification columns with OneHotEncoder.
- Choose independent and dependent variables.

In [None]:
# Drop the columns that are not used as features for the profitability model.
superstore_ML_df = superstore_df.drop(
    labels=[
        # identifier and name columns
        'row_id', 'order_id',
        'customer_id', 'customer_name',
        'product_id', 'product_name',
        # dates (week_day and Month_number were already extracted from order_date)
        'order_date', 'ship_date',
        # location columns
        'postal_code', 'city', 'state', 'country', 'market',
        # remaining columns left out of this model
        'sub_category', 'person', 'return',
        # profit_classification is derived from profit, so profit itself must go
        'profit',
    ],
    axis='columns',
)
superstore_ML_df.head()

In [None]:
# Names of the object-dtype columns, i.e. the ones that will be one-hot encoded
superstore_categories = [
    column for column, dtype in superstore_ML_df.dtypes.items() if dtype == "object"
]
superstore_categories

In [None]:
# Create a OneHotEncoder instance.
# The encoder is left in its default (sparse) mode and the result is densified
# with .toarray(): the `sparse=` keyword was renamed in newer scikit-learn
# releases, while this form works on old and new versions alike.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(superstore_ML_df[superstore_categories]).toarray()

# Keep the index of superstore_ML_df on the encoded frame. superstore_df was
# sorted by order_date, so its index labels are not in positional order; a
# default RangeIndex here would make the index-based merge in the next cell
# pair each order with another order's encoded values.
encode_df = pd.DataFrame(encoded_values, index=superstore_ML_df.index)

# Add the encoded variable names to the dataframe
# (get_feature_names_out replaces get_feature_names in newer scikit-learn).
feature_name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encode_df.columns = feature_name_getter(superstore_categories)
encode_df.head()

In [None]:
# Merge one-hot encoded features and drop the originals.
# encode_df holds one row per row of superstore_ML_df, in the same order, so
# the encoded rows are given superstore_ML_df's index before the index-based
# merge. Without this, a default RangeIndex on encode_df would be matched
# against the date-sorted labels of superstore_ML_df and attach the wrong
# encoded values to each row.
aligned_encode_df = encode_df.copy()
aligned_encode_df.index = superstore_ML_df.index
superstore_ML_df = superstore_ML_df.merge(aligned_encode_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which newer pandas rejects.
superstore_ML_df = superstore_ML_df.drop(columns=superstore_categories)
superstore_ML_df.head()

In [None]:
# Feature matrix: every column except the target.
# drop() already returns a new dataframe, so no explicit copy is needed.
X = superstore_ML_df.drop(columns=['profit_classification'])
X.head()

In [None]:
# Target vector: profit_classification (1 = profitable, 0 = not profitable)
y=superstore_ML_df['profit_classification']
y

# Train and test Machine learning

- Split the data into training and test sets using the default values: 75% for training and 25% for testing.

In [None]:
# Split into train and test sets (train_test_split's default split size);
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25)

# Learn the scaling parameters from the training rows only ...
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# ... and apply that same scaling to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

# Logistic Regression

- Fit the chosen algorithm (logistic regression) for classification.
- Extract the accuracy score and confusion matrix to determine whether the model is useful.

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier (lbfgs solver, at most 100 iterations);
# the seed is fixed for reproducibility.
classifier = LogisticRegression(random_state=25, max_iter=100, solver='lbfgs')

In [None]:
# Train the classifier on the scaled training features and their labels
classifier.fit(X_train_scaled, y_train)

In [None]:
# Predict the class of every test row and show the first 20 predictions
# next to the true labels
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Calculating the confusion matrix.
# labels=[0, 1] pins the row/column order (and the 2x2 shape) to the
# not-profitable / profitable labels used below.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Create a DataFrame from the confusion matrix: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    cm,
    index=["Actual Not-Profitable", "Actual Profitable"],
    columns=["Predicted Not-Profitable", "Predicted Profitable"],
)

cm_df

In [None]:
# Calculating the accuracy score.
# Compares the logistic regression predictions with the true test labels.
acc_score = accuracy_score(y_test, y_pred)
print(acc_score)

In [None]:
# Displaying results
# Prints the confusion matrix, accuracy and per-class report computed above.
# NOTE(review): `display` is not imported in this file — presumably the
# IPython/Jupyter built-in; confirm before running outside a notebook.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

# ***Machine Learning Model - Determine if an order will be returned***


Using a supervised machine learning model we are going to find out through classification if an order could be returned using as outcome the column **return_Yes**

# Transform data to fit into Machine Learning Model

- Eliminate unnecessary columns.
- Encode classification columns with OneHotEncoder.
- Choose independent and dependent variables.

In [None]:
# Drop the columns that are not used as features for the returns model.
# The 'return' column is kept because the target is built from it.
superstore_ML_GD = superstore_df.drop(
    labels=[
        # identifier and name columns
        'row_id', 'order_id',
        'customer_id', 'customer_name',
        'product_id', 'product_name',
        # dates and the date parts derived from order_date
        'order_date', 'ship_date', 'week_day', 'Month_number',
        # location columns
        'city', 'state', 'postal_code', 'country', 'market',
        # monetary columns and the profitability label derived from profit
        'profit', 'profit_classification', 'sales',
        # remaining columns left out of this model
        'person', 'sub_category',
    ],
    axis='columns',
)
superstore_ML_GD.head()

In [None]:
# Names of the object-dtype columns, i.e. the ones that will be one-hot encoded
superstore_categories = [
    column for column, dtype in superstore_ML_GD.dtypes.items() if dtype == "object"
]
superstore_categories

In [None]:
# Create a OneHotEncoder instance.
# The encoder is left in its default (sparse) mode and the result is densified
# with .toarray(): the `sparse=` keyword was renamed in newer scikit-learn
# releases, while this form works on old and new versions alike.
enc = OneHotEncoder()

# Fit and transform the OneHotEncoder using the categorical variable list
encoded_values = enc.fit_transform(superstore_ML_GD[superstore_categories]).toarray()

# Keep the index of superstore_ML_GD on the encoded frame. superstore_df was
# sorted by order_date, so its index labels are not in positional order; a
# default RangeIndex here would make the index-based merge in the next cell
# pair each order with another order's encoded values.
encode_df = pd.DataFrame(encoded_values, index=superstore_ML_GD.index)

# Add the encoded variable names to the dataframe
# (get_feature_names_out replaces get_feature_names in newer scikit-learn).
feature_name_getter = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names
encode_df.columns = feature_name_getter(superstore_categories)
encode_df.head()

In [None]:
# Merge one-hot encoded features and drop the originals.
# encode_df holds one row per row of superstore_ML_GD, in the same order, so
# the encoded rows are given superstore_ML_GD's index before the index-based
# merge. Without this, a default RangeIndex on encode_df would be matched
# against the date-sorted labels of superstore_ML_GD and attach the wrong
# encoded values to each row.
aligned_encode_df = encode_df.copy()
aligned_encode_df.index = superstore_ML_GD.index
superstore_ML_GD = superstore_ML_GD.merge(aligned_encode_df, left_index=True, right_index=True)
# `columns=` replaces the positional axis argument, which newer pandas rejects.
superstore_ML_GD = superstore_ML_GD.drop(columns=superstore_categories)
superstore_ML_GD.head()

In [None]:
# Define features set: everything except the one-hot columns of 'return'.
# return_Yes is the target; return_None is presumably its complement, so it
# is removed as well.
# drop() returns a new dataframe, so the explicit copy is unnecessary, and
# `columns=` already selects the axis.
X = superstore_ML_GD.drop(columns=["return_Yes", "return_None"])
X.head()

In [None]:
# Define target vector
# return_Yes is the one-hot column of 'return'; .values gives a plain NumPy array
y = superstore_ML_GD["return_Yes"].values
y

# Train and test Machine learning

- Split the data into training and test sets using the default values: 75% for training and 25% for testing.

In [None]:
# Split into train and test sets (train_test_split's default split size);
# the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=25)

# Learn the scaling parameters from the training rows only ...
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# ... and apply that same scaling to both partitions.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(partition) for partition in (X_train, X_test)
)

In [None]:
# Look at the size of the sample: number of training rows in each class
Counter(y_train)

# SMOTE

 - Apply SMOTE to balance the training set, which is imbalanced: about 5% of the rows are returns (1) and 95% are not (0).

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the scaled training set with SMOTE (fixed seed); the test set
# is left untouched.
X_resampled, y_resampled = SMOTE(
    sampling_strategy='auto',
    random_state=25,
).fit_resample(X_train_scaled, y_train)

In [None]:
# Number of rows in each class after SMOTE resampling
Counter(y_resampled)

# Balanced Random Forest

- Fit the chosen algorithm (Random Forest) for classification.
- Extract the accuracy score and confusion matrix to determine whether the model is useful.
- Get the feature importances to determine which variables influence the outcome the most.

In [None]:
# Fit a balanced random forest (500 trees, fixed seed) on the
# SMOTE-resampled training data.
from imblearn.ensemble import BalancedRandomForestClassifier

brf = BalancedRandomForestClassifier(random_state=25, n_estimators=500)
brf_model = brf.fit(X_resampled, y_resampled)
brf_model

In [None]:
# Predict the class of every test row and show the first 20 predictions
# next to the true labels
y_pred = brf_model.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})
results = results.reset_index(drop=True)
results.head(20)

In [None]:
# Confusion matrix of the test predictions, labelled for display:
# rows are the actual classes, columns are the predicted classes.
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    data=cm,
    columns=["Predicted No-return", "Predicted Return"],
    index=["Actual No-return", "Actual Return"],
)

# Show the labelled matrix
cm_df

In [None]:
# Displaying results
# Recompute the accuracy for this model's predictions; otherwise acc_score
# would still hold the value computed for the logistic regression model.
acc_score = accuracy_score(y_test, y_pred)
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

In [None]:
# Pair each importance with its feature name and order the pairs from the
# most to the least important feature
importances = brf_model.feature_importances_
feat_imp = list(zip(importances, X.columns))
feat_imp.sort(reverse=True)
feat_imp

In [None]:
# Horizontal bar chart of the feature importances, sorted ascending by importance
features = sorted(zip(X.columns, importances), key=lambda pair: pair[1])
cols = [name for name, _ in features]
width = [importance for _, importance in features]

fig, ax = plt.subplots()
fig.set_size_inches(10, 20)
ax.margins(y=0.001)  # almost no padding above and below the bars
ax.barh(y=cols, width=width)

plt.show()