In [156]:
#RETAIL RETURN RISK PREDICTIONS IN ML USING PYTHON#

In [157]:
#import packages 

import numpy as np  
import pandas as pd
import matplotlib.pyplot as plt #import it for graph plotting

from sklearn.model_selection import train_test_split #import it for model selection 
from sklearn.preprocessing import StandardScaler #import it as a preprocessing tool that standaridizes features by removing the mean and scaling to unit variance
from sklearn.linear_model import LogisticRegression #import it, so we can use logistic regression 
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report #import accuracy score, confusion matrix and classification report for predictions


In [158]:
#read the data and show the first 10 rows of the data

record = pd.read_csv ("retail_data.csv") #read the retail_data (csv value) to get the data
record.head(10) #show the first 10 rows of the data

Unnamed: 0,Order_ID,Order_Date,Ship_Date,Customer_ID,Customer_Name,Age,Gender,City,State,Country,...,Unit_Price,Discount,Payment_Mode,Shipping_Cost,Delivery_Days,Returned,Customer_Rating,Total_sales,profit 30%,customer lifetime basic
0,10001,1/3/2024,1/6/2024,C001,John Miller,34,Male,Chicago,Illinois,USA,...,120,0.1,Credit Card,12,3,No,4,216.0,48.0,324.0
1,10002,1/5/2024,1/10/2024,C002,Sarah Lee,29,Female,Dallas,Texas,USA,...,350,0.05,Debit Card,25,5,No,5,332.5,87.5,498.75
2,10003,1/7/2024,1/12/2024,C003,Michael Brown,42,Male,New York,New York,USA,...,1200,0.15,UPI,30,5,No,5,1020.0,180.0,1530.0
3,10004,1/8/2024,1/15/2024,C004,Emma Davis,37,Female,Los Angeles,California,USA,...,90,0.2,Credit Card,15,7,Yes,3,216.0,27.0,324.0
4,10005,1/10/2024,1/13/2024,C005,David Wilson,31,Male,Houston,Texas,USA,...,75,0.05,Cash,8,3,No,4,142.5,37.5,213.75
5,10006,1/12/2024,1/18/2024,C006,Olivia Taylor,26,Female,Phoenix,Arizona,USA,...,450,0.1,Credit Card,18,6,No,4,405.0,90.0,607.5
6,10007,1/15/2024,1/20/2024,C007,Daniel Martinez,45,Male,Seattle,Washington,USA,...,600,0.12,Debit Card,35,5,No,5,528.0,108.0,792.0
7,10008,1/18/2024,1/25/2024,C008,Sophia Anderson,33,Female,Miami,Florida,USA,...,110,0.1,UPI,14,7,Yes,3,198.0,44.0,297.0
8,10009,1/20/2024,1/23/2024,C009,James Thomas,39,Male,Boston,Massachusetts,USA,...,900,0.08,Credit Card,20,3,No,5,828.0,198.0,1242.0
9,10010,1/25/2024,1/30/2024,C010,Ava Jackson,28,Female,Denver,Colorado,USA,...,60,0.15,Debit Card,10,5,Yes,2,153.0,27.0,229.5


In [159]:
#view basic info and missing values
print (record.info()) #it shows the data types of each column, the number of non-null values, and the memory usage of the DataFrame.
print (record.isnull().sum()) #it shows the number of missing values in each column. It counts the number of null (NaN) values in each column and returns a Series with the column names as the index and the count of missing values as the values.

<class 'pandas.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Order_ID                  10 non-null     int64  
 1   Order_Date                10 non-null     str    
 2   Ship_Date                 10 non-null     str    
 3   Customer_ID               10 non-null     str    
 4   Customer_Name             10 non-null     str    
 5   Age                       10 non-null     int64  
 6   Gender                    10 non-null     str    
 7   City                      10 non-null     str    
 8   State                     10 non-null     str    
 9   Country                   10 non-null     str    
 10  Segment                   10 non-null     str    
 11  Product_Category          10 non-null     str    
 12  Product_Name              10 non-null     str    
 13  Quantity                  10 non-null     int64  
 14  Unit_Price              

In [160]:
#remove duplicates
record.drop_duplicates(inplace=True) #it removes duplicate rows from the DataFrame. The inplace=True argument modifies the original DataFrame directly, rather than creating a new one. After this operation, any duplicate rows in the DataFrame will be removed, and only unique rows will remain.
print(f"Duplicates remaining: {record.duplicated().sum()}") #it checks for any remaining duplicate rows in the DataFrame after the drop_duplicates operation. The record.duplicated() method returns a boolean Series indicating whether each row is a duplicate of a previous row. The sum() function then counts the number of True values in this Series, which represents the number of duplicate rows remaining in the DataFrame. The result is printed to the console.

Duplicates remaining: 0


In [161]:
#Convert dates to numeric values because scikit-learn's logistic regression can't handle datetime objects directly
record['Order_Date'] = pd.to_datetime(record['Order_Date']).astype('int64') // 10**9 
record['Ship_Date'] = pd.to_datetime(record['Ship_Date']).astype('int64') // 10**9 
#Encode categorical IDs as numbers
record['Customer_ID'] = record['Customer_ID'].astype('category').cat.codes #to convert it to numeric values by encoding it as numeric codes
record['Order_ID'] = record['Order_ID'].astype('category').cat.codes #same as above
record.dropna(inplace=True) #to drop rows with any remaining missing values

In [162]:
#remove outliers (ex:- using z-score or IQR method)
from scipy import stats #import stats from scipy to use z-score method for outlier detection
record = record[(np.abs(stats.zscore(record.select_dtypes(include='number'))) < 3).all(axis=1)] #it removes outliers from the DataFrame using the z-score method. The stats.zscore() function calculates the z-score for each numeric column in the DataFrame, which measures how many standard deviations a data point is from the mean. The np.abs() function takes the absolute value of the z-scores, and the condition < 3 checks if the absolute z-score is less than 3, which is a common threshold for identifying outliers. The .all(axis=1) method ensures that only rows where all numeric columns have an absolute z-score less than 3 are retained in the DataFrame.

In [163]:
#save the clean version of the dataset
record.to_csv('retail_data_cleaned.csv', index=False) #it saves the cleaned DataFrame to a new CSV file named 'retail_data_cleaned.csv'. The index=False argument prevents pandas from writing row indices to the CSV file, resulting in a cleaner output that only includes the data columns.

In [164]:
x = record[["Order_ID","Order_Date","Ship_Date","Customer_ID","Quantity","Unit_Price","Discount","Delivery_Days"]] #it selects the specified columns from the DataFrame and assigns them to the variable x. These columns are likely the features that will be used for machine learning model training and prediction.
#col_name= ["feature_names"]
y = record["Returned"] #it selects the "Returned" column from the DataFrame and assigns it to the variable y. This column is likely the target variable that indicates whether a retail return occurred (e.g., 1 for returned, 0 for not returned) and will be used for training the machine learning model to predict return risk.
print(x) #it prints the contents of the variable x, which contains the selected feature columns from the DataFrame. This allows you to verify that the correct columns have been selected and to inspect the data that will be used for model training.
print(y) #it prints the contents of the variable y, which contains the target variable ("Returned") from the DataFrame. This allows you to verify that the correct target variable has been selected and to inspect the data that will be used for model training and prediction.

   Order_ID  Order_Date  Ship_Date  Customer_ID  Quantity  Unit_Price  \
0         0     1704240    1704499            0         2         120   
1         1     1704412    1704844            1         1         350   
2         2     1704585    1705017            2         1        1200   
3         3     1704672    1705276            3         3          90   
4         4     1704844    1705104            4         2          75   
5         5     1705017    1705536            5         1         450   
6         6     1705276    1705708            6         1         600   
7         7     1705536    1706140            7         2         110   
8         8     1705708    1705968            8         1         900   
9         9     1706140    1706572            9         3          60   

   Discount  Delivery_Days  
0      0.10              3  
1      0.05              5  
2      0.15              5  
3      0.20              7  
4      0.05              3  
5      0.10           

In [165]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.35, random_state=42) #it splits the dataset into training and testing sets. The x variable contains the features, and the y variable contains the target variable. The test_size=0.35 argument specifies that 35% of the data will be used for testing, while the remaining 65% will be used for training. The random_state=42 argument ensures that the split is reproducible, meaning that the same random split will occur each time the code is run with this seed value. The resulting x_train, x_test, y_train, and y_test variables contain the respective training and testing data for features and target variable.

In [166]:
LR= LogisticRegression() #it creates an instance of the LogisticRegression class and assigns it to the variable LR. This instance will be used to fit the logistic regression model to the training data and make predictions on the test data.
LR.fit(x_train,y_train) #it fits the logistic regression model to the training data. The fit() method takes the training features (x_train) and the corresponding target variable (y_train) as input and trains the model to learn the relationship between the features and the target variable. After this step, the LR variable contains a trained logistic regression model that can be used for making predictions on new data.

0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [167]:
# Generate predictions and probabilities, save results to CSV
y_pred = LR.predict(x_test) #it uses the trained logistic regression model (LR) to make predictions on the test features (x_test). The predict() method returns an array of predicted class labels (e.g., 0 or 1) based on the input features. The resulting y_pred variable contains the predicted class labels for the test set, which can be compared to the actual labels (y_test) to evaluate the model's performance.
y_prob = LR.predict_proba(x_test) #it uses the trained logistic regression model (LR) to predict the probabilities of each class for the test features (x_test). The predict_proba() method returns an array of shape (n_samples, n_classes) where each element represents the probability of the sample belonging to each class. For binary classification, y_prob[:, 1] will give the probabilities of the positive class (e.g., returned), which can be used for further analysis or thresholding to make final predictions.
import pandas as pd #it imports the pandas library, which is a powerful data manipulation and analysis library in Python. It provides data structures like DataFrames that allow for easy handling of structured data. In this context, pandas is likely being used to create a DataFrame to store the actual labels, predicted labels, and predicted probabilities, and to save this information to a CSV file for further analysis or reporting.
results = pd.DataFrame({ #it creates a new DataFrame called results using the pandas library. The DataFrame is constructed from a dictionary where the keys are the column names ("Actual", "Predicted", "Probability") and the values are the corresponding data for each column. The "Actual" column contains the true labels from y_test, the "Predicted" column contains the predicted labels from y_pred, and the "Probability" column contains the predicted probabilities of the positive class from y_prob[:, 1]. This DataFrame will allow for easy comparison of the actual and predicted values, as well as the associated probabilities, and can be saved to a CSV file for further analysis or reporting.
    "Actual": y_test.values, #it assigns the actual labels from the y_test variable to the "Actual" column of the results DataFrame. The .values attribute is used to extract the underlying numpy array from the pandas Series, ensuring that the data is in a format suitable for creating the DataFrame.
    "Predicted": y_pred, #it assigns the predicted labels from the y_pred variable to the "Predicted" column of the results DataFrame. This allows for a direct comparison between the actual labels (y_test) and the predicted labels (y_pred) in the resulting DataFrame.
    "Probability": y_prob[:, 1], #it assigns the predicted probabilities of the positive class (e.g., returned) from the y_prob variable to the "Probability" column of the results DataFrame. The y_prob[:, 1] expression selects the probabilities for the positive class, which can be used for further analysis or thresholding to make final predictions.
})
results.to_csv("return_predictions.csv", index=False) #it saves the results DataFrame, which contains the actual labels, predicted labels, and predicted probabilities, to a new CSV file named "return_predictions.csv". The index=False argument prevents pandas from writing row indices to the CSV file, resulting in a cleaner output that only includes the data columns.
print(results.head()) #it prints the first few rows of the results DataFrame, which contains the actual labels, predicted labels, and predicted probabilities. This allows you to quickly inspect the contents of the DataFrame and verify that the data has been correctly organized before saving it to a CSV file.

  Actual Predicted   Probability
0     No        No  3.206299e-15
1     No        No  9.060714e-02
2     No       Yes  7.893072e-01
3     No        No  1.998465e-04


In [168]:
print(y_test) #it prints the actual labels from the y_test variable, which contains the true class labels for the test set. This allows you to verify the actual values that will be compared against the predicted labels (y_pred) to evaluate the model's performance.

8    No
1    No
5    No
0    No
Name: Returned, dtype: str


In [169]:
LR.score(x_test,y_test)*100 #it calculates the accuracy of the logistic regression model (LR) on the test set. The score() method computes the mean accuracy by comparing the predicted labels (y_pred) with the true labels (y_test). The result is multiplied by 100 to convert it to a percentage, which represents the accuracy of the model in correctly predicting the class labels for the test set.

75.0

In [170]:
results.to_csv("return_predictions.csv", index=False) #it saves the results DataFrame, which contains the actual labels, predicted labels, and predicted probabilities, to a new CSV file named "return_predictions.csv". The index=False argument prevents pandas from writing row indices to the CSV file, resulting in a cleaner output that only includes the data columns. This allows for easy sharing and analysis of the prediction results in a structured format.
print(results.head()) #it prints the first few rows of the results DataFrame, which contains the actual labels, predicted labels, and predicted probabilities. This allows you to quickly inspect the contents of the DataFrame and verify that the data has been correctly organized before saving it to a CSV file. It provides a snapshot of the prediction results for further analysis or reporting.

  Actual Predicted   Probability
0     No        No  3.206299e-15
1     No        No  9.060714e-02
2     No       Yes  7.893072e-01
3     No        No  1.998465e-04
