# Importing libraries and dataset for algorithms


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### Importing dataset and reading the dataset

In [2]:
dataset = pd.read_csv("Supermart Grocery Sales - Retail Analytics Dataset.csv")
supermarket = dataset.copy()

#### Reading the firsdt five rows of the dataset

In [3]:
supermarket.head()

Unnamed: 0,Order ID,Customer Name,Category,Sub Category,City,Order Date,Quarters,Order Year,Region,Sales,Discount,Profit,State
0,OD1,Harish,Oil & Masala,Masalas,Vellore,11/8/2017,4,2017,North,1254,0.12,401.28,Tamil Nadu
1,OD2,Sudha,Beverages,Health Drinks,Krishnagiri,11/8/2017,4,2017,South,749,0.18,149.8,Tamil Nadu
2,OD3,Hussain,Food Grains,Atta & Flour,Perambalur,6/12/2017,2,2017,West,2360,0.21,165.2,Tamil Nadu
3,OD4,Jackson,Fruits & Veggies,Fresh Vegetables,Dharmapuri,10/11/2016,4,2016,South,896,0.25,89.6,Tamil Nadu
4,OD5,Ridhesh,Food Grains,Organic Staples,Ooty,10/11/2016,4,2016,South,2355,0.26,918.45,Tamil Nadu


## Applying label encoding to read the categorical data while changing it into numbers

In [4]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()

In [5]:
supermarket['Category'] = le.fit_transform(supermarket['Category'])
supermarket['Sub Category'] = le.fit_transform(supermarket['Sub Category'])
supermarket['Region'] = le.fit_transform(supermarket['Region'])
supermarket['Order Year'] = le.fit_transform(supermarket['Order Year'])

In [6]:
supermarket

Unnamed: 0,Order ID,Customer Name,Category,Sub Category,City,Order Date,Quarters,Order Year,Region,Sales,Discount,Profit,State
0,OD1,Harish,5,14,Vellore,11/8/2017,4,2,2,1254,0.12,401.28,Tamil Nadu
1,OD2,Sudha,1,13,Krishnagiri,11/8/2017,4,2,3,749,0.18,149.80,Tamil Nadu
2,OD3,Hussain,3,0,Perambalur,6/12/2017,2,2,4,2360,0.21,165.20,Tamil Nadu
3,OD4,Jackson,4,12,Dharmapuri,10/11/2016,4,1,3,896,0.25,89.60,Tamil Nadu
4,OD5,Ridhesh,3,18,Ooty,10/11/2016,4,1,3,2355,0.26,918.45,Tamil Nadu
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9989,OD9990,Sudeep,2,9,Madurai,12/24/2015,4,0,4,945,0.16,359.10,Tamil Nadu
9990,OD9991,Alan,0,1,Kanyakumari,7/12/2015,3,0,4,1195,0.26,71.70,Tamil Nadu
9991,OD9992,Ravi,3,20,Bodi,6/6/2017,2,2,4,1567,0.16,501.44,Tamil Nadu
9992,OD9993,Peer,5,22,Pudukottai,10/16/2018,4,3,4,1659,0.15,597.24,Tamil Nadu


# Linear Regression

#### Taking X as categorical data i.e "Category" and "Sub Category"
#### Taking y as numerical data i.e "Sales"

In [11]:
X = supermarket.iloc[:,9:10].values
y = supermarket.iloc[:,2:4].values

#### Training and testing the dataset

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X ,y, test_size = 0.2, random_state = 0)

#### Applying linear regression

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
regressor = LinearRegression()
regressor.fit(X_train, y_train)         #Training the data

LinearRegression()

In [15]:
print(regressor.intercept_)

[ 3.10122576 10.95507997]


In [16]:
print(regressor.coef_)

[[-6.03121315e-05]
 [ 6.67756256e-05]]


In [17]:
y_pred = regressor.predict(X_test)

#### Finding the metric values

In [18]:
from sklearn import metrics
print('Mean Squared Error:', metrics.mean_squared_error(y_test,y_pred))
print('Root Mean Squared:', np.sqrt(metrics.mean_squared_error(y_test,y_pred)))
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test,y_pred))
print('R Squared:', metrics.r2_score(y_test,y_pred))

Mean Squared Error: 24.434359410850295
Root Mean Squared: 4.943112320274575
Mean Absolute Error: 3.7896719471865774
R Squared: -0.0008970778337450414


## Conclusion : This algorithm is isnt applicable due to low accuracy

# LOGISTIC REGRESSION

### Taking 'a' as the values of sales and profit
### Taking 'b' as the values of Category

In [39]:
a = supermarket.iloc[:,9:10].values
b = supermarket.iloc[:,2].values

In [40]:
a_train,a_test,b_train,b_test = train_test_split(a ,b, test_size = 0.2, random_state = 0)

In [41]:
sc = StandardScaler()
a_train = sc.fit_transform(a_train)
a_test = sc.transform(a_test)

In [42]:
clf = LogisticRegression()
clf.fit(a_train,b_train)

LogisticRegression()

In [43]:
b_pred = clf.predict(a_test)
print("Accuracy score:", accuracy_score(b_test,b_pred)*100,"%")

Accuracy score: 16.05802901450725 %


## Conclusion : This algorithm isnt applicable because of having less accuracy

# Decision Tree

### Taking 'X' as the values of sales and profit
### Taking 'y' as the values of Category

In [44]:
X = supermarket.iloc[:,9:10].values
y = supermarket.iloc[:,2].values

In [45]:
X

array([[1254],
       [ 749],
       [2360],
       ...,
       [1567],
       [1659],
       [1034]], dtype=int64)

In [46]:
y

array([5, 1, 3, ..., 3, 5, 3])

### Training and testing the dataset

In [47]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2,random_state=0)


In [48]:
X_train.shape

(7995, 1)

In [49]:
y_test.shape

(1999,)

### Applying Decision tree algorithm

In [50]:
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit(X_train,y_train)

DecisionTreeClassifier()

In [51]:
y_pred = clf.predict(X_test)

In [52]:
from sklearn import metrics

In [53]:
metrics.confusion_matrix(y_test,y_pred)

array([[83, 56, 49, 29, 27, 13, 23],
       [67, 59, 34, 36, 38, 28, 32],
       [64, 57, 47, 36, 33, 23, 24],
       [68, 56, 55, 41, 16, 18, 25],
       [75, 45, 44, 32, 30, 24, 21],
       [61, 50, 38, 31, 32, 25, 25],
       [66, 76, 48, 43, 27, 40, 29]], dtype=int64)

In [54]:
print("Accuracy Score: " ,metrics.accuracy_score(y_test,y_pred)*100,"%")

Accuracy Score:  15.70785392696348 %


In [55]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.17      0.30      0.22       280
           1       0.15      0.20      0.17       294
           2       0.15      0.17      0.16       284
           3       0.17      0.15      0.16       279
           4       0.15      0.11      0.13       271
           5       0.15      0.10      0.12       262
           6       0.16      0.09      0.11       329

    accuracy                           0.16      1999
   macro avg       0.16      0.16      0.15      1999
weighted avg       0.16      0.16      0.15      1999



## Conclusion : This algorithm isnt applicable because of having less accuracy

# SVM

In [56]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

### Taking 'X' as the values of sales and profit
### Taking 'y' as the values of Category

In [57]:
X = supermarket.iloc[:,9:12:2].values
y = supermarket.iloc[:,2].values

In [58]:
from sklearn import svm

### Training and testing the dataset

In [59]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=0)

In [60]:
!pip install dtreeviz

Defaulting to user installation because normal site-packages is not writeable


In [61]:
from sklearn.ensemble import RandomForestClassifier
from dtreeviz.trees import dtreeviz

ImportError: cannot import name 'dtreeviz' from 'dtreeviz.trees' (C:\Users\user\AppData\Roaming\Python\Python39\site-packages\dtreeviz\trees.py)

## Conclusion : Jupyter is not supporting for dtreeviz 
## For Random forest we are attaching the googlecolab file
#                                  Thank You