In [2]:
import pandas as pd
import plotly.express as px
from plotly.figure_factory import create_distplot
import plotly.graph_objs as go

In [2]:
# Load the online foods CSV into the notebook-level DataFrame `df` (used by the
# plotting helpers below) and preview the first rows.
df = pd.read_csv('dataset/onlinefoods.csv')
df.head()

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Pin code,Output,Feedback,Unnamed: 12
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,560001,Yes,Positive,Yes
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.977,77.5773,560009,Yes,Positive,Yes
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,560017,Yes,Negative,Yes
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,560019,Yes,Positive,Yes
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.985,77.5533,560010,Yes,Positive,Yes


Drop unnecessary columns: `Unnamed: 12` and `Pin code`. `Pin code` is just an ID.

In [3]:
# Reassign instead of mutating in place, and ignore columns that are already gone,
# so this cell can be re-run without raising KeyError. `axis` is redundant when
# `columns=` is given, so it is omitted.
df = df.drop(columns=['Pin code', 'Unnamed: 12'], errors='ignore')
df

Unnamed: 0,Age,Gender,Marital Status,Occupation,Monthly Income,Educational Qualifications,Family size,latitude,longitude,Output,Feedback
0,20,Female,Single,Student,No Income,Post Graduate,4,12.9766,77.5993,Yes,Positive
1,24,Female,Single,Student,Below Rs.10000,Graduate,3,12.9770,77.5773,Yes,Positive
2,22,Male,Single,Student,Below Rs.10000,Post Graduate,3,12.9551,77.6593,Yes,Negative
3,22,Female,Single,Student,No Income,Graduate,6,12.9473,77.5616,Yes,Positive
4,22,Male,Single,Student,Below Rs.10000,Post Graduate,4,12.9850,77.5533,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...
383,23,Female,Single,Student,No Income,Post Graduate,2,12.9766,77.5993,Yes,Positive
384,23,Female,Single,Student,No Income,Post Graduate,4,12.9854,77.7081,Yes,Positive
385,22,Female,Single,Student,No Income,Post Graduate,5,12.9850,77.5533,Yes,Positive
386,23,Male,Single,Student,Below Rs.10000,Post Graduate,2,12.9770,77.5773,Yes,Positive


Let's see an overview of the data

In [4]:
# Summary statistics for the numeric columns.
df.describe()

Unnamed: 0,Age,Family size,latitude,longitude
count,388.0,388.0,388.0,388.0
mean,24.628866,3.280928,12.972058,77.60016
std,2.975593,1.351025,0.044489,0.051354
min,18.0,1.0,12.8652,77.4842
25%,23.0,2.0,12.9369,77.565275
50%,24.0,3.0,12.977,77.5921
75%,26.0,4.0,12.997025,77.6309
max,33.0,6.0,13.102,77.7582


In [5]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age                         388 non-null    int64  
 1   Gender                      388 non-null    object 
 2   Marital Status              388 non-null    object 
 3   Occupation                  388 non-null    object 
 4   Monthly Income              388 non-null    object 
 5   Educational Qualifications  388 non-null    object 
 6   Family size                 388 non-null    int64  
 7   latitude                    388 non-null    float64
 8   longitude                   388 non-null    float64
 9   Output                      388 non-null    object 
 10  Feedback                    388 non-null    object 
dtypes: float64(2), int64(2), object(7)
memory usage: 33.5+ KB


There are no null values!

We're going to explore each column in the dataset to see if we can find any interesting insights.
After that, we'll explore the relationships between columns.

In [6]:
from typing import Literal

In [7]:
def single_label_dist_plot(
    label: str,
    type_: Literal[
        "histogram", "bar chart", "boxplot", "violin chart", "kde plot", "pie"
    ] = "histogram",
) -> go.Figure:
    """
    Plots the distribution of a single column of the dataset.

    Reads from the notebook-level DataFrame `df`.

    Args
        label: str
            The column name to be plotted.
        type_: str
            The type of the plot. It must be either 'histogram', 'bar chart',
            'boxplot', 'violin chart', 'kde plot', or 'pie'.
            Defaults to 'histogram'.

    Returns
        fig: plotly.graph_objs.Figure
            A plotly plot figure.

    Raises
        ValueError
            If `type_` is not one of the supported plot types, or if a
            'kde plot' is requested for a non-numeric column.
    """
    fig = go.Figure()
    yaxis_title = "Count"
    xaxis_title = label

    if type_ == "histogram":
        fig.add_trace(go.Histogram(x=df[label]))
    elif type_ == "bar chart":
        # Count once and reuse for both axes.
        counts = df[label].value_counts()
        fig.add_trace(go.Bar(x=counts.index, y=counts.values))
    elif type_ == "boxplot":
        fig.add_trace(go.Box(y=df[label], boxmean=True))
        # The values are drawn along y, so the column name labels the y-axis
        # (same convention as the violin chart below).
        yaxis_title = label
        xaxis_title = ""
    elif type_ == "violin chart":
        fig.add_trace(go.Violin(y=df[label], box_visible=True, meanline_visible=True))
        yaxis_title = label
        xaxis_title = ""
    elif type_ == "kde plot":
        # Check the column dtype rather than the first value so that an empty
        # column does not fail with IndexError; raise instead of returning a
        # string so the function always returns a Figure.
        if not pd.api.types.is_numeric_dtype(df[label]):
            raise ValueError("Cannot plot a KDE plot for a non-numeric column.")
        fig = create_distplot(
            [df[label].dropna().values], group_labels=[label], show_hist=False
        )
        yaxis_title = "Density"
    elif type_ == "pie":
        counts = df[label].value_counts()
        fig.add_trace(go.Pie(labels=counts.index, values=counts.values))
        # A pie chart has no axes to label.
        yaxis_title = ""
        xaxis_title = ""
    else:
        raise ValueError(
            "Invalid type. It must be either 'histogram', 'bar chart', 'boxplot', 'violin chart', 'kde plot', or 'pie'."
        )

    fig.update_layout(
        title=f"{type_.title()} of {label.title()} Distribution",
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
    )
    return fig

In [8]:
# NOTE(review): "Occupation" is an object (string) column per df.info(), so this
# violin chart is drawn over categories rather than numbers; "bar chart" or "pie"
# is probably the intended view here. TODO confirm
single_label_dist_plot("Occupation", "violin chart")

In [9]:
def bilable_dist_plot(
    label1: str,
    label2: str,
    type_: Literal["scatter", "line", "bar", "boxplot", "violin chart"] = "scatter",
) -> go.Figure:
    """
    Plots 2 labels against each other from the dataset.

    `label1` is always drawn on the x-axis and `label2` on the y-axis.
    Reads from the notebook-level DataFrame `df`.

    Args
        label1: str
            The column name plotted on the x-axis.
        label2: str
            The column name plotted on the y-axis.
        type_: str
            The type of the plot. It must be either 'scatter', 'line', 'bar',
            'boxplot', or 'violin chart'. Defaults to 'scatter'.

    Returns
        fig: plotly.graph_objs.Figure
            A plotly plot figure.

    Raises
        ValueError
            If `type_` is not one of the supported plot types.
    """
    fig = go.Figure()
    # Every trace below uses x=df[label1], y=df[label2], so the axis titles
    # are the same for all plot types.
    xaxis_title = label1
    yaxis_title = label2

    if type_ == "scatter":
        fig.add_trace(go.Scatter(x=df[label1], y=df[label2], mode="markers"))
    elif type_ == "line":
        fig.add_trace(go.Scatter(x=df[label1], y=df[label2], mode="lines"))
    elif type_ == "bar":
        fig.add_trace(go.Bar(x=df[label1], y=df[label2]))
    elif type_ == "boxplot":
        fig.add_trace(go.Box(x=df[label1], y=df[label2], boxmean=True))
    elif type_ == "violin chart":
        fig.add_trace(
            go.Violin(
                x=df[label1], y=df[label2], box_visible=True, meanline_visible=True
            )
        )
    else:
        raise ValueError(
            "Invalid type. It must be either 'scatter', 'line', 'bar', 'boxplot', or 'violin chart'."
        )
    fig.update_layout(
        title=f"{type_.title()} of {label1.title()} vs {label2.title()}",
        xaxis_title=xaxis_title,
        yaxis_title=yaxis_title,
    )
    return fig

In [10]:
# NOTE(review): both "Gender" and "Marital Status" are object (string) columns per
# df.info(), so the violin's y-axis is categorical rather than numeric — confirm
# this is the intended view.
bilable_dist_plot("Gender", "Marital Status", "violin chart")

In [72]:
def multiabel_dist_plot(
    label1: str,
    label2: str,
    label3: str,
    label4=None,
) -> go.Figure:
    """
    Plots a faceted histogram of 3 (optionally 4) labels from the dataset.

    Draws a histogram of `label1`, split into one facet row per value of
    `label2` and one facet column per value of `label3`. When `label4` is
    given, it is used as the animation frame.
    Reads from the notebook-level DataFrame `df`.

    Args
        label1: str
            The column name whose histogram is drawn (x-axis).
        label2: str
            The column name used for the facet rows.
        label3: str
            The column name used for the facet columns.
        label4: str, optional
            The column name used for the animation frame. Defaults to None
            (no animation).

    Returns
        fig: plotly.graph_objs.Figure
            A plotly plot figure.
    """
    # Only mention label4 in the title when it is actually used.
    plotted = [label1, label2, label3]
    if label4 is not None:
        plotted.append(label4)
    title = " vs ".join(name.title() for name in plotted)

    fig = px.histogram(
        df,
        x=label1,
        facet_row=label2,
        facet_col=label3,
        animation_frame=label4,
        title=title,
    )

    return fig

In [78]:
# Age histograms faceted by marital status (rows) and gender (columns).
multiabel_dist_plot("Age", "Marital Status", "Gender")
