In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.graphics.gofplots import qqplot
import dedupe.variables
import dedupe
import os

In [None]:
# Load the tab-separated chipotle.tsv file from the DAT8 repository.
# dtype=str keeps every column as text; no type inference happens here.
df = pd.read_csv(
    "https://raw.githubusercontent.com/justmarkham/DAT8/refs/heads/master/data/chipotle.tsv",
    sep="\t",
    dtype=str,
    encoding="utf-8",
)

In [None]:
# Print one "<column> <missing count>" line per column.
for column_name, missing_count in df.isnull().sum().items():
    print(column_name, missing_count)

In [None]:
# Turn the numeric columns, loaded as text (dtype=str), into real numbers so
# that .mean(), the histograms and the z-score helpers below operate on
# values rather than strings.
# item_price: drop the currency sign and any surrounding blanks, then parse.
# astype(str) keeps this cell re-runnable once the column is already float.
df['item_price'] = pd.to_numeric(
    df['item_price'].astype(str).str.replace('$', '', regex=False).str.strip()
)
# quantity: digit strings -> integers.
df['quantity'] = pd.to_numeric(df['quantity'])
# NOTE(review): the dedupe String fields declared later compare text; cast
# these two columns back with .astype(str) before handing records to dedupe.

In [None]:
# Inspect the frame after the item_price clean-up.
df

In [None]:
# Field definitions for dedupe: each listed column is declared as a String
# variable. Only choice_description is flagged with has_missing=True.
features = [
    dedupe.variables.String("order_id"),
    dedupe.variables.String("quantity"),
    dedupe.variables.String("item_name"),
    dedupe.variables.String("choice_description", has_missing=True),
    dedupe.variables.String("item_price")
]

In [None]:
# Instantiate dedupe.Dedupe with the field definitions from `features`.
deduper = dedupe.Dedupe(features)

In [None]:
# Rows that repeat an earlier row in every column, ordered by item name.
duplicate_mask = df.duplicated()
df.loc[duplicate_mask].sort_values(by="item_name")

In [None]:
# Display the frame again before the numeric summaries.
df

In [None]:
# Average of the quantity column. Requires the column to hold numbers; note
# that the loader reads every column as text (dtype=str).
df["quantity"].mean()

In [None]:
# Side-by-side histograms of quantity and item_price on one wide figure.
fig, axs = plt.subplots(1, 2, figsize=(20, 6))
fig.suptitle("Features vs Class\n", size=18)

# Both panels share the same binning and bar styling.
hist_style = dict(bins=60, linewidth=0.5, edgecolor="White")
ax_quantity, ax_price = axs

ax_quantity.hist(df["quantity"], **hist_style)
ax_quantity.set_title("quantity Dist")

ax_price.hist(df["item_price"], **hist_style)
ax_price.set_title("item_price Dist")

## Outliers

In [None]:
def Zcore_outlier(df, threshold=3):
    """Return the entries of ``df`` whose absolute z-score exceeds ``threshold``.

    The z-score of a value is ``(value - mean) / sd``, where ``mean`` and
    ``sd`` (population standard deviation, ``ddof=0``) are computed over the
    non-NaN entries of ``df``.

    Parameters
    ----------
    df : one-dimensional sequence, numpy array or pandas Series
        Values convertible to ``float``. NaN entries are left out of the
        statistics and are never reported as outliers.
    threshold : float, default 3
        Cut-off on ``|z|``; the default keeps the three-sigma rule.

    Returns
    -------
    list
        Flagged entries in input order, exactly as yielded by iterating
        ``df``. Empty when ``df`` has no non-NaN entries, when the spread is
        zero or not finite, or when nothing exceeds the cut-off.
    """
    # Keep the original items so the returned elements retain their type.
    items = list(df)
    values = np.asarray(items, dtype=float)
    observed = values[~np.isnan(values)]
    if observed.size == 0:
        return []
    mean = observed.mean()
    sd = observed.std()
    # A zero or non-finite spread would turn every z-score into NaN/inf.
    if sd == 0 or not np.isfinite(sd):
        return []
    # NaN z-scores compare False, so missing values are never flagged.
    flagged = np.abs((values - mean) / sd) > threshold
    return [item for item, is_outlier in zip(items, flagged) if is_outlier]

In [None]:
def box_plot(df):
    """Draw a seaborn box plot of ``df`` titled "Box plot" and show it."""
    # boxplot draws on the current axes and hands that Axes back.
    axes = sns.boxplot(df)
    axes.set_title("Box plot")
    plt.show()

In [None]:
def qqplots(df):
    """Show a Q-Q plot of ``df`` with a standardized reference line."""
    # line="s" asks statsmodels for the standardized fit line.
    figure = qqplot(df, line="s")
    figure.axes[0].set_title("Normal QQPlot")
    plt.show()

In [None]:
def dist_plot(df):
    """Show a histogram of ``df`` with a KDE curve, titled "Distribution Plot"."""
    # linewidth=0 removes the outlines around the bars.
    axes = sns.histplot(df, kde=True, linewidth=0)
    sns.despine()
    axes.set_title("Distribution Plot")
    plt.show()

In [None]:
# Largest z-score outlier among the item prices, next to the column mean.
# default=None keeps max() from raising ValueError when nothing is flagged.
max(Zcore_outlier(df["item_price"]), default=None), df["item_price"].mean()

In [None]:
# Box plot of item_price via the box_plot helper.
box_plot(df["item_price"])

In [None]:
# Q-Q plot of item_price via the qqplots helper.
qqplots(df["item_price"])

In [None]:
# Print the quantity values flagged by the z-score rule.
print(Zcore_outlier(df["quantity"]))

In [None]:
# Histogram with KDE curve for quantity via the dist_plot helper.
dist_plot(df["quantity"])

In [None]:
# One indicator column per distinct item_name value.
dummies = pd.get_dummies(df["item_name"])

In [None]:
# Append the indicator columns to the right of the original frame.
merged = pd.concat((df, dummies), axis="columns")

In [None]:
# Display the frame with the item_name indicator columns appended.
merged