In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Load the Chipotle dataset (tab-separated file hosted in the DAT8 GitHub repository)
url = 'https://raw.githubusercontent.com/justmarkham/DAT8/master/data/chipotle.tsv'
chipotle_df = pd.read_csv(url, sep='\t')

# Dataset Exploration
# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
chipotle_df.info()

# Display the first few rows of the dataset
print(chipotle_df.head())

# Visualize the dataset: heatmap of the boolean isnull() mask, one cell per
# (row, column) entry, so missing entries stand out by color.
plt.figure(figsize=(10, 6))
sns.heatmap(chipotle_df.isnull(), cmap='viridis', cbar=False)
plt.title('Missing Values in the Chipotle Dataset')
plt.show()

In [None]:


# Report how many missing entries each column contains
print("Missing Values:")
print(chipotle_df.isnull().sum())

# Outlier scan, restricted to the numeric columns for simplicity.
# A row is flagged when its value lies more than 3 standard deviations
# from the column mean (|z-score| > 3).
numeric_columns = chipotle_df.select_dtypes(include='number').columns
for col_name in numeric_columns:
    values = chipotle_df[col_name]

    # Standardize the column: subtract the mean, divide by the standard deviation
    standardized = values.sub(values.mean()).div(values.std())

    # Keep only the rows whose absolute z-score exceeds the threshold
    flagged_rows = chipotle_df.loc[standardized.abs().gt(3)]
    if flagged_rows.empty:
        # Nothing to report or plot for this column
        continue

    print(f"Outliers in {col_name}:")
    print(flagged_rows)

    # Box plot of the full column so the flagged points are seen in context
    plt.figure(figsize=(10, 6))
    sns.boxplot(x=values)
    plt.title(f'Outliers in {col_name}')
    plt.show()

In [None]:
# Library Setup - nothing to do here, assuming pandas and NumPy are already installed

# Data Cleaning
# Handle missing values - decide per column whether to impute or remove them, based on
# the missing-value counts printed earlier.
# NOTE(review): an earlier version of this comment stated the dataset has no missing
# values; confirm against the isnull().sum() output before relying on that.
# Example: chipotle_df = chipotle_df.dropna()

# Handle duplicate records
# Count duplicates BEFORE dropping them: once drop_duplicates() has run,
# duplicated() is all False and the chart below could never show a duplicate.
# reindex() pins the order to [False, True] (filling an absent class with 0) so the
# bar colors and tick labels always line up with the right category.
duplicate_counts = (
    chipotle_df.duplicated()
    .value_counts()
    .reindex([False, True], fill_value=0)
)

# Visualize duplicate records
plt.figure(figsize=(10, 6))
duplicate_counts.plot(kind='bar', color=['skyblue', 'salmon'])
plt.title('Distribution of Duplicate Records')
plt.xticks([0, 1], ['No Duplicates', 'Duplicates'], rotation=0)
plt.show()

# Remove the duplicate rows now that they have been reported
chipotle_df.drop_duplicates(inplace=True)

In [None]:

# Data Transformation
# Example: Creating a new feature - Total price for each item
# No numeric conversion is applied to 'item_price' when the file is loaded, so it may
# still be text with a currency symbol (e.g. "$2.39 "). Multiplying an integer
# quantity by such a string repeats the string instead of multiplying the amount,
# so the price is parsed to a number first. astype(str) keeps this safe to re-run
# even if the column is already numeric. The 'item_price' column itself is left as loaded.
item_price_numeric = pd.to_numeric(
    chipotle_df['item_price'].astype(str).str.strip().str.lstrip('$')
)
# NOTE(review): confirm that item_price is a per-unit price; if it is already the
# line total for the given quantity, this product double-counts.
chipotle_df['total_price'] = chipotle_df['quantity'] * item_price_numeric

# Visualize the new feature
plt.figure(figsize=(10, 6))
sns.histplot(chipotle_df['total_price'], kde=True, color='skyblue')
plt.title('Distribution of Total Price')
plt.show()

# Handling categorical variables (if any)
# Example: Convert 'item_name' to a categorical variable
chipotle_df['item_name'] = chipotle_df['item_name'].astype('category')

# Visualize the distribution of 'item_name', most frequent item first
plt.figure(figsize=(12, 8))
sns.countplot(y=chipotle_df['item_name'], order=chipotle_df['item_name'].value_counts().index, palette='viridis')
plt.title('Distribution of Items')
plt.show()

In [None]:




# Handling Outliers
# Outliers can either be removed or transformed; this example removes them.
# Keep only the rows whose 'quantity' is at most 10.
quantity_within_limit = chipotle_df['quantity'].le(10)
chipotle_df = chipotle_df.loc[quantity_within_limit]

# Box plot of 'quantity' for the filtered dataset
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=chipotle_df['quantity'], color='salmon', ax=ax)
ax.set_title('Distribution of Quantity after Handling Outliers')
plt.show()

# Data Validation
# Eyeball check: show the inputs next to the derived total_price column
# for the first few rows.
sanity_columns = ['quantity', 'item_price', 'total_price']
print("Sanity Check - Total Price:")
print(chipotle_df[sanity_columns].head())

# Documentation
# Record each cleaning step and the reasoning behind any transformation or imputation.

# Optionally persist the cleaned and transformed dataset
# Example: chipotle_df.to_csv('cleaned_chipotle_data.csv', index=False)
