In [None]:
import plotly.express as px  # Importing Plotly Express for easy plotting
import pandas as pd  # Importing pandas for data manipulation

# Load the penguins dataset
penguins = pd.read_csv('penguins.csv')  # Read the dataset from a CSV file

# Overlay histogram of flipper_length_mm, one colour per species
fig = px.histogram(penguins, x='flipper_length_mm', color='species', barmode='overlay')

# Group once and reuse the grouping for every per-species statistic.
# The pandas reducers used below (mean, median, std, quantile, min, max) skip NaN values by default.
flipper_by_species = penguins.groupby('species')['flipper_length_mm']
mean_values = flipper_by_species.mean()  # Mean flipper length per species
median_values = flipper_by_species.median()  # Median flipper length per species

# Annotate the histogram with location and spread markers for each species
for species, flipper in flipper_by_species:
    mean_value = mean_values[species]
    median_value = median_values[species]
    std_dev = flipper.std()  # Sample standard deviation
    q1, q3 = flipper.quantile([0.25, 0.75])  # Bounds of the interquartile range

    # Location: the species name is part of the label so the lines of different species can be told apart
    fig.add_vline(x=mean_value, line_color='red', line_width=2,
                  annotation_text=f'{species} mean', annotation_position='top')
    fig.add_vline(x=median_value, line_color='blue', line_width=2,
                  annotation_text=f'{species} median', annotation_position='top')

    # Spread: add_vrect spans the whole plot height, so the bands do not depend on
    # a hard-coded y-range that may not match the tallest histogram bar
    fig.add_vrect(x0=flipper.min(), x1=flipper.max(),
                  fillcolor='green', opacity=0.2, line_width=0)  # Range (min to max)
    fig.add_vrect(x0=q1, x1=q3,
                  fillcolor='orange', opacity=0.2, line_width=0)  # Interquartile range
    fig.add_vrect(x0=mean_value - 2 * std_dev, x1=mean_value + 2 * std_dev,
                  fillcolor='purple', opacity=0.2, line_width=0)  # Mean +/- 2 standard deviations

# Show the figure
fig.show()  # Display the histogram

In [None]:
# main function that tests the histogram plotting
def main_function_histogram():
    """Sanity-check the notebook-level `penguins` object used by the histogram cell.

    Raises:
        AssertionError: if `penguins` is not a pandas DataFrame, or if it has
            no 'flipper_length_mm' column.
    """
    # Check 1: the dataset was loaded as a DataFrame
    loaded_as_frame = isinstance(penguins, pd.DataFrame)
    assert loaded_as_frame

    # Check 2: the column the histogram is built from is present
    has_flipper_column = 'flipper_length_mm' in penguins.columns
    assert has_flipper_column

In [None]:
import pandas as pd  # Import pandas for data manipulation
import seaborn as sns  # Import seaborn for statistical data visualization
import matplotlib.pyplot as plt  # Import matplotlib for plotting

# Load the penguins dataset
penguins = pd.read_csv('penguins.csv')  # Read the CSV file containing the penguins dataset

# Species to plot, in order of first appearance; rows without a species label are skipped
species = penguins['species'].dropna().unique()

# One subplot row per species, so the grid always matches the data instead of assuming three species.
# squeeze=False keeps the returned grid 2-D even when there is a single species.
fig, axes_grid = plt.subplots(nrows=len(species), ncols=1,
                              figsize=(10, 5 * len(species)), squeeze=False)
axes = axes_grid[:, 0]  # 1-D view: one Axes per species

# Draw one filled KDE of flipper_length_mm per species
for ax, sp in zip(axes, species):
    data = penguins[penguins['species'] == sp]  # Rows for the current species
    sns.kdeplot(data['flipper_length_mm'], ax=ax, fill=True)
    ax.set_title(f'KDE Plot for {sp}')
    ax.set_xlabel('Flipper Length (mm)')
    ax.set_ylabel('Density')

plt.tight_layout()  # Adjust layout to prevent overlap
plt.show()  # Display the plots

In [None]:
# main function that tests the KDE plotting
def main_kde_plot():
    """Check that the penguins dataset loads and contains the expected species.

    Reads 'penguins.csv' from the current working directory into a local
    DataFrame and verifies that it has exactly three unique species values.

    Raises:
        Exception: whatever `pd.read_csv` raises when the file cannot be read
            (e.g. FileNotFoundError). The error is printed and then re-raised
            so the real cause is reported instead of a later, unrelated
            failure on the unloaded variable.
        AssertionError: if the loaded data is None or does not contain exactly
            three unique species values.
    """
    # use case 1: the dataset can be loaded
    try:
        penguins = pd.read_csv('penguins.csv')  # Attempt to load the dataset
    except Exception as e:
        print(f"Error loading dataset: {e}")
        raise  # Do not continue with an unloaded dataset
    assert penguins is not None, "penguins dataset was not loaded"

    # use case 2: the number of unique species is correct
    species = penguins['species'].unique()  # Get unique species
    assert len(species) == 3, f"expected 3 species, found {len(species)}"

# Execute the dataset checks; a failing assertion stops this cell.
main_kde_plot()

1. Descriptions of the Three Visualization Methods

- Box Plot:
  - A box plot summarizes the distribution of data based on five summary statistics: minimum, first quartile (Q1), median (Q2), third quartile (Q3), and maximum. It clearly shows the median, the interquartile range (IQR), and any potential outliers. This method is great for comparing distributions across different groups.

- Histogram:
  - A histogram displays the distribution of numerical data by dividing the data into bins (intervals) and counting the number of observations in each bin. It provides a visual representation of the frequency distribution and shape of the data. However, the choice of bin size can significantly affect the appearance of the histogram.

- Kernel Density Estimator (KDE):
  - A KDE plot is a smoothed version of the histogram that estimates the probability density function of a continuous random variable. It provides a continuous curve that represents the data distribution, allowing for a better understanding of the underlying patterns. The choice of bandwidth can influence the smoothness of the curve.

2. Pros and Cons

- Box Plot:
  - Pros: 
    - Clearly shows median and quartiles.
    - Highlights outliers effectively.
    - Useful for comparing multiple groups.
  - Cons: 
    - Does not show the distribution shape.
    - Can be less informative for small datasets.

- Histogram:
  - Pros: 
    - Good for visualizing the frequency distribution.
    - Easy to understand and interpret.
  - Cons: 
    - Sensitive to bin size and placement.
    - Can obscure details in the data distribution.

- KDE:
  - Pros: 
    - Provides a smooth estimate of the distribution.
    - Captures underlying patterns better than histograms.
  - Cons: 
    - Sensitive to bandwidth selection.
    - May misrepresent data if bandwidth is not chosen appropriately.

3. Personal Preference

Personally, I prefer the KDE plot because it provides a more nuanced view of the data distribution. The smoothness of the KDE allows for better identification of patterns and trends, which can be particularly useful in exploratory data analysis. However, I also recognize the value of box plots for their clarity in showing summary statistics and outliers, especially when comparing multiple groups.


Which datasets have similar means and similar variances?

Data B and Data D:
Which datasets have similar means but quite different variances?

Data A and Data B: Both have similar means, but their variances are quite different.
Which datasets have similar variances but quite different means?

Data B and Data D: Both have similar variances, but their means are quite different.
Which datasets have quite different means and quite different variances?

Data A and Data C:

Right Skewness: In a right-skewed distribution, the tail on the right side is longer or fatter than the left side.
Left Skewness: In a left-skewed distribution, the tail on the left side is longer or fatter than the right side.
The mean is affected more by extreme values (outliers) than the median.
In a right-skewed distribution, the mean is typically greater than the median.
In a left-skewed distribution, the mean is typically less than the median.
This relationship occurs because the mean takes into account all values in the dataset, while the median only considers the middle value, making it less sensitive to outliers.

In [None]:
from scipy import stats
import pandas as pd
import numpy as np

# Seeded generator shared by both draws, so the samples (and every figure and
# statistic derived from them) are identical on each Restart & Run All.
rng = np.random.default_rng(130)

# First sample: 1000 draws from gamma(a=2, scale=2)
sample1 = stats.gamma(a=2, scale=2).rvs(size=1000, random_state=rng)
fig1 = px.histogram(pd.DataFrame({'data': sample1}), x="data")
# USE `fig1.show(renderer="png")` FOR ALL GitHub and MarkUs SUBMISSIONS

# Only a cell's final expression is echoed, so the statistics are printed explicitly
sample1_mean = sample1.mean()
sample1_median = np.quantile(sample1, [0.5])  # median
print(f"sample1 mean:   {sample1_mean:.3f}")
print(f"sample1 median: {sample1_median[0]:.3f}")

# Second sample: an independent draw from the same distribution, negated so its shape is mirrored
sample2 = -stats.gamma(a=2, scale=2).rvs(size=1000, random_state=rng)

In [None]:
import plotly.express as px

def _histogram_with_mean_median(sample, title):
    """Build a histogram of `sample` marked with its mean and median.

    Args:
        sample: 1-D NumPy array of numeric values.
        title: Figure title, so each rendered image identifies its sample.

    Returns:
        The plotly figure, with a red vertical line at the mean and a blue
        vertical line at the median.
    """
    figure = px.histogram(pd.DataFrame({'data': sample}), x="data", title=title)
    figure.add_vline(x=sample.mean(), line_color='red',
                     annotation_text='Mean', annotation_position='top right')
    figure.add_vline(x=np.median(sample), line_color='blue',
                     annotation_text='Median', annotation_position='top right')
    return figure


# Visualizing the first sample (right-skewed)
fig1 = _histogram_with_mean_median(sample1, "Right-skewed sample: mean vs. median")
fig1.show(renderer="png")

# Visualizing the second sample (left-skewed)
fig2 = _histogram_with_mean_median(sample2, "Left-skewed sample: mean vs. median")
fig2.show(renderer="png")

In [None]:
  import plotly.express as px

# Histograms of the first sample (right-skewed) and the second sample (left-skewed)
fig1 = px.histogram(pd.DataFrame({'data': sample1}), x="data")
fig2 = px.histogram(pd.DataFrame({'data': sample2}), x="data")

# Mark each histogram with its mean (red) and median (blue), then render it as a static PNG
for figure, values in ((fig1, sample1), (fig2, sample2)):
    centre_marks = (
        (values.mean(), 'red', 'Mean'),
        (np.quantile(values, [0.5])[0], 'blue', 'Median'),
    )
    for position, colour, label in centre_marks:
        figure.add_vline(x=position, line_color=colour,
                         annotation_text=label, annotation_position='top right')
    figure.show(renderer="png")

This extended code will create two histograms, one for each sample, with vertical lines indicating the mean and median.
By comparing the two plots, you can visually observe how the mean and median differ in right-skewed versus left-skewed distributions.
In summary, the relationship between mean and median in the context of skewness is crucial for understanding data distributions.
Right-skewed distributions have a mean greater than the median, while left-skewed distributions have a mean less than the median.
This is primarily due to the sensitivity of the mean to extreme values compared to the median.

In [None]:
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/manuelamc14/fast-food-Nutritional-Database/main/Tables/nutrition.csv")

# Inspect the first few rows, the column names, and basic summary statistics.
# Only the final expression of a cell is echoed, so each result is displayed explicitly.
display(df.head())
display(df.columns)
display(df.describe())

# NOTE(review): the plots below assume columns named 'Calories', 'Category' and
# 'Sodium' — confirm the exact spelling against the df.columns output above.

# 1. Distribution of Calories
plt.figure(figsize=(8, 5))
sns.histplot(df['Calories'], bins=30, kde=True, color='orange')
plt.title("Distribution of Calories in Fast Food Items")
plt.xlabel("Calories")
plt.ylabel("Frequency")
plt.show()

# 2. Calories by Category
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='Category', y='Calories')
plt.xticks(rotation=45, ha='right')  # Slant the category labels
plt.title("Calories by Category")
plt.show()

# 3. Relationship Between Sodium and Calories
plt.figure(figsize=(8, 5))
sns.scatterplot(data=df, x='Calories', y='Sodium', hue='Category', palette='Set1', alpha=0.7)
plt.title("Sodium vs. Calories by Category")
plt.show()


Step 1: Load and Inspect the Dataset
Step 2: Summary Statistics
Step 3: Visualizations
1. Distribution of Calories
2. Calories by Category
3. Relationship Between Sodium and Calories
Step 4: Additional Insights

In [None]:
import pandas as pd
import plotly.express as px

# Read the baby names table from GitHub
bn = pd.read_csv('https://raw.githubusercontent.com/hadley/data-baby-names/master/baby-names.csv')

# Append the sex to each name so the same name is tracked separately for each sex
bn['name'] = bn['name'] + " " + bn['sex']

# Within each year, rank names by 'percent' (rank 1 = largest value), then order rows by name and year
bn['rank'] = bn.groupby('year')['percent'].rank(ascending=False)
bn = bn.sort_values(['name', 'year'])

# Difference in 'percent' from the previous row of the sorted table
bn['percent change'] = bn['percent'].diff()
# A row whose name differs from the row before it starts a new name (the first row always does);
# such rows have no earlier row for that name, so they take the 'percent' value itself
new_name = bn['name'].ne(bn['name'].shift()).tolist()
bn.loc[new_name, 'percent change'] = bn.loc[new_name, 'percent']

# Keep only rows whose 'percent' value exceeds 0.001
bn = bn.loc[bn['percent'] > 0.001]

# Animated scatter plot: one frame per year, one marker per name
scatter_options = dict(
    x="percent change",
    y="rank",
    animation_frame="year",
    animation_group="name",
    size="percent",
    color="sex",
    hover_name="name",
    size_max=50,
    range_x=[-0.005, 0.005],
)
fig = px.scatter(bn, **scatter_options)

# Reverse the y-axis so that rank 1 appears at the top
fig.update_yaxes(autorange='reversed')

# Display the plot
fig.show(renderer="png")


x="percent change": The x-axis represents the year-over-year change in the percentage of babies given a specific name.
y="rank": The y-axis represents the rank of the name in a given year, with rank 1 being at the top.
size="percent": The size of the points is proportional to the percentage of babies with that name in the given year.
color="sex": Different colors are used to distinguish between male and female names.
animation_frame="year": The plot will animate over the years, showing how names change in rank and percentage.
size_max=50: Limits the maximum size of the points.
range_x=[-0.005, 0.005]: Limits the x-axis to the specified range for clearer visualization.

yes