# Day 9: Instagram Stories Daily User Creation Patterns

You are a Product Analyst on the Instagram Stories team investigating story creation patterns. The team wants to understand the distribution of stories created by users daily. You will analyze user storytelling behavior to optimize engagement strategies.

In [None]:
import pandas as pd
import numpy as np

# Raw story-creation records, one dict per (user_id, story_date) observation.
# The data is intentionally messy so the cleaning steps below have something
# to do: it contains missing values, mixed date formats ("07/15/2024",
# "2024/10/10"), an unparseable date, exact duplicate rows, and user ids that
# differ only by case or leading whitespace.
# Missing values are spelled `None`: the JSON-style `null` is not a Python
# name and raises NameError when the cell is executed.
stories_data_data = [
    {"user_id": "user_001", "story_date": "2024-07-03", "story_count": 3},
    {"user_id": "user_001", "story_date": "2024-07-03", "story_count": 3},
    {"user_id": "user_001", "story_date": "2024-08-15", "story_count": 5},
    {"user_id": "user_001", "story_date": "2024-09-10", "story_count": 0},
    {"user_id": "user_001", "story_date": "2024-10-05", "story_count": 20},
    {"user_id": "user_001", "story_date": "07/15/2024", "story_count": 2},
    {"user_id": "user_002", "story_date": "2024-07-03", "story_count": 4},
    {"user_id": " user_002", "story_date": "2024-07-04", "story_count": 3},
    {"user_id": "user_002", "story_date": None, "story_count": 6},
    {"user_id": "user_002", "story_date": "2024-12-25", "story_count": 1},
    {"user_id": "user_002", "story_date": "2025-01-15", "story_count": 7},
    {"user_id": "user_002", "story_date": "2025-06-29", "story_count": 10},
    {"user_id": "user_003", "story_date": "2024-07-10", "story_count": 2},
    {"user_id": "user_003", "story_date": "2024-08-20", "story_count": 8},
    {"user_id": "user_003", "story_date": "2024-08-20", "story_count": 8},
    {"user_id": "user_003", "story_date": "2025-03-11", "story_count": 5},
    {"user_id": None, "story_date": "2025-03-12", "story_count": 3},
    {"user_id": "USER_003", "story_date": "2025-04-01", "story_count": 4},
    {"user_id": "user_004", "story_date": "2024-07-15", "story_count": 6},
    {"user_id": "user_004", "story_date": "2024-09-30", "story_count": 7},
    {"user_id": "user_004", "story_date": "2024/10/10", "story_count": 4},
    {"user_id": "user_004", "story_date": "2024-11-11", "story_count": 3},
    {"user_id": "user_004", "story_date": "2025-02-28", "story_count": 12},
    {"user_id": "user_004", "story_date": "2025-03-01", "story_count": 0},
    {"user_id": "user_005", "story_date": "2024-08-01", "story_count": 1},
    {"user_id": "user_005", "story_date": "2024-08-02", "story_count": 2},
    {"user_id": "user_005", "story_date": "2024-08-03", "story_count": 3},
    {"user_id": "user_005", "story_date": "2024-08-04", "story_count": 4},
    {"user_id": "user_005", "story_date": "2024-08-05", "story_count": None},
    {"user_id": "user_005", "story_date": "2024-08-06", "story_count": 5},
    {"user_id": "user_006", "story_date": "2024-09-01", "story_count": 9},
    {"user_id": "user_006", "story_date": "2024-09-02", "story_count": 10},
    {"user_id": "user_006", "story_date": "2024-09-03", "story_count": 9},
    {"user_id": "user_006", "story_date": "2024-09-04", "story_count": 50},
    {"user_id": "user_006", "story_date": "2024-09-05", "story_count": 8},
    {"user_id": "user_006", "story_date": None, "story_count": 7},
    {"user_id": "user_007", "story_date": "2024-10-10", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-11", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-12", "story_count": 4},
    {"user_id": "user_007", "story_date": "2024-10-13", "story_count": 3},
    {"user_id": "user_007", "story_date": "2024-10-14", "story_count": 2},
    {"user_id": "user_007", "story_date": "2024-10-15", "story_count": 1},
    {"user_id": "user_008", "story_date": "2025-01-01", "story_count": 11},
    {"user_id": "user_008", "story_date": "2025-01-02", "story_count": 12},
    {"user_id": "user_008", "story_date": "2025-01-03", "story_count": 13},
    {"user_id": "user_008", "story_date": "2025-01-04", "story_count": 14},
    {"user_id": "user_008", "story_date": "2025-01-05", "story_count": 15},
    {"user_id": "user_008", "story_date": "2025-01-06", "story_count": 0},
    {"user_id": "user_009", "story_date": "2024-12-01", "story_count": 1},
    {"user_id": "user_009", "story_date": "2024-12-02", "story_count": 2},
    {"user_id": "user_009", "story_date": "2024-12-03", "story_count": 3},
    {"user_id": "user_009", "story_date": "2024-12-04", "story_count": 4},
    {"user_id": "user_009", "story_date": "2024-12-05", "story_count": 5},
    {"user_id": "user_009", "story_date": "invalid_date", "story_count": 6},
    {"user_id": "user_010", "story_date": "2025-03-15", "story_count": 7},
    {"user_id": "user_010", "story_date": "2025-03-16", "story_count": 8},
    {"user_id": "user_010", "story_date": "2025-03-17", "story_count": 9},
    {"user_id": "user_010", "story_date": "2025-03-18", "story_count": 10},
    {"user_id": "user_010", "story_date": "2025-03-19", "story_count": 11},
    {"user_id": "user_010", "story_date": "2025-03-20", "story_count": 12},
]

# Columns follow the dict key order: user_id, story_date, story_count.
# The single missing story_count makes that column float (NaN) rather than int.
stories_data = pd.DataFrame(stories_data_data)


## Question 1

Take a look at the data in the story_date column. Correct any data type inconsistencies in that column.

In [None]:
import pandas as pd
from io import StringIO

# Simulate loading the data (replace with actual CSV/DB load)
data = """user_id,story_date,story_count
user_001,2024-07-03,3
user_001,2024-07-03,3
user_001,2024-08-15,5
user_001,2024-09-10,0
user_001,2024-10-05,20
user_001,07/15/2024,2
user_002,2024-07-03,4
user_002,2024-07-04,3
user_002,,6
user_002,2024-12-25,1
user_002,2025-01-15,7
user_002,2025-06-29,10
user_003,2024-07-10,2
user_003,2024-08-20,8
user_003,2024-08-20,8
user_003,2025-03-11,5
2025-03-12,,3
USER_003,2025-04-01,4
user_004,2024-07-15,6
user_004,2024-09-30,7
user_004,2024/10/10,4
user_004,2024-11-11,3
user_004,2025-02-28,12
user_004,2025-03-01,0
user_005,2024-08-01,1
user_005,2024-08-02,2
user_005,2024-08-03,3
user_005,2024-08-04,4
user_005,2024-08-05,
user_005,2024-08-06,5
user_006,2024-09-01,9
user_006,2024-09-02,10
user_006,2024-09-03,9
user_006,2024-09-04,50
user_006,2024-09-05,8
user_006,,7
user_007,2024-10-10,4
user_007,2024-10-11,4
user_007,2024-10-12,4
user_007,2024-10-13,3
user_007,2024-10-14,2
user_007,2024-10-15,1
user_008,2025-01-01,11
user_008,2025-01-02,12
user_008,2025-01-03,13
user_008,2025-01-04,14
user_008,2025-01-05,15
user_008,2025-01-06,0
user_009,2024-12-01,1
user_009,2024-12-02,2
user_009,2024-12-03,3
user_009,2024-12-04,4
user_009,2024-12-05,5
user_009,invalid_date,6
user_010,2025-03-15,7
user_010,2025-03-16,8
user_010,2025-03-17,9
user_010,2025-03-18,10
user_010,2025-03-19,11
user_010,2025-03-20,12
"""

# Date layouts that occur in the raw data, tried in this order.
# Explicit formats are used instead of `infer_datetime_format=True`, which is
# deprecated in pandas 2.x. There, a single format is inferred from the first
# value, so rows written as "07/15/2024" or "2024/10/10" would silently become
# NaT instead of being corrected.
STORY_DATE_FORMATS = ("%Y-%m-%d", "%m/%d/%Y", "%Y/%m/%d")


def parse_known_dates(values, formats=STORY_DATE_FORMATS):
    """Parse a Series of date strings that may use any of ``formats``.

    Each format is tried against the whole Series; a value keeps the result of
    the first format that parses it.

    Args:
        values: pandas Series of strings (missing values allowed).
        formats: non-empty sequence of ``strftime`` patterns, in priority order.

    Returns:
        A datetime Series aligned with ``values``; entries matching none of the
        formats (including missing values) are ``NaT``.

    Raises:
        ValueError: if ``formats`` is empty.
    """
    if not formats:
        raise ValueError("at least one date format is required")
    parsed = pd.to_datetime(values, format=formats[0], errors="coerce")
    for fmt in formats[1:]:
        # Only fill positions that earlier formats could not parse.
        parsed = parsed.fillna(pd.to_datetime(values, format=fmt, errors="coerce"))
    return parsed


df = pd.read_csv(StringIO(data))

# Step 1: If a row has a date in user_id but NaN in story_date, the columns
# were shifted on ingest: move the value into story_date and blank the user_id.
mask_date_in_user_id = parse_known_dates(df['user_id']).notna() & df['story_date'].isna()
df.loc[mask_date_in_user_id, 'story_date'] = df.loc[mask_date_in_user_id, 'user_id']
df.loc[mask_date_in_user_id, 'user_id'] = None  # Or keep as NaN for cleaning later

# Step 2: Convert story_date to datetime; anything unparseable becomes NaT
df['story_date'] = parse_known_dates(df['story_date'])

print(df.dtypes)
print(df[['user_id', 'story_date']])

## Question 2

Calculate the 25th, 50th, and 75th percentiles of the number of stories created per user per day.

In [None]:
# Step 1: Ensure story_count is numeric (non-numeric entries become NaN)
df['story_count'] = pd.to_numeric(df['story_count'], errors='coerce')

# Step 2: Build a cleaned view without modifying df any further
stories_valid = (
    df.assign(
        # " user_002" / "USER_003" style variants refer to the same user
        user_id=df['user_id'].str.strip().str.lower()
    )
    # Rows without a user, a date, or a count cannot contribute a user/day
    # observation. Dropping missing counts matters: groupby().sum() would
    # otherwise report an all-NaN user/day as 0 stories and pull the
    # percentiles down.
    .dropna(subset=['user_id', 'story_date', 'story_count'])
    # NOTE(review): exact duplicate rows are treated as ingestion duplicates
    # rather than separate partial counts, so they are not double-counted -
    # confirm against the data source.
    .drop_duplicates()
)

# Step 3: Group by user_id & story_date to get total per day
stories_per_day = (
    stories_valid
      .groupby(['user_id', 'story_date'], as_index=False)['story_count']
      .sum()
)

# Step 4: Calculate the 25th, 50th and 75th percentiles of daily story counts
percentiles = stories_per_day['story_count'].quantile([0.25, 0.5, 0.75])

print(percentiles)

## Question 3

What percentage of users have had at least one day on which they posted more than 10 stories?

In [None]:
# Step 1: Normalise user ids and remove duplicate rows (df itself is untouched)
stories_dedup = (
    df.assign(
        # " user_002" / "USER_003" style variants refer to the same user;
        # without this they are counted as extra users in the denominator.
        user_id=df['user_id'].str.strip().str.lower()
    )
    .dropna(subset=['user_id'])
    # NOTE(review): exact duplicate rows are treated as ingestion duplicates,
    # so a repeated 8-story row does not become a 16-story day - confirm
    # against the data source.
    .drop_duplicates()
)

# Step 2: Group by user/day (only rows that have a usable date)
stories_per_day = (
    stories_dedup.dropna(subset=['story_date'])
      .groupby(['user_id', 'story_date'], as_index=False)['story_count']
      .sum()
)

# Step 3: Users with at least one day > 10 stories
users_with_high_day = stories_per_day.loc[stories_per_day['story_count'] > 10, 'user_id'].unique()

# Step 4: Calculate percentage over every user with a valid id, including
# users whose rows all lack a usable date. Guard against an empty user set.
total_users = stories_dedup['user_id'].nunique()
percentage = (len(users_with_high_day) / total_users) * 100 if total_users else 0.0

print(f"{percentage:.2f}%")

Made with ❤️ by [Interview Master](https://www.interviewmaster.ai)