
# Combined Analysis — `final_fg_events`

This notebook merges, refactors and cleans code from your uploaded SIN notebooks.  
**What it does (high-level):**
- Loads the canonical dataset `/mnt/data/final_fg_events.csv` (the latest `final_fg_events`).
- Cleans & preprocesses timestamps and removes ignored/background packages.
- Provides reusable utility functions (grouping, plotting helpers, sleep detection, top-N app extraction).
- Runs a standard set of analyses & visualizations: average daily usage (with std), weekday totals, hourly peaks, top apps per-person and global, session & idle-gap analysis, hourly heatmap, and sleep detection timeline.
- Outputs are reproducible; cells are logically ordered and functions are reused where possible.

> NOTE: I intentionally omitted/ignored any ML/DL code that looked incorrect. This notebook focuses on *data cleaning, EDA and visualizations*.


In [None]:

# Imports & config
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from plotly.subplots import make_subplots
# Display defaults: show wide frames without truncating columns
pd.options.display.max_columns = 200
pd.options.display.width = 160

# Plot theme shared by every seaborn/matplotlib figure below
sns.set(style='whitegrid')

# Location of the canonical event log read by the load cell
DATA_PATH = '/mnt/data/final_fg_events.csv'
print('Notebook created. Data path =', DATA_PATH)


In [None]:

# --- Load dataset (canonical) ---
# Read the raw event log, report how many rows it has, then preview the first five.
df = pd.read_csv(DATA_PATH)
print('Raw rows:', df.shape[0])
df.head()


In [None]:

# --- Preprocessing & canonicalization ---
def preprocess(df):
    '''
    Return a cleaned copy of the raw event frame; the input is not modified.

    Steps:
      * strip whitespace from column names;
      * rename `event_package_name` to `event_name` when only the former exists;
      * build `start_dt` / `end_dt` from separate date + time columns when needed;
      * parse `start_dt` / `end_dt` to datetimes (unparseable values become NaT);
      * coerce `usage_sec` to numeric and fill missing durations with
        `end_dt - start_dt` (in seconds) when both timestamps are available;
      * strip whitespace in `event_name`, keeping missing names missing;
      * coerce `person_id` to nullable integers;
      * drop rows with no `start_dt` or with a missing / non-positive `usage_sec`;
      * add `date`, `hour` and `weekday` columns derived from `start_dt`.

    Raises:
        KeyError: if `start_dt` is unavailable, or `usage_sec` is absent and
            cannot be computed from `start_dt` / `end_dt`.
    '''
    df = df.copy()
    # Normalize column names; leave non-string labels untouched
    df.columns = [c.strip() if isinstance(c, str) else c for c in df.columns]

    # Harmonize the app/package column name
    if 'event_package_name' in df.columns and 'event_name' not in df.columns:
        df = df.rename(columns={'event_package_name': 'event_name'})

    # Combine split date/time columns into a single timestamp string
    for prefix in ('start', 'end'):
        date_col, time_col, dt_col = f'{prefix}_date', f'{prefix}_time', f'{prefix}_dt'
        if date_col in df.columns and time_col in df.columns and dt_col not in df.columns:
            df[dt_col] = df[date_col].astype(str) + ' ' + df[time_col].astype(str)

    # Parse datetimes where present
    for col in ['start_dt', 'end_dt']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    if 'start_dt' not in df.columns:
        raise KeyError("preprocess requires a 'start_dt' column (or 'start_date' + 'start_time')")

    has_span = 'end_dt' in df.columns
    if 'usage_sec' in df.columns:
        # CSV durations may arrive as strings; non-numeric values become NaN
        df['usage_sec'] = pd.to_numeric(df['usage_sec'], errors='coerce')
        # Backfill only when something is missing, so complete data never touches the timestamps
        if has_span and df['usage_sec'].isna().any():
            computed = (df['end_dt'] - df['start_dt']).dt.total_seconds()
            df['usage_sec'] = df['usage_sec'].fillna(computed)
    elif has_span:
        df['usage_sec'] = (df['end_dt'] - df['start_dt']).dt.total_seconds()
    else:
        raise KeyError("preprocess requires 'usage_sec' or both 'start_dt' and 'end_dt'")

    # Trim whitespace in event_name without turning missing names into the string 'nan'
    if 'event_name' in df.columns:
        names = df['event_name']
        df['event_name'] = names.astype(str).str.strip().where(names.notna())

    # Ensure person_id is a nullable integer (unparseable ids become <NA>)
    if 'person_id' in df.columns:
        df['person_id'] = pd.to_numeric(df['person_id'], errors='coerce').astype('Int64')

    # Keep rows with a valid start and a strictly positive duration
    # (NaN durations compare False against 0 and are dropped as well)
    valid = df['start_dt'].notna() & (df['usage_sec'] > 0)
    df = df[valid].copy()

    # Derive calendar helpers used by the grouping utilities
    df['date'] = df['start_dt'].dt.date
    df['hour'] = df['start_dt'].dt.hour
    df['weekday'] = df['start_dt'].dt.day_name()
    return df

# Clean the raw frame, report the surviving row count, and preview the key columns
df = preprocess(df)
print('After preprocess rows:', df.shape[0])
df.loc[:, ['person_id', 'start_dt', 'end_dt', 'event_name', 'usage_sec']].head()


In [None]:

# --- Consolidated ignore list (system & background packages from your notebooks) ---
# Rows whose event_name exactly matches one of these entries are dropped below.
IGNORE_PACKAGES = [
    # tracker / placeholder entries
    'android',
    'dummy',
    'device_locked_package',
    'com.sarthak.usagetracker',
    # common OEM/system packages
    'com.android.launcher3',
    'com.oppo.launcher',
    'com.miui.home',
    'com.coloros.alarmclock',
    'com.google.android.gms',
    'com.google.android.packageinstaller',
    'com.android.settings',
    'com.android.systemui',
    'com.android.dialer',
    'com.android.contacts',
    'com.android.bluetooth',
    'com.vivo.globalsearch',
    'com.vivo.hiboard',
    'com.vivo.appstore',
    'com.sec.android.app.launcher',
    'com.oplus.camera',
    'com.heytap.browser',
    'com.oppo.quicksearchbox',
    # placeholders / known noisy
    'com.sophos.networkagent',
    'com.daemon.shelper',
    'com.lbe.security.miui',
]

# Filter only when the package column is available; otherwise leave df untouched
if 'event_name' not in df.columns:
    print('event_name not found, skipping package filter.')
else:
    initial = len(df)
    df = df.loc[~df['event_name'].isin(IGNORE_PACKAGES)].copy()
    print('Dropped', initial - len(df), 'ignored-package rows. Remaining:', len(df))


In [None]:

# --- Reusable utility functions ---

def group_usage(df, by, seconds_to='min'):
    '''
    Sum `usage_sec` per group defined by `by` and return the result as a flat frame.

    `seconds_to` selects an extra converted column:
      'min'   -> adds `usage_min` (seconds / 60)
      'hours' -> adds `usage_hr`  (seconds / 3600)
    Any other value (e.g. 'sec') returns the summed seconds only.
    '''
    totals = df.groupby(by)['usage_sec'].sum().reset_index()
    conversions = (('min', 'usage_min', 60), ('hours', 'usage_hr', 3600))
    for unit, column, divisor in conversions:
        if seconds_to == unit:
            totals[column] = totals['usage_sec'] / divisor
            break
    return totals

def top_n_apps(df, n=5, person_id=None):
    '''
    Rank apps by total `usage_sec` and return the `n` largest.

    When `person_id` is given, only that person's rows are considered.
    The result has columns `event_name`, `usage_sec` and `usage_min`.
    '''
    if person_id is None:
        scope = df
    else:
        scope = df[df['person_id'] == person_id]
    per_app = scope.groupby('event_name')['usage_sec'].sum().reset_index()
    ranked = per_app.sort_values('usage_sec', ascending=False).head(n)
    return ranked.assign(usage_min=ranked['usage_sec'] / 60)

def avg_daily_per_person(df):
    '''
    Per-person mean and standard deviation of total daily usage.

    Usage is first summed per (person_id, date); the daily totals are then
    summarised per person, in seconds and in minutes.
    '''
    per_day = group_usage(df, ['person_id', 'date'], seconds_to='sec')
    stats = (
        per_day.groupby('person_id')['usage_sec']
        .agg(['mean', 'std'])
        .reset_index()
        .rename(columns={'mean': 'avg_daily_sec', 'std': 'std_daily_sec'})
    )
    return stats.assign(
        avg_daily_min=stats['avg_daily_sec'] / 60,
        std_daily_min=stats['std_daily_sec'] / 60,
    )

def peak_hours(df):
    '''Total `usage_sec` per hour of day, ordered from the busiest hour down.'''
    return group_usage(df, ['hour'], seconds_to='sec').sort_values('usage_sec', ascending=False)

def detect_sleep_periods(df, min_sleep_hours=3, ignore_small_min=45,
                         sleep_start_hour=19, sleep_end_hour=5, max_wake_hour=15):
    '''
    Detect long inactivity gaps per person that likely represent sleep.

    A gap is the time between one event's `end_dt` and the same person's next
    `start_dt`. It is a sleep candidate when it lasts at least
    `min_sleep_hours` and begins inside the bedtime window. The largest
    candidate per (person_id, date) is kept, where `date` is the calendar date
    of the gap's start (`end_dt`). Finally, rows whose wake-up hour is later
    than `max_wake_hour` are discarded; this filter runs after the largest gap
    has been selected.

    Parameters:
        df: frame with `person_id`, `start_dt` and `end_dt` (datetime) columns.
        min_sleep_hours: minimum gap length, in hours.
        ignore_small_min: accepted for backward compatibility; currently unused.
        sleep_start_hour: first clock hour (inclusive) of the bedtime window.
        sleep_end_hour: clock hour (exclusive) that closes the bedtime window.
            When `sleep_start_hour > sleep_end_hour` the window wraps past
            midnight (the defaults cover 19:00-04:59).
        max_wake_hour: latest clock hour (inclusive) accepted for `wake_time`.

    Returns:
        DataFrame with person_id, date, sleep_time (end of last use),
        wake_time (next start), sleep_duration and gap.
    '''
    # Bedtime window as clock hours, wrapping over midnight when needed
    if sleep_start_hour <= sleep_end_hour:
        bedtime_hours = list(range(sleep_start_hour, sleep_end_hour))
    else:
        bedtime_hours = list(range(sleep_start_hour, 24)) + list(range(0, sleep_end_hour))

    data = df.sort_values(['person_id', 'start_dt']).copy()
    # Next event start for the same person; NaT for each person's last event
    data['next_start'] = data.groupby('person_id')['start_dt'].shift(-1)
    data['gap'] = data['next_start'] - data['end_dt']

    min_sleep = pd.Timedelta(hours=min_sleep_hours)
    data['is_sleep_candidate'] = (
        (data['gap'] >= min_sleep) & data['end_dt'].dt.hour.isin(bedtime_hours)
    )

    # Pick the largest gap per person-date (date anchored to end_dt date)
    data['date'] = data['end_dt'].dt.date
    sleep_df = data[data['is_sleep_candidate']].sort_values(
        ['person_id', 'date', 'gap'], ascending=[True, True, False]
    )
    sleep_df = sleep_df.groupby(['person_id', 'date']).head(1).reset_index(drop=True)

    sleep_df = sleep_df.rename(columns={'end_dt': 'sleep_time', 'next_start': 'wake_time'})
    sleep_df['sleep_duration'] = sleep_df['wake_time'] - sleep_df['sleep_time']

    # Keep only gaps that end by the wake-up cutoff hour
    sleep_df = sleep_df[sleep_df['wake_time'].dt.hour <= max_wake_hour]
    return sleep_df[['person_id', 'date', 'sleep_time', 'wake_time', 'sleep_duration', 'gap']]


In [None]:

# --- Core analyses & visualizations (re-usable flow) ---

# 1) Average daily per person (with std)
# Heaviest users first, so the table and the bar chart share the same ordering
summary = (
    avg_daily_per_person(df)
    .sort_values('avg_daily_min', ascending=False)
    .reset_index(drop=True)
)
display(summary.head(10))

# Bar chart of average daily minutes; error bars show the day-to-day std
plt.figure(figsize=(12, 5))
plt.bar(
    summary['person_id'].astype(str),
    summary['avg_daily_min'],
    yerr=summary['std_daily_min'],
    capsize=4,
)
plt.xlabel('Person ID')
plt.ylabel('Avg Daily Usage (minutes)')
plt.title('Average Daily Screen Time per Person (with std)')
plt.show()

# 2) Weekday totals (global)
# Monday-first ordering; without it the weekday names would sort alphabetically
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_totals = group_usage(df, ['weekday'], seconds_to='min')
weekday_totals['weekday'] = pd.Categorical(
    weekday_totals['weekday'],
    categories=weekday_order,
    ordered=True,
)
weekday_totals = weekday_totals.sort_values('weekday')

plt.figure(figsize=(8, 4))
sns.barplot(data=weekday_totals, x='weekday', y='usage_min', palette='magma')
plt.xticks(rotation=45)
plt.ylabel('Total Usage (minutes)')
plt.title('Total Screen Time by Weekday (All Users)')
plt.show()

# 3) Hourly pattern (global)
# Total minutes per hour of day, summed over all users and dates
hourly = group_usage(df, ['hour'], seconds_to='min')

plt.figure(figsize=(10, 4))
sns.lineplot(data=hourly, x='hour', y='usage_min', marker='o')
plt.xlabel('Hour of Day')
plt.ylabel('Total Usage (minutes)')
plt.title('Hourly Screen Time (All Users)')
plt.grid(alpha=0.3)
plt.show()

# 4) Top 5 apps globally
# No person filter: totals are taken over every user in df
top5_global = top_n_apps(df, n=5)
display(top5_global)

plt.figure(figsize=(8, 4))
sns.barplot(data=top5_global, x='usage_min', y='event_name', palette='viridis')
plt.xlabel('Usage (minutes)')
plt.title('Top 5 Most Used Apps (Global)')
plt.show()

# 5) Top 5 per person (Plotly grid for first 6 persons)
# The grid shape drives both the number of persons shown and the subplot layout,
# so the two cannot drift apart.
GRID_ROWS, GRID_COLS = 2, 3
persons = sorted(df['person_id'].dropna().unique())[:GRID_ROWS * GRID_COLS]
fig = make_subplots(
    rows=GRID_ROWS,
    cols=GRID_COLS,
    subplot_titles=[f'Person {p}' for p in persons],
)
for slot, p in enumerate(persons):
    # Fill the grid row by row; Plotly rows/cols are 1-based
    grid_row, grid_col = divmod(slot, GRID_COLS)
    # Ascending sort puts the most-used app at the top of a horizontal bar chart
    top5 = top_n_apps(df, n=5, person_id=p).sort_values('usage_min', ascending=True)
    fig.add_trace(
        go.Bar(x=top5['usage_min'], y=top5['event_name'], orientation='h', name=f'P{p}'),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(height=700, width=1000, title_text='Top 5 Apps for Selected Persons (minutes)')
fig.show()

# 6) Hourly heatmap per person
# One row per person, one column per hour; hours with no usage are shown as 0
hourly_p = group_usage(df, ['person_id', 'hour'], seconds_to='sec')
heat = hourly_p.pivot(index='person_id', columns='hour', values='usage_sec').fillna(0)

plt.figure(figsize=(12, 6))
sns.heatmap(heat.div(60), cmap='viridis')  # seconds -> minutes for readability
plt.title('Hourly Usage Heatmap (minutes)')
plt.xlabel('Hour')
plt.ylabel('Person ID')
plt.show()

# 7) Session behavior: sessions per day and avg session duration
# Each row of df counts as one session
session_count = (
    df.groupby(['person_id', 'date'])
    .size()
    .reset_index(name='sessions')
)
avg_session = (
    df.groupby('person_id')['usage_sec']
    .mean()
    .reset_index(name='avg_session_sec')
)
print('Sessions per person (sample):')
display(session_count.head())
print('\nAvg session duration (sec) sample:')
display(avg_session.head())

# 8) Sleep detection
# Default thresholds of detect_sleep_periods are used
sleep_df = detect_sleep_periods(df)
display(sleep_df.head())


In [None]:

# --- Save notebook file path confirmation ---
# Informational only: this cell writes nothing to disk. It records the path
# in `nb_path` and prints it.
nb_path = '/mnt/data/combined_final_fg_events_analysis.ipynb'
print('Notebook saved earlier to', nb_path)
