In [431]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Make sure the input and output folders exist before anything touches them.
os.makedirs('data', exist_ok=True)
os.makedirs('figures', exist_ok=True)

# Read the existing CSV file and turn the 'datetime' column into real timestamps
# so the `.dt` accessors used later in the notebook work.
df = pd.read_csv('data/watch-history.csv')  # Adjust filename if different
df['datetime'] = pd.to_datetime(df['datetime'])

# Verify data loading
print("DataFrame shape:", df.shape)
print("\nSample of loaded data:")
print(df.head())
print("\nDataFrame info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the cell output.
df.info()

DataFrame shape: (24175, 4)

Sample of loaded data:
                                               title  \
0  I HATE This Coding Question, but FAANG Loves i...   
1                      O Mu Bu Mu Kış İçkileri ❄️   
2  Neden böyleyiz? | Cem Yılmaz : Diamond Elite ...   
3                                Home office reset ✨   
4  RP | Adobe Illustrator Logo Design 💫 #logodesi...   

                                          link  \
0  https://www.youtube.com/watch?v=zs1i2rh8Skk   
1  https://www.youtube.com/watch?v=_WR3YqxqcMA   
2  https://www.youtube.com/watch?v=X8eOsXLdEdI   
3  https://www.youtube.com/watch?v=Byq0t5QHnQE   
4  https://www.youtube.com/watch?v=9vu93tPwTts   

                            timestamp            datetime  
0  Dec 16, 2024, 2:06:10 PM GMT+03:00 2024-12-16 14:06:10  
1  Dec 16, 2024, 2:05:10 PM GMT+03:00 2024-12-16 14:05:10  
2  Dec 16, 2024, 2:04:04 PM GMT+03:00 2024-12-16 14:04:04  
3  Dec 16, 2024, 2:04:00 PM GMT+03:00 2024-12-16 14:04:00  
4  Dec 16,

In [432]:
import calendar
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

class TemporalAnalyzer:
    """Prepare a watch-history DataFrame for time-based analyses.

    The constructor stores a preprocessed copy of the input in ``self.df``
    (see ``_preprocess_data``) and configures the matplotlib style used by the
    figures. The caller's DataFrame is never modified.
    """

    def __init__(self, df, start_date='2024-01-01', session_gap_minutes=30):
        """Build the analyzer.

        Args:
            df: DataFrame with a ``datetime`` column (datetime64, or anything
                ``pd.to_datetime`` can parse).
            start_date: Rows earlier than this are dropped. ``None`` keeps
                every row.
            session_gap_minutes: A gap strictly longer than this many minutes
                between two consecutive rows starts a new session.
        """
        self.df = self._preprocess_data(
            df,
            start_date=start_date,
            session_gap_minutes=session_gap_minutes,
        )
        # NOTE: both calls below change matplotlib's global state, so they
        # affect every figure created afterwards, not only this analyzer's.
        plt.style.use('default')
        plt.rcParams.update({
            'figure.figsize': (12, 6),
            'font.size': 10
        })

    def _preprocess_data(self, df, start_date='2024-01-01', session_gap_minutes=30):
        """Return a filtered, time-sorted copy of ``df`` with derived columns.

        Added columns: ``date``, ``month``, ``month_name``, ``year``, ``hour``,
        ``day_of_week``, ``time_diff``, ``minutes_diff``, ``new_session`` and
        ``session_id`` (0-based, increasing with time).

        Raises:
            KeyError: if ``df`` has no ``datetime`` column.
        """
        if 'datetime' not in df.columns:
            raise KeyError("input DataFrame must contain a 'datetime' column")

        df = df.copy()
        # No-op for a datetime64 column; parses the values when the caller
        # passed raw strings straight from a CSV.
        df['datetime'] = pd.to_datetime(df['datetime'])

        if start_date is not None:
            # Explicit .copy() so the column assignments below act on an
            # independent frame rather than on a filtered selection.
            df = df[df['datetime'] >= pd.Timestamp(start_date)].copy()

        stamps = df['datetime'].dt
        df['date'] = stamps.date
        df['month'] = stamps.month
        df['month_name'] = stamps.month_name()
        df['year'] = stamps.year
        df['hour'] = stamps.hour
        df['day_of_week'] = stamps.day_name()

        # Calculate time differences for session analysis. A stable sort keeps
        # rows with identical timestamps in their input order.
        df = df.sort_values('datetime', kind='stable')
        df['time_diff'] = df['datetime'].diff()
        df['minutes_diff'] = df['time_diff'].dt.total_seconds() / 60
        # The first row has no predecessor (NaN gap); NaN > x is False, so it
        # belongs to session 0 instead of opening an extra session.
        df['new_session'] = df['minutes_diff'] > session_gap_minutes
        df['session_id'] = df['new_session'].cumsum()

        return df

In [433]:
def analyze_hourly_patterns(self, output_dir='figures'):
    """Plot and return the average number of videos watched per hour of day.

    For each hour 0-23 the number of rows in ``self.df`` with that ``hour`` is
    divided by the number of distinct values in ``self.df['date']``. Hours with
    no rows are kept with an average of 0 so the x-axis always shows all 24
    hours. The bar chart is saved to ``{output_dir}/hourly_patterns.png``.

    Args:
        output_dir: Folder for the PNG; created if it does not exist.

    Returns:
        dict with key ``'hourly_averages'``: a Series of the 24 averages,
        indexed by labels '12 AM', '1 AM', ..., '11 PM'.
    """
    from matplotlib.patches import Patch

    os.makedirs(output_dir, exist_ok=True)

    am_color, pm_color = '#781f70', '#cf88c9'

    # 12-hour clock labels: 0 -> '12 AM', 11 -> '11 AM', 12 -> '12 PM', 23 -> '11 PM'
    hour_labels = {h: f"{h % 12 or 12} {'AM' if h < 12 else 'PM'}" for h in range(24)}

    # reindex() adds the hours that never occur in the data; without it those
    # bars would be absent and the remaining bars would sit next to each other
    # as if no hour had been skipped.
    hourly_counts = self.df.groupby('hour').size().reindex(range(24), fill_value=0)

    # Guard against an empty frame, where 0 / 0 would yield NaN and the y-tick
    # computation below would fail on int(NaN).
    active_days = self.df['date'].nunique()
    if active_days:
        hourly_avg = hourly_counts / active_days
    else:
        hourly_avg = hourly_counts.astype(float)
    hourly_avg.index = hourly_avg.index.map(hour_labels)

    # One color per bar: midnight-11 AM versus noon-11 PM
    colors = [am_color if label.endswith('AM') else pm_color for label in hourly_avg.index]

    fig, ax = plt.subplots(figsize=(12, 6))
    hourly_avg.plot(kind='bar', ax=ax, color=colors, width=0.8)

    # Integer y ticks from 0 up to one above the tallest bar
    ax.set_yticks(range(0, int(hourly_avg.max()) + 2, 1))
    ax.yaxis.set_tick_params(length=0)  # Remove tick marks, but keep the labels

    for side in ('top', 'right', 'left'):
        ax.spines[side].set_visible(False)

    ax.set_title('Average Number of Videos in Different Times of Day', pad=20)
    ax.set_xlabel('')
    ax.set_ylabel('')
    plt.setp(ax.get_xticklabels(), rotation=0, ha='center')

    # The bars carry no legend entries of their own, so build the AM/PM key by hand
    legend_elements = [
        Patch(facecolor=am_color, label='AM'),
        Patch(facecolor=pm_color, label='PM')
    ]
    ax.legend(handles=legend_elements,
              loc='upper right',
              frameon=False,
              bbox_to_anchor=(1.0, 1.0))

    fig.tight_layout()
    fig.savefig(f'{output_dir}/hourly_patterns.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    return {'hourly_averages': hourly_avg}

TemporalAnalyzer.analyze_hourly_patterns = analyze_hourly_patterns


In [434]:
def analyze_daily_patterns(self, output_dir='figures'):
    """Plot and return the average number of videos watched per weekday.

    For each weekday the number of rows in ``self.df`` is divided by the number
    of distinct ``date`` values that fall on that weekday, i.e. the average is
    taken over the days of that weekday that appear in the data. The bar chart
    is saved to ``{output_dir}/daily_patterns.png``.

    Args:
        output_dir: Folder for the PNG; created if it does not exist.

    Returns:
        dict with key ``'daily_averages'``: a Series indexed Monday..Sunday.
        A weekday that never occurs in the data is NaN.
    """
    os.makedirs(output_dir, exist_ok=True)

    day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    by_weekday = self.df.groupby('day_of_week')
    daily_counts = by_weekday.size()
    # Count how many distinct dates each weekday really has. Estimating it as
    # total_days / 7 skews the averages whenever the observed dates are not
    # spread evenly across the week.
    days_observed = by_weekday['date'].nunique()
    daily_avg = (daily_counts / days_observed).reindex(day_order)

    fig, ax = plt.subplots(figsize=(12, 6))
    daily_avg.plot(kind='bar', ax=ax, color='#781f70', width=0.6)

    for side in ('top', 'bottom', 'right', 'left'):
        ax.spines[side].set_visible(False)

    ax.set_title('Average Number of Videos Watched by Day of Week', pad=20)
    ax.set_xlabel('')
    ax.set_ylabel('')
    plt.setp(ax.get_xticklabels(), rotation=0)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/daily_patterns.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    return {'daily_averages': daily_avg}

TemporalAnalyzer.analyze_daily_patterns = analyze_daily_patterns

In [435]:
def analyze_monthly_trends(self, output_dir='figures'):
    """Plot and return the number of videos watched in each calendar month.

    Rows of ``self.df`` are counted per (``year``, ``month_name``) pair and
    ordered chronologically. The line chart is saved to
    ``{output_dir}/monthly_trends.png``.

    Args:
        output_dir: Folder for the PNG; created if it does not exist.

    Returns:
        dict with key ``'monthly_counts'``: a DataFrame with columns
        ``year``, ``month`` (ordered categorical of month names) and ``count``,
        sorted by year then month.
    """
    os.makedirs(output_dir, exist_ok=True)

    # English names, matching what Series.dt.month_name() produces by default.
    month_order = ['January', 'February', 'March', 'April', 'May', 'June',
                   'July', 'August', 'September', 'October', 'November', 'December']

    monthly_counts = self.df.groupby(['year', 'month_name']).size().reset_index()
    monthly_counts.columns = ['year', 'month', 'count']

    # groupby sorts month names alphabetically; an ordered categorical lets
    # sort_values put them back into calendar order.
    monthly_counts['month'] = pd.Categorical(monthly_counts['month'],
                                             categories=month_order,
                                             ordered=True)
    monthly_counts = monthly_counts.sort_values(['year', 'month'])

    # With a single year the bare month name is enough. Across several years
    # the names repeat, and plotting them directly would make the line jump
    # back to an earlier x position, so the year is appended to each label.
    tick_labels = monthly_counts['month'].astype(str)
    if monthly_counts['year'].nunique() > 1:
        tick_labels = tick_labels + ' ' + monthly_counts['year'].astype(str)

    # One x position per (year, month) row, in chronological order
    positions = list(range(len(monthly_counts)))

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(positions, monthly_counts['count'].to_numpy(), marker='o', color='#781f70')
    ax.set_xticks(positions)
    ax.set_xticklabels(list(tick_labels), rotation=45)
    ax.set_title('Monthly Viewing Trends')
    ax.set_ylabel('Number of Videos Watched')
    ax.grid(True, alpha=0.3)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/monthly_trends.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    return {'monthly_counts': monthly_counts}

TemporalAnalyzer.analyze_monthly_trends = analyze_monthly_trends


In [436]:
def analyze_binge_patterns(self, output_dir='figures', binge_threshold=5):
    """Plot and return the share of binge sessions by the hour a session starts.

    Rows of ``self.df`` are grouped by ``session_id``. A session is a binge
    when it contains more than ``binge_threshold`` rows. For each start hour
    (the ``hour`` of the session's first row) the percentage of sessions that
    are binges is computed; hours in which no session starts get 0. The line
    chart is saved to ``{output_dir}/binge_percentages.png``.

    Args:
        output_dir: Folder for the PNG; created if it does not exist.
        binge_threshold: A session needs strictly more videos than this to
            count as a binge.

    Returns:
        dict with
        ``'hourly_stats'``: DataFrame with columns ``hour_label`` and
        ``binge_percentage`` (24 rows, '12 AM' .. '11 PM'), and
        ``'overall_metrics'``: ``{'overall_binge_percentage': float}``.
    """
    # Create the target folder up front; savefig does not create missing
    # directories.
    os.makedirs(output_dir, exist_ok=True)

    session_stats = self.df.groupby('session_id').agg({
        'datetime': ['count', 'min', 'max'],
        'hour': 'first'
    })
    session_stats.columns = ['videos_count', 'start_time', 'end_time', 'start_hour']
    session_stats['is_binge'] = session_stats['videos_count'] > binge_threshold

    # Binge share per start hour: binge sessions / all sessions * 100.
    # reindex() supplies 0 for hours in which no session starts.
    per_hour = (
        session_stats
        .assign(is_binge=session_stats['is_binge'].astype(int))
        .groupby('start_hour')['is_binge']
        .agg(['sum', 'count'])
    )
    binge_pct = (per_hour['sum'] / per_hour['count'] * 100).reindex(range(24), fill_value=0.0)

    # 12-hour clock labels: 0 -> '12 AM', 12 -> '12 PM', 23 -> '11 PM'
    hour_labels = {h: f"{h % 12 or 12} {'AM' if h < 12 else 'PM'}" for h in range(24)}

    hourly_df = pd.DataFrame({
        'hour': list(range(24)),
        'binge_percentage': binge_pct.to_numpy(),
    })
    hourly_df['hour_label'] = hourly_df['hour'].map(hour_labels)

    fig, ax = plt.subplots(figsize=(15, 8))
    ax.plot(list(hourly_df['hour_label']), hourly_df['binge_percentage'].to_numpy(),
            marker='o', color='#9333ea', linewidth=2, markersize=6)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    ax.set_title('Percentage of Binge Sessions by Hour of Day', pad=30, fontsize=14)
    ax.set_xlabel('')
    ax.set_ylabel('')

    plt.setp(ax.get_xticklabels(), rotation=0, ha='center')
    ax.grid(True, axis='y', linestyle='--', alpha=0.3)

    # Keep the 20-60 window as the default view, but widen it to the nearest
    # multiple of 10 whenever a value falls outside, so no point is cut off
    # (for example the 0 % assigned to hours without sessions).
    lowest = hourly_df['binge_percentage'].min()
    highest = hourly_df['binge_percentage'].max()
    y_low = min(20, 10 * np.floor(lowest / 10))
    y_high = max(60, 10 * np.ceil(highest / 10))
    ax.set_ylim(y_low, y_high)

    fig.tight_layout()
    fig.savefig(f'{output_dir}/binge_percentages.png', dpi=300, bbox_inches='tight')
    plt.close(fig)

    # Avoid dividing by zero when there are no sessions at all
    total_sessions = len(session_stats)
    if total_sessions:
        overall_pct = session_stats['is_binge'].sum() / total_sessions * 100
    else:
        overall_pct = 0.0

    return {
        'hourly_stats': hourly_df[['hour_label', 'binge_percentage']],
        'overall_metrics': {
            'overall_binge_percentage': overall_pct
        }
    }

TemporalAnalyzer.analyze_binge_patterns = analyze_binge_patterns

In [437]:
# Build the analyzer once from the loaded frame, then render every figure
# into the default output folder.
analyzer = TemporalAnalyzer(df)

for render in (
    analyzer.analyze_hourly_patterns,
    analyzer.analyze_daily_patterns,
    analyzer.analyze_monthly_trends,
):
    render()

# Kept as the cell's final expression so its return value is shown as output.
analyzer.analyze_binge_patterns()


{'hourly_stats':    hour_label  binge_percentage
 0       12 AM         46.153846
 1        1 AM         48.684211
 2        2 AM         35.416667
 3        3 AM         44.827586
 4        4 AM         30.769231
 5        5 AM         50.000000
 6        6 AM         33.333333
 7        7 AM         36.363636
 8        8 AM         22.222222
 9        9 AM         38.461538
 10      10 AM         30.588235
 11      11 AM         32.631579
 12      12 PM         41.228070
 13       1 PM         34.545455
 14       2 PM         35.789474
 15       3 PM         38.709677
 16       4 PM         33.333333
 17       5 PM         40.476190
 18       6 PM         36.666667
 19       7 PM         35.593220
 20       8 PM         31.386861
 21       9 PM         42.622951
 22      10 PM         28.787879
 23      11 PM         32.456140,
 'overall_metrics': {'overall_binge_percentage': 36.551030668677726}}