<a href="https://colab.research.google.com/github/Amna9191/DSA210_TermProject/blob/main/DSA210TermProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Extract Data from html file and save to csv

import csv
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime

# Function to export music history to a CSV file
def save_music_history_to_csv(music_history, output_file='music_history.csv'):
    headers = ['Song Title', 'Song URL', 'Artist', 'Date', 'Time']  # Added 'Date' and 'Time'

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()  # Write the header row
        for entry in music_history:
            writer.writerow({
                'Song Title': entry['song_title'],
                'Song URL': entry['song_url'],
                'Artist': entry['channel_name'],
                'Date': entry['date'],
                'Time': entry['time']
            })

    print(f"Music history successfully exported to {output_file}")

# Function to load and parse the HTML file
def load_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
    return soup

# Function to extract YouTube Music history
def extract_youtube_music_history(soup):
    music_history = []

    music_entries = soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp')

    for entry in music_entries:
        header = entry.find('p', class_='mdl-typography--title')
        if header and 'YouTube Music' in header.text.strip():
            song_tag = entry.find('a', href=True)
            if song_tag:
                song_title = song_tag.text.strip()
                song_url = song_tag.get('href')

                channel_tag = song_tag.find_next('a', href=True)
                if channel_tag:
                    channel_name = channel_tag.text.strip().replace(" - Topic", "")

                timestamp_tag = entry.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
                if timestamp_tag:
                    timestamp_full = timestamp_tag.text.strip()
                    match = re.search(r'((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2},\s\d{4}),\s([\d:]+\s[AP]M)\sGMT[+\-]\d{2}:\d{2}', timestamp_full)
                    if match:
                        date_full = match.group(1)  # Extract the full date
                        time_12hr = match.group(3)  # Extract the 12-hour format time

                        # Convert 12-hour format to 24-hour format
                        time_24hr = datetime.strptime(time_12hr, '%I:%M:%S %p').strftime('%H:%M:%S')
                        date = datetime.strptime(date_full, '%b %d, %Y').strftime('%d %b %Y')  # Format date to include day, month, and year
                    else:
                        date = "No date found"
                        time_24hr = "No time found"
                else:
                    date = "No date found"
                    time_24hr = "No time found"

                music_history.append({
                    'song_title': song_title,
                    'song_url': 'https://music.youtube.com' + song_url,
                    'channel_name': channel_name,
                    'date': date,
                    'time': time_24hr
                })

    return music_history

# Function to display or process the data
def print_music_history(music_history):
    for entry in music_history:
        print(f"Song Title: {entry['song_title']}")
        print(f"Song URL: {entry['song_url']}")
        print(f"Artist: {entry['channel_name']}")
        print(f"Date: {entry['date']}")
        print(f"Time: {entry['time']}")
        print("-" * 40)

In [2]:
# Function to filter songs by date range and save to a new CSV
def filter_songs_by_date_range(input_file, output_file, start_date, end_date):
    try:
        # Convert start and end dates to datetime objects
        start_date = datetime.strptime(start_date, '%d %b %Y')
        end_date = datetime.strptime(end_date, '%d %b %Y')

        filtered_songs = []

        # Read the input CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                song_date = datetime.strptime(row['Date'], '%d %b %Y')
                if start_date <= song_date <= end_date:
                    filtered_songs.append(row)

        # Write the filtered songs to the new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['Song Title', 'Song URL', 'Artist', 'Date', 'Time'])
            writer.writeheader()
            writer.writerows(filtered_songs)

        print(f"Filtered songs successfully saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")


In [3]:
file_path = 'watch-history.html'  # Path to your HTML file

if os.path.exists(file_path):
    soup = load_html(file_path)  # Load and parse the HTML file
    music_history = extract_youtube_music_history(soup)  # Extract music history

    if music_history:
        #print_music_history(music_history)  # Display data (optional)
        save_music_history_to_csv(music_history, 'music_history.csv')  # Save to CSV
    else:
        print("No YouTube Music history found in the provided HTML file.")
else:
    print(f"File not found: {file_path}")

Music history successfully exported to music_history.csv


In [8]:
# Filter history by dates
filter_songs_by_date_range(
  input_file='music_history.csv',
  output_file='filtered_music_history.csv',
  start_date='01 Sep 2024',
  end_date='31 Dec 2024'
)

Filtered songs successfully saved to filtered_music_history.csv


In [9]:
import csv
from datetime import datetime

# Function to sort a CSV file by date and time
def sort_csv_by_date_and_time(input_file, output_file='sorted_music_history.csv'):
    try:
        # Read the CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            rows = list(reader)  # Read all rows as a list of dictionaries

        # Ensure 'Date' and 'Time' columns exist
        if 'Date' not in rows[0] or 'Time' not in rows[0]:
            print("The input CSV file must contain 'Date' and 'Time' columns.")
            return

        # Sort rows by date and time
        sorted_rows = sorted(
            rows,
            key=lambda x: (datetime.strptime(x['Date'], '%d %b %Y'), datetime.strptime(x['Time'], '%H:%M:%S'))
        )

        # Write the sorted data to a new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=reader.fieldnames)
            writer.writeheader()  # Write the header
            writer.writerows(sorted_rows)  # Write sorted rows

        print(f"Sorted CSV successfully saved to {output_file}")

    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    input_csv = "filtered_music_history.csv"  # Replace with your file path
    sort_csv_by_date_and_time(input_csv, "sorted_music_history.csv")


Sorted CSV successfully saved to sorted_music_history.csv


In [10]:
from datetime import datetime, timedelta
import csv

# Function to calculate song duration based on the start time of the next song
def calculate_duration(start_time_str, next_start_time_str):
    # Parse the start times
    start_time = datetime.strptime(start_time_str, '%H:%M:%S')
    next_start_time = datetime.strptime(next_start_time_str, '%H:%M:%S')

    # Calculate the duration as the difference between the next song's start time and the current song's start time
    duration = next_start_time - start_time
    return duration

# Function to format timedelta as "minutes:seconds"
def format_duration(duration):
    # Convert timedelta to minutes and seconds format (MM:SS)
    total_seconds = int(duration.total_seconds())
    minutes = total_seconds // 60
    seconds = total_seconds % 60
    return f"{minutes}:{seconds:02d}"

# Function to calculate the average duration of valid durations
def calculate_average_duration(durations):
    total_duration = sum(durations, timedelta())
    average_duration = total_duration / len(durations)
    return average_duration

# Update the CSV processing to include a "Duration" column
def add_duration_to_csv(input_file, output_file='music_with_duration.csv'):
    try:
        # Read the CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            rows = list(reader)  # Read all rows as a list of dictionaries

        # Ensure 'Time' column exists
        if 'Time' not in rows[0]:
            print("The input CSV file must contain a 'Time' column.")
            return

        # Collect valid durations
        valid_durations = []
        updated_rows = []  # This will hold rows with valid durations

        for i in range(len(rows) - 1):
            current_song = rows[i]
            next_song = rows[i + 1]

            # If the song title is a URL, set it to "Unknown"
            if current_song['Song Title'].startswith('https'):
                current_song['Song Title'] = "Unknown"

            current_start_time = current_song['Time']
            next_start_time = next_song['Time']

            # Calculate the duration of the current song based on the next song's start time
            duration = calculate_duration(current_start_time, next_start_time)

            # Check if duration is valid
            if timedelta(seconds=0) <= duration <= timedelta(minutes=7):
                valid_durations.append(duration)

        # Calculate the average duration of valid durations
        average_duration = calculate_average_duration(valid_durations)

        for i in range(len(rows) - 1):
            current_song = rows[i]
            next_song = rows[i + 1]

            current_start_time = current_song['Time']
            next_start_time = next_song['Time']

            duration = calculate_duration(current_start_time, next_start_time)

            # Set invalid durations to average duration
            if duration > timedelta(minutes=7) or duration < timedelta(seconds=0):
                duration = average_duration

            formatted_duration = format_duration(duration)  # Format the duration as MM:SS

            current_song['Duration'] = formatted_duration
            updated_rows.append(current_song)

        # For the last song, set the duration as "Unknown"
        rows[-1]['Duration'] = "Unknown"
        updated_rows.append(rows[-1])  # Add the last song

        # Write the updated data to a new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            fieldnames = reader.fieldnames + ['Duration']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()  # Write the header
            writer.writerows(updated_rows)  # Write updated rows

        print(f"CSV with song durations saved to {output_file}")

    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    input_csv = "sorted_music_history.csv"  # Replace with your file path
    add_duration_to_csv(input_csv, "music_with_duration.csv")


CSV with song durations saved to music_with_duration.csv


In [12]:
import csv
from datetime import timedelta

# Function to convert duration from 'MM:SS' to seconds
def duration_to_seconds(duration):
    try:
        minutes, seconds = map(int, duration.split(':'))
        return minutes * 60 + seconds
    except ValueError:
        return 0  # Skip invalid durations

# Initialize a dictionary to store listening time per day
listening_time_per_day = {}

# Read the CSV file
csv_file_path = "music_with_duration.csv"  # Replace with your actual file path
with open(csv_file_path, mode='r') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        date = row['Date']
        duration = row['Duration']
        seconds = duration_to_seconds(duration)

        if date in listening_time_per_day:
            listening_time_per_day[date] += seconds
        else:
            listening_time_per_day[date] = seconds

# Convert seconds back to hours, minutes, and seconds
listening_time = {
    date: str(timedelta(seconds=seconds))
    for date, seconds in listening_time_per_day.items()
}


Listening Time Per Day:
04 Sep 2024: 0:40:43
05 Sep 2024: 2:36:15
06 Sep 2024: 1:14:24
07 Sep 2024: 3:04:59
09 Sep 2024: 0:34:09
10 Sep 2024: 1:29:02
11 Sep 2024: 3:03:26
12 Sep 2024: 3:09:59
13 Sep 2024: 1:02:06
15 Sep 2024: 0:14:36
16 Sep 2024: 0:31:24
18 Sep 2024: 0:50:09
19 Sep 2024: 0:22:10
20 Sep 2024: 2:07:33
21 Sep 2024: 3:04:31
22 Sep 2024: 1:32:08
23 Sep 2024: 3:14:16
24 Sep 2024: 1:35:10
25 Sep 2024: 0:36:30
26 Sep 2024: 0:41:29
27 Sep 2024: 0:06:11
28 Sep 2024: 0:34:19
29 Sep 2024: 0:46:42
30 Sep 2024: 2:03:42
01 Oct 2024: 1:01:15
02 Oct 2024: 0:44:37
03 Oct 2024: 3:07:37
04 Oct 2024: 1:29:44
19 Oct 2024: 2:17:13
20 Oct 2024: 1:49:00
21 Oct 2024: 1:07:17
22 Oct 2024: 0:29:41
23 Oct 2024: 3:24:34
24 Oct 2024: 0:23:38
25 Oct 2024: 1:04:04
26 Oct 2024: 0:31:54
27 Oct 2024: 0:28:25
28 Oct 2024: 1:12:37
29 Oct 2024: 3:43:55
30 Oct 2024: 2:13:17
31 Oct 2024: 3:58:21
01 Nov 2024: 1:59:20
02 Nov 2024: 5:58:47
03 Nov 2024: 2:52:17
04 Nov 2024: 3:10:19
05 Nov 2024: 0:26:44
06 Nov 202

In [13]:
from datetime import datetime, timedelta

# Dictionary to store listening time per month
listening_time_per_month = {}

# Aggregate daily listening times into monthly listening times
for date, seconds in listening_time_per_day.items():
    # Parse the date and extract the month and year
    date_obj = datetime.strptime(date, '%d %b %Y')  # Adjust format to match your CSV's date format
    month_year = date_obj.strftime('%b %Y')  # Format as "Month Year" (e.g., "Jan 2024")

    # Add the daily listening time to the monthly total
    if month_year in listening_time_per_month:
        listening_time_per_month[month_year] += int(seconds)
    else:
        listening_time_per_month[month_year] = int(seconds)

# Convert seconds back to hours, minutes, and seconds for each month
listening_time_monthly = {
    month_year: str(timedelta(seconds=seconds))
    for month_year, seconds in listening_time_per_month.items()
}

# Print the listening time for each month
for month, time in listening_time_monthly.items():
    print(f"{month}: {time}")


Sep 2024: 1 day, 11:15:53
Oct 2024: 1 day, 5:07:09
Nov 2024: 1 day, 5:34:44
Dec 2024: 8:27:32


In [20]:
from datetime import datetime, timedelta

# Date when the decision was made
decision_date = datetime.strptime('12 Nov 2024', '%d %b %Y')

# Separate listening times into before and after the decision date
before_decision = {}
after_decision = {}

for date, seconds in listening_time_per_day.items():
    date_obj = datetime.strptime(date, '%d %b %Y')  # Parse the date
    if date_obj < decision_date:
        before_decision[date] = seconds
    else:
        after_decision[date] = seconds

# Function to calculate average daily and weekly listening times
def calculate_averages(listening_data):
    total_seconds = sum(listening_data.values())
    total_days = len(listening_data)
    average_daily = total_seconds / total_days if total_days > 0 else 0

    # Calculate weekly average (7-day chunks)
    average_weekly = total_seconds / (total_days / 7) if total_days > 0 else 0
    return average_daily, average_weekly

# Calculate averages for before and after the decision
avg_daily_before, avg_weekly_before = calculate_averages(before_decision)
avg_daily_after, avg_weekly_after = calculate_averages(after_decision)

# Convert seconds to HH:MM:SS for display
def format_time(seconds):
    return str(timedelta(seconds=int(seconds)))

print("Before Decision:")
print(f"  Average Daily Listening Time: {format_time(avg_daily_before)}")
print(f"  Average Weekly Listening Time: {format_time(avg_weekly_before)}")

print("\nAfter Decision:")
print(f"  Average Daily Listening Time: {format_time(avg_daily_after)}")
print(f"  Average Weekly Listening Time: {format_time(avg_weekly_after)}")


Before Decision:
  Average Daily Listening Time: 1:40:20
  Average Weekly Listening Time: 11:42:22

After Decision:
  Average Daily Listening Time: 0:28:33
  Average Weekly Listening Time: 3:19:53
