<a href="https://colab.research.google.com/github/Amna9191/DSA210_TermProject/blob/main/DSA210TermProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Extract Data from html file and save to csv

import csv
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime

# Function to export music history to a CSV file
def save_music_history_to_csv(music_history, output_file='music_history.csv'):
    headers = ['Song Title', 'Song URL', 'Artist', 'Date', 'Time']  # Added 'Date' and 'Time'

    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=headers)
        writer.writeheader()  # Write the header row
        for entry in music_history:
            writer.writerow({
                'Song Title': entry['song_title'],
                'Song URL': entry['song_url'],
                'Artist': entry['channel_name'],
                'Date': entry['date'],
                'Time': entry['time']
            })

    print(f"Music history successfully exported to {output_file}")

# Function to load and parse the HTML file
def load_html(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        soup = BeautifulSoup(file, 'lxml')
    return soup

# Function to extract YouTube Music history
def extract_youtube_music_history(soup):
    music_history = []

    music_entries = soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp')

    for entry in music_entries:
        header = entry.find('p', class_='mdl-typography--title')
        if header and 'YouTube Music' in header.text.strip():
            song_tag = entry.find('a', href=True)
            if song_tag:
                song_title = song_tag.text.strip()
                song_url = song_tag.get('href')

                channel_tag = song_tag.find_next('a', href=True)
                if channel_tag:
                    channel_name = channel_tag.text.strip().replace(" - Topic", "")

                timestamp_tag = entry.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
                if timestamp_tag:
                    timestamp_full = timestamp_tag.text.strip()
                    match = re.search(r'((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2},\s\d{4}),\s([\d:]+\s[AP]M)\sGMT[+\-]\d{2}:\d{2}', timestamp_full)
                    if match:
                        date_full = match.group(1)  # Extract the full date
                        time_12hr = match.group(3)  # Extract the 12-hour format time

                        # Convert 12-hour format to 24-hour format
                        time_24hr = datetime.strptime(time_12hr, '%I:%M:%S %p').strftime('%H:%M:%S')
                        date = datetime.strptime(date_full, '%b %d, %Y').strftime('%d %b %Y')  # Format date to include day, month, and year
                    else:
                        date = "No date found"
                        time_24hr = "No time found"
                else:
                    date = "No date found"
                    time_24hr = "No time found"

                music_history.append({
                    'song_title': song_title,
                    'song_url': 'https://music.youtube.com' + song_url,
                    'channel_name': channel_name,
                    'date': date,
                    'time': time_24hr
                })

    return music_history

# Function to display or process the data
def print_music_history(music_history):
    for entry in music_history:
        print(f"Song Title: {entry['song_title']}")
        print(f"Song URL: {entry['song_url']}")
        print(f"Artist: {entry['channel_name']}")
        print(f"Date: {entry['date']}")
        print(f"Time: {entry['time']}")
        print("-" * 40)

In [2]:
# Function to filter songs by date range and save to a new CSV
def filter_songs_by_date_range(input_file, output_file, start_date, end_date):
    try:
        # Convert start and end dates to datetime objects
        start_date = datetime.strptime(start_date, '%d %b %Y')
        end_date = datetime.strptime(end_date, '%d %b %Y')

        filtered_songs = []

        # Read the input CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                song_date = datetime.strptime(row['Date'], '%d %b %Y')
                if start_date <= song_date <= end_date:
                    filtered_songs.append(row)

        # Write the filtered songs to the new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=['Song Title', 'Song URL', 'Artist', 'Date', 'Time'])
            writer.writeheader()
            writer.writerows(filtered_songs)

        print(f"Filtered songs successfully saved to {output_file}")

    except Exception as e:
        print(f"An error occurred: {e}")


In [3]:
file_path = 'watch-history.html'  # Path to your HTML file

if os.path.exists(file_path):
    soup = load_html(file_path)  # Load and parse the HTML file
    music_history = extract_youtube_music_history(soup)  # Extract music history

    if music_history:
        print_music_history(music_history)  # Display data (optional)
        save_music_history_to_csv(music_history, 'music_history.csv')  # Save to CSV
    else:
        print("No YouTube Music history found in the provided HTML file.")
else:
    print(f"File not found: {file_path}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
----------------------------------------
Song Title: Mukhda Vekh Ke
Song URL: https://music.youtube.comhttps://music.youtube.com/watch?v=ZxiCg4p5eXs
Artist: Mika Singh
Date: 10 Sep 2023
Time: 16:11:17
----------------------------------------
Song Title: The Hook Up Song
Song URL: https://music.youtube.comhttps://music.youtube.com/watch?v=JXZHzxusjPk
Artist: Vishal - Shekhar
Date: 10 Sep 2023
Time: 16:10:43
----------------------------------------
Song Title: Mumbai Dilli Di Kudiyaan
Song URL: https://music.youtube.comhttps://music.youtube.com/watch?v=UrOiwW4Rneg
Artist: Vishal - Shekhar
Date: 10 Sep 2023
Time: 16:07:12
----------------------------------------
Song Title: GHAGRA
Song URL: https://music.youtube.comhttps://music.youtube.com/watch?v=58jgsk7i3gQ
Artist: Rekha Bhardwaj
Date: 10 Sep 2023
Time: 16:05:27
----------------------------------------
Song Title: Udd Jaa Kaale Kaava [Climax Version]
Song URL: https://mus

In [4]:
# Filter history by dates
filter_songs_by_date_range(
  input_file='music_history.csv',
  output_file='filtered_music_history.csv',
  start_date='01 Jul 2024',
  end_date='20 Oct 2024'
)

Filtered songs successfully saved to filtered_music_history.csv


In [11]:
import csv
from datetime import datetime

# Function to sort a CSV file by date and time
def sort_csv_by_date_and_time(input_file, output_file='sorted_music_history.csv'):
    try:
        # Read the CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            rows = list(reader)  # Read all rows as a list of dictionaries

        # Ensure 'Date' and 'Time' columns exist
        if 'Date' not in rows[0] or 'Time' not in rows[0]:
            print("The input CSV file must contain 'Date' and 'Time' columns.")
            return

        # Sort rows by date and time
        sorted_rows = sorted(
            rows,
            key=lambda x: (datetime.strptime(x['Date'], '%d %b %Y'), datetime.strptime(x['Time'], '%H:%M:%S'))
        )

        # Write the sorted data to a new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=reader.fieldnames)
            writer.writeheader()  # Write the header
            writer.writerows(sorted_rows)  # Write sorted rows

        print(f"Sorted CSV successfully saved to {output_file}")

    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    input_csv = "filtered_music_history.csv"  # Replace with your file path
    sort_csv_by_date_and_time(input_csv, "sorted_music_history.csv")


Sorted CSV successfully saved to sorted_music_history.csv


In [16]:
import random
from datetime import datetime, timedelta
import csv

# Function to calculate song duration based on the start time of the next song
def calculate_duration(start_time_str, next_start_time_str):
    # Parse the start times
    start_time = datetime.strptime(start_time_str, '%H:%M:%S')
    next_start_time = datetime.strptime(next_start_time_str, '%H:%M:%S')

    # Calculate the duration as the difference between the next song's start time and the current song's start time
    duration = next_start_time - start_time

    # If duration exceeds 7 minutes or is negative (next song is on a later day)
    if duration > timedelta(minutes=7) or duration < timedelta(seconds=0):
        # Select a random duration between 2 and 7 minutes
        random_minutes = random.randint(2, 7)
        random_seconds = random.randint(0, 59)
        random_duration = timedelta(minutes=random_minutes, seconds=random_seconds)
        return random_duration

    return duration

# Function to format timedelta as "minutes:seconds"
def format_duration(duration):
    # Convert timedelta to minutes and seconds format (MM:SS)
    total_seconds = int(duration.total_seconds())
    minutes = total_seconds // 60
    seconds = total_seconds % 60
    return f"{minutes}:{seconds:02d}"

# Update the CSV processing to include a "Duration" column
def add_duration_to_csv(input_file, output_file='music_with_duration.csv'):
    try:
        # Read the CSV file
        with open(input_file, mode='r', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            rows = list(reader)  # Read all rows as a list of dictionaries

        # Ensure 'Time' column exists
        if 'Time' not in rows[0]:
            print("The input CSV file must contain a 'Time' column.")
            return

        # Add a "Duration" column for each song
        updated_rows = []  # This will hold rows with valid durations
        for i in range(len(rows) - 1):
            current_song = rows[i]
            next_song = rows[i + 1]

            # If the song title is a URL, set it to "Unknown"
            if current_song['Song Title'].startswith('https'):
                current_song['Song Title'] = "Unknown"

            current_start_time = current_song['Time']
            next_start_time = next_song['Time']

            # Calculate the duration of the current song based on the next song's start time
            duration = calculate_duration(current_start_time, next_start_time)
            formatted_duration = format_duration(duration)  # Format the duration as MM:SS

            # Only include songs with a valid duration (not 0:0:0)
            if formatted_duration != "0:00":
                current_song['Duration'] = formatted_duration
                updated_rows.append(current_song)  # Add row with valid duration

        # For the last song, set the duration as "Unknown" or any other default value
        rows[-1]['Duration'] = "Unknown"
        updated_rows.append(rows[-1])  # Add the last song (since no next song to calculate duration)

        # Write the updated data to a new CSV file
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            fieldnames = reader.fieldnames + ['Duration']
            writer = csv.DictWriter(file, fieldnames=fieldnames)
            writer.writeheader()  # Write the header
            writer.writerows(updated_rows)  # Write updated rows

        print(f"CSV with song durations saved to {output_file}")

    except FileNotFoundError:
        print(f"File not found: {input_file}")
    except Exception as e:
        print(f"An error occurred: {e}")

# Example usage
if __name__ == "__main__":
    input_csv = "sorted_music_history.csv"  # Replace with your file path
    add_duration_to_csv(input_csv, "music_with_duration.csv")


CSV with song durations saved to music_with_duration.csv
