<a href="https://colab.research.google.com/github/Amna9191/DSA210_TermProject/blob/main/DSA210TermProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
# Extract data from the HTML file and save it to CSV

import csv
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime

# Function to export music history to a CSV file
def save_music_history_to_csv(music_history, output_file='music_history.csv'):
    """Write the listening history to ``output_file`` as a CSV file.

    Args:
        music_history: Iterable of mappings, each providing the keys
            ``song_title``, ``song_url``, ``channel_name``, ``date`` and
            ``time``.
        output_file: Path of the CSV file to create (overwritten if it
            already exists).

    A header row is written first, then one row per entry; a confirmation
    message is printed once the file has been written.
    """
    # CSV column name -> key holding that value in each history entry.
    column_to_key = {
        'Song Title': 'song_title',
        'Song URL': 'song_url',
        'Artist': 'channel_name',
        'Date': 'date',
        'Time': 'time',
    }

    with open(output_file, mode='w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=list(column_to_key))
        csv_writer.writeheader()
        for record in music_history:
            csv_writer.writerow(
                {column: record[key] for column, key in column_to_key.items()}
            )

    print(f"Music history successfully exported to {output_file}")

# Function to load and parse the HTML file
def load_html(file_path):
    """Open ``file_path`` as UTF-8 text and return it parsed by BeautifulSoup.

    The document is parsed with the ``lxml`` parser while the file is still
    open; the resulting ``BeautifulSoup`` object is returned.
    """
    with open(file_path, 'r', encoding='utf-8') as html_file:
        return BeautifulSoup(html_file, 'lxml')

# Function to extract YouTube Music history

# Prefix added to song links that are not already absolute URLs.
_MUSIC_BASE_URL = 'https://music.youtube.com'

# Placeholder values stored when a piece of information cannot be extracted.
_NO_ARTIST = "No artist found"
_NO_DATE = "No date found"
_NO_TIME = "No time found"

# Matches timestamps such as "Oct 20, 2024, 10:30:15 PM GMT+03:00".
# Group 1 is the date, group 3 is the 12-hour clock time.
_TIMESTAMP_RE = re.compile(
    r'((Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s\d{1,2},\s\d{4}),'
    r'\s([\d:]+\s[AP]M)\sGMT[+\-]\d{2}:\d{2}'
)


def _parse_history_timestamp(text):
    """Return ``(date, time)`` strings found in ``text``.

    The date is formatted as ``'%d %b %Y'`` and the time as 24-hour
    ``'%H:%M:%S'``.  When no timestamp is found, or the matched text cannot
    be parsed, the "No date found" / "No time found" placeholders are
    returned instead.
    """
    match = _TIMESTAMP_RE.search(text)
    if match:
        try:
            date = datetime.strptime(match.group(1), '%b %d, %Y').strftime('%d %b %Y')
            # Convert the 12-hour clock time to 24-hour format.
            time_24hr = datetime.strptime(match.group(3), '%I:%M:%S %p').strftime('%H:%M:%S')
        except ValueError:
            # The regex is looser than the strptime formats (e.g. a time
            # without seconds); treat such text as "not found" rather than
            # aborting the whole extraction.
            pass
        else:
            return date, time_24hr
    return _NO_DATE, _NO_TIME


def extract_youtube_music_history(soup):
    """Collect the YouTube Music entries from a parsed history document.

    Args:
        soup: Parsed document (as returned by ``load_html``) providing
            ``find_all``.

    Returns:
        A list of dicts with the keys ``song_title``, ``song_url``,
        ``channel_name``, ``date`` and ``time``, one per entry whose title
        paragraph contains "YouTube Music" and that has a song link.
        Missing artist, date or time values are replaced by the
        "No ... found" placeholders.
    """
    music_history = []

    music_entries = soup.find_all('div', class_='outer-cell mdl-cell mdl-cell--12-col mdl-shadow--2dp')

    for entry in music_entries:
        header = entry.find('p', class_='mdl-typography--title')
        if not (header and 'YouTube Music' in header.text.strip()):
            continue

        # The first link of the entry is taken to be the song itself.
        song_tag = entry.find('a', href=True)
        if not song_tag:
            continue

        song_title = song_tag.text.strip()
        song_url = song_tag.get('href')
        # Only prefix the base URL when the link is not already absolute;
        # prefixing an absolute link would produce a malformed URL.
        if not song_url.startswith(('http://', 'https://')):
            song_url = _MUSIC_BASE_URL + song_url

        # NOTE(review): find_next searches the rest of the document, not
        # just this entry, so it may pick up a link from outside the entry
        # when the entry itself has no channel link - confirm against the
        # real export.
        channel_tag = song_tag.find_next('a', href=True)
        if channel_tag:
            channel_name = channel_tag.text.strip().replace(" - Topic", "")
        else:
            # Always bind a value so an entry without a channel link neither
            # crashes nor inherits the previous entry's artist.
            channel_name = _NO_ARTIST

        timestamp_tag = entry.find('div', class_='content-cell mdl-cell mdl-cell--6-col mdl-typography--body-1')
        if timestamp_tag:
            date, time_24hr = _parse_history_timestamp(timestamp_tag.text.strip())
        else:
            date, time_24hr = _NO_DATE, _NO_TIME

        music_history.append({
            'song_title': song_title,
            'song_url': song_url,
            'channel_name': channel_name,
            'date': date,
            'time': time_24hr
        })

    return music_history

# Function to display or process the data
def print_music_history(music_history):
    """Print each history entry as labelled lines followed by a dashed rule.

    Args:
        music_history: Iterable of mappings providing the keys
            ``song_title``, ``song_url``, ``channel_name``, ``date`` and
            ``time``.
    """
    # (label shown to the user, key in the entry), in display order.
    labelled_keys = (
        ('Song Title', 'song_title'),
        ('Song URL', 'song_url'),
        ('Artist', 'channel_name'),
        ('Date', 'date'),
        ('Time', 'time'),
    )
    separator = "-" * 40

    for record in music_history:
        for label, key in labelled_keys:
            print(f"{label}: {record[key]}")
        print(separator)

In [11]:
# Function to filter songs by date range and save to a new CSV
def filter_songs_by_date_range(input_file, output_file, start_date, end_date):
    """Copy the rows of ``input_file`` dated within a range to ``output_file``.

    Args:
        input_file: Path of a CSV file with a ``Date`` column formatted as
            ``'%d %b %Y'`` (e.g. ``'01 Jul 2024'``).
        output_file: Path of the CSV file to write; it receives the columns
            Song Title, Song URL, Artist, Date and Time.
        start_date: First date to keep, formatted as ``'%d %b %Y'``.
        end_date: Last date to keep (inclusive), same format.

    Rows whose date is missing or cannot be parsed (for example the
    "No date found" placeholder) are skipped instead of aborting the whole
    run.  Any other error is reported with a printed message rather than
    raised, so callers never see an exception from this function.
    """
    date_format = '%d %b %Y'
    columns = ['Song Title', 'Song URL', 'Artist', 'Date', 'Time']

    try:
        # Convert start and end dates to datetime objects
        range_start = datetime.strptime(start_date, date_format)
        range_end = datetime.strptime(end_date, date_format)

        filtered_songs = []
        skipped_rows = 0

        # Read the input CSV file (newline='' lets the csv module handle
        # line endings inside quoted fields itself).
        with open(input_file, mode='r', newline='', encoding='utf-8') as file:
            for row in csv.DictReader(file):
                try:
                    song_date = datetime.strptime(row['Date'], date_format)
                except (TypeError, ValueError):
                    # Undated or malformed row: leave it out, keep going.
                    skipped_rows += 1
                    continue
                if range_start <= song_date <= range_end:
                    filtered_songs.append(row)

        # Write the filtered songs to the new CSV file; columns not listed
        # above are dropped instead of raising.
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=columns, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(filtered_songs)

        if skipped_rows:
            print(f"Skipped {skipped_rows} row(s) with a missing or invalid date")
        print(f"Filtered songs successfully saved to {output_file}")

    except Exception as e:
        # Deliberate best-effort boundary: report the problem, do not raise.
        print(f"An error occurred: {e}")


In [None]:
# Keep only the songs dated 01 Jul 2024 through 20 Oct 2024 (both ends
# included) and write them to a separate CSV file.

filter_songs_by_date_range(
    start_date='01 Jul 2024',
    end_date='20 Oct 2024',
    input_file='music_history.csv',
    output_file='filtered_music_history.csv',
)