In [1]:
import os
import requests
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
# GitHub repository URL containing text files
github_repo_url = 'https://github.com/612kelly/MapReduce_lyrics/tree/new_branch/taylor_lyrics/'

In [3]:
# GitHub repository URL
github_repo_owner = '612kelly'
github_repo_name = 'MapReduce_lyrics'
branch_name = 'new_branch'
api_url = f'https://api.github.com/repos/{github_repo_owner}/{github_repo_name}/git/trees/{branch_name}?recursive=1'


In [4]:
# Output folder for processed data
output_folder = r'C:\Users\User\Documents\Uni sem 9\Big Data\github\taylor_lyrics_processed2'

In [5]:
# Fetch the HTML content of the GitHub repository
response = requests.get(api_url)
data = response.json()

In [6]:
# Define a set of English stopwords
stop_words = set(stopwords.words('english'))

In [7]:
# Process each file in the repository
for item in data.get('tree', []):
    file_path = item.get('path', '')
    
    # Check if the file is a text file
    if file_path.endswith('.txt'):
        print(f"Processing file: {file_path}")

        # Fetch the content of the file
        raw_url = f'https://raw.githubusercontent.com/{github_repo_owner}/{github_repo_name}/{branch_name}/{file_path}'
        file_response = requests.get(raw_url)
        file_text = file_response.text

        # Remove words inside square brackets using regular expression
        processed_text = re.sub(r'\[.*?\]', '', file_text)

        # Remove content inside parentheses
        processed_text = re.sub(r'\(.*?\)', '', processed_text)

        # Remove numbers, commas, and other symbols
        processed_text = re.sub(r'[^a-zA-Z\s]', '', processed_text)

        # Remove stopwords from each line without rearranging
        processed_lines = []
        original_lines = processed_text.split('\n')
        for original_line in original_lines:
            filtered_line = ' '.join([word for word in original_line.split() if word.lower() not in stop_words])
            processed_lines.append(filtered_line)

        # Join the processed lines back into a string
        processed_text = '\n'.join(processed_lines)


        # Extract album name from the GitHub file path
        album_name = os.path.dirname(file_path).replace('taylor_lyrics/', '')
        print(f"Album name: {album_name}")

        # Save the preprocessed text in a file with the same name as the text file
        file_name = os.path.basename(file_path)
        output_path = os.path.join(output_folder, album_name, file_name)
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(processed_text)

Processing file: taylor_lyrics/01_taylor-swift/01_tim-mcgraw.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/02_picture-to-burn.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/03_teardrops-on-my-guitar.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/04_a-place-in-this-world.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/05_cold-as-you.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/06_the-outside.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/07_tied-together-with-a-smile.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/08_stay-beautiful.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/09_shouldve-said-no.txt
Album name: 01_taylor-swift
Processing file: taylor_lyrics/01_taylor-swift/10_marys-song.txt
Album name: 01_taylor-swift
Processing fil

In [8]:
raw_url

'https://raw.githubusercontent.com/612kelly/MapReduce_lyrics/new_branch/taylor_lyrics_processed/99_features/two-is-better-than-one.txt'

In [9]:
file_path

'taylor_lyrics_processed/99_features/two-is-better-than-one.txt'

In [10]:
os.path.basename(file_path)

'two-is-better-than-one.txt'

In [11]:
# Function to download and preprocess text from a GitHub repository
def preprocess_github_data(repo_url, output_folder):
    # Fetch the HTML content of the GitHub repository
    response = requests.get(repo_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links to text files in the repository
    links = soup.find_all('a', href=re.compile(r'\.txt$'))

    # Initialize the main output folder
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    print(f"Output folder: {output_folder}")

    # Loop through each link and preprocess the text
    for link in links:
        file_url = link['href']
        print(f"Processing file: {file_url}")

        file_response = requests.get(file_url)
        file_text = file_response.text

        # Remove words inside square brackets using regular expression
        processed_text = re.sub(r'\[.*?\]', '', file_text)

        # Remove content inside parentheses
        processed_text = re.sub(r'\(.*?\)', '', processed_text)

        # Remove numbers, commas, and other symbols
        processed_text = re.sub(r'[^a-zA-Z\s]', '', processed_text)

        # Extract album name from the file URL
        album_name = file_url.split('/')[-2]
        print(f"Album name: {album_name}")

        # Create a folder for the album if it doesn't exist
        album_folder = os.path.join(output_folder, album_name)
        if not os.path.exists(album_folder):
            os.makedirs(album_folder)

        # Save the preprocessed text in a file with the same name as the text file
        file_name = os.path.basename(file_url)
        output_path = os.path.join(album_folder, file_name)
        print(f"Saving to: {output_path}")

        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(processed_text)

In [12]:
# Perform preprocessing
preprocess_github_data(github_repo_url, output_folder)

Output folder: C:\Users\User\Documents\Uni sem 9\Big Data\github\taylor_lyrics_processed2
