### (II):

In [1]:
import re

def language_detector(text):
    greek_pattern = re.compile(r'\b[α-ωά-ώΆ-ΏίϊΐόύϋΰήΑ-ΩΊΪΌΎΫ\s]+\b', re.IGNORECASE)
    english_pattern = re.compile(r'\b[a-zA-Z\s]+\b', re.IGNORECASE)
    greeklish_pattern = re.compile(r'\b[α-ωά-ώΆ-Ώίϊΐόύϋΰήa-zA-Z\s]*'
                              r'(?:g|G|γ|Γ|th|TH|χ|Χ|ei|ou|th|ch|ph|ai|oi|ei|oi|si|ti|ri|ni|xi|psi|tsi|κ|Κ|άι|έι|όι|ού|υι|ευ|ηυ|αυ|άϊ|έϊ|ώϊ|οϊ|ϋι|ίς|ής|ος|ως|ας|ές|ής|ίς|ός|ύς|ώς|εί|αί|οί|ου|άς|ές|ής|ίς|ός|ύς|ώς|εί|αί|οί|ου|ά|έ|ή|ί|ό|ύ|ώ)?'
                              r'[a-zA-Z]+'
                              r'(?:is|aki|akis)?'
                              r'[a-zA-Z\s]*\b', re.IGNORECASE)


    greek_matches = greek_pattern.findall(text)
    english_matches = english_pattern.findall(text)
    greeklish_matches = greeklish_pattern.findall(text)

    greek_ratio = len(''.join(greek_matches)) / len(text)
    english_ratio = len(''.join(english_matches)) / len(text)
    greeklish_ratio = len(''.join(greeklish_matches)) / len(text)

    if greek_ratio > 0.5 and english_ratio < 0.2:
        return "Greek"
    elif english_ratio > 0.5 and greek_ratio < 0.2:
        return "English"
    elif greeklish_ratio > 0.5:
        return "Greeklish"
    else:
        return "Other"


(1), (2), (3):

In [2]:
# !pip3 install selenium
# !pip3 install beautifulsoup4
# !pip3 install webdriver-manager
import requests, time, random, pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from typing import Tuple, Dict, List
from selenium.webdriver.common.by import By
    
def ScrapComment(url) -> Tuple[str, Dict[str,str], List[str], str]:
    """
    Scrapes comments from a YouTube video page.
    
    Args:
    - url (str): The URL of the YouTube video.
    
    Returns:
    - tuple: A tuple containing the page title, comments dictionary, and recommended links list.
    """
    
    
    """ Since we are using Selenium to interact with the webpage,
            there is no direct need for checking of HTTP status codes.
            
       Essentially we are using a headless Chrome browser controlled by Selenium to simulate user interaction with the YouTube page.
    """
    
    # Set up Chrome optiosn to run the browser in headliess mode (without GUI)
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    options.add_argument("--lang=en-GB")
    
    
    # Iniatialize a Chrome WebDriver using the ChromeDriverManager to handle the executable path
    driver = webdriver.Chrome(
        executable_path=ChromeDriverManager().install(),
        options=options)
    
    # Open the specified URL in the Chrome browser
    driver.get(url)
    
    # Initialize variables for comment scraping
    prev_h = 0           # Variable to track the previous scroll height
    comments_dict = {}   # Dictionary to store comments
    comment_number = 1   # Counter for tracking comment numbers
    recommended_links = []  # List to store recommended video links
    
    # Scroll through the page to load content dynamically
    while True:
        # Calculate the current height of the page
        height = driver.execute_script("""
                function getActualHeight() {
                    return Math.max(
                        Math.max(document.body.scrollHeight, document.documentElement.scrollHeight),
                        Math.max(document.body.offsetHeight, document.documentElement.offsetHeight),
                        Math.max(document.body.clientHeight, document.documentElement.clientHeight)
                    );
                }
                return getActualHeight();
            """)
        
        # Scroll the pabe by 300 pixels each time
        driver.execute_script(f"window.scrollTo({prev_h},{prev_h + 300})")
        # Fix the time sleep value according to your network connection
        time.sleep(2)
        
        # Update the previous scroll height for the next iteration
        prev_h += 200  
        # Break if scroll reaches the end of the page
        if prev_h >= height:
            break
            
    # Find the element with the specified XPath
    # date_elements = driver.find_elements(By.XPATH, '//*[@id="header-author"]/yt-formatted-string/a')
    

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    
    # Fetch all the comments via CSS selector
    comment_divs = soup.select("#content #content-text")
    # Select the page's title via CSS selector
    page_title = soup.select_one("#container h1").text
    # Find html tag and atribute of recommented next yt_videos
    other_titles = soup.find_all('span', {'id': 'video-title'})
    
    # Find the posting date displayed in YT comments
    posting_date_elements = soup.select('#header-author yt-formatted-string a')
   

    # Iterate through the list: other_titles
    for next_title in other_titles:
        # Hold each title as text
        title_text = next_title.text
        # Detect the language of each title
        lang_detect = language_detector(title_text)
        # Apply our language detector to each title
        
        if lang_detect == 'Greeklish' or lang_detect == 'Greek':
            
            # Find the <a> element with the attribute of interest href
            link_element = next_title.find_parent('a') # 
            # Append each href in a list
            recommended_links.append(link_element.get('href'))
            
            # Print result
            # print(f"Title: {title_text} \t\t href:{recommended_links[-1]} \t\t Lang: {lang_detect}")
            
    # Collect comments
    for i, comment_div in enumerate(comment_divs):
        # Extract the text content of each comment
        comment_text = comment_div.text
        
        # Extract the text content of the corresponding date element
        posting_date = posting_date_elements[i].text
        print(f"Posting  date displayed as:{posting_date}" )
        
        # Create a key-value pair in the comments_dict, where the key is formatted as "Comment{comment_number}"
        comments_dict[f"Comment{i + 1}"] = {
            'text': comment_text,
            'date': posting_date # Add the date_text information
        }
        # Increment the comment_number for the next comment
        comment_number += 1
    
    
    return page_title , comments_dict, recommended_links




def get_comments_for_links(link: str) -> pd.DataFrame :
    """
    Gets comments for a list of YouTube video links.
    
    Args:
    - link : A randomly selected link from the code creator.
    
    Returns:
    - DataFrame: A DataFrame containing page titles, comments, and URLs.
    """
    
    # Create an empty DataFrame to store comments
    df_comments = pd.DataFrame(columns=['page_title', 'comments', 'url'])
    
    # Process the initial URL separately
    initial_url = link
    print(f"Fetching comments for: {initial_url}")
    
    page_title, comments, recommended_links = ScrapComment(initial_url)
    
    # Create a DataFrame for the initial set of comments
    df_initial = pd.DataFrame({
        'page_title': [page_title] * len(comments),
        'comments': [comment['text'] for comment in comments.values()],
        'url': [initial_url] * len(comments),
        'date':[comment['date'] for comment in comments.values()]
    })
    
    # Concatenate the initial DataFrame with the overall DataFrame
    df_comments = pd.concat([df_comments, df_initial], ignore_index=True)
    
    # Iniatialize a variable to count the iterations of the following loop
    # essentially counting the page jumps to yt reccomended videos
    iters = 0
    # Iterate through the list of links and get comments for each
    for _ in range(len(recommended_links)):
        run_time = time.time()
        # Randomly select a url from recommended_links
        link = random.choice(recommended_links)
        # Re-create the full youtube url
        
        full_url = f"https://www.youtube.com{link}"
        
        # Inform abou the link we current crawling from
        print(f"Fetching comments for: {full_url}")
        
        # Re-use the function and update the returning parameters
        page_title,comments, recommended_links= ScrapComment(full_url)
        
        # Create a DataFrame for the current set of comments
        df_current = pd.DataFrame({
            'page_title': [page_title] * len(comments),
            'comments': [comment['text'] for comment in comments.values()],
            'url': [full_url] * len(comments),
            'date':[comment['date'] for comment in comments.values()]
        })
        
        
        
        # Concatenate the current DataFrame with the overall DataFrame
        df_comments = pd.concat([df_comments, df_current], ignore_index=True)
        
        # Increment by 1 each time
        iters += 1
        
        # After 15 page jumps break the iterations or break at 10 min of running time.
        if iters == 5: # Should add run time exceptions
            break;
    
    return df_comments      
        
              

# Starting point for our crawling
url = r'https://www.youtube.com/watch?v=vSMMBjg_CKc'

# Create an instance of the returing DataFrame
df_comments = get_comments_for_links(url)
              
print(df_comments)




Fetching comments for: https://www.youtube.com/watch?v=vSMMBjg_CKc
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 7 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 10 ημέρες
Posting  date displayed as:πριν από 3 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 7 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 10 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date displayed as:πριν από 7 ημέρες
Posting  date displayed as:πριν από 11 ημέρες
Posting  date dis

Fetching comments for: https://www.youtube.com/watch?v=avB5OtMKQRc
Posting  date displayed as:πριν από 53 λεπτά
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 3 ώρες
Posting  date displayed as:πριν από 3 ώρες
Posting  date displayed as:πριν από 3 ώρες
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 3 ώρες (τροποποιήθηκε)
Posting  date displayed as:πριν από 2 ώρες
Posting  date displayed as:πριν από 3 ώρες
Posting  date displayed as:πριν από 3 ώρες
Posting  date displayed as:πριν από 1 ώρα
Posting  date displayed as:πριν από 1 ώρα
Posting  date displayed as:πριν από 2 ώρες
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 2 ώρες
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 1 ώρα
Posting  date displayed as:πριν από 4 ώρες
Posting  date displayed as:πριν από 5 ώρες
Posting  date d

In [10]:
df_comments.isna().sum()

page_title    0
comments      0
url           0
date          0
dtype: int64

In [11]:
df_comments.to_csv("../csv/crawl.csv", index=False, encoding='utf-8')

In [17]:
df_comments= pd.read_csv("../csv/crawl.csv")
df_comments['date']

0             11 days ago
1       πριν από 7 ημέρες
2      πριν από 11 ημέρες
3      πριν από 11 ημέρες
4      πριν από 10 ημέρες
              ...        
288       πριν από 4 ώρες
289        πριν από 1 ώρα
290        πριν από 1 ώρα
291        πριν από 1 ώρα
292        πριν από 1 ώρα
Name: date, Length: 293, dtype: object

Let's transform date column to a more understandable for-the-machine format.

First translate to english:

In [18]:
# Install the googletrans library
# !pip install googletrans==4.0.0-rc1

from googletrans import Translator

def translate_greek_to_english(text):
    """
    Translates text from Greek to English using the Google Translate API.

    Args:
    - text (str): The input text in Greek.

    Returns:
    - str: The translated text in English.
    """
    # Create a Translator instance
    translator = Translator()

    # Use the translate method to perform the translation
    # src='el' specifies the source language as Greek, and dest='en' specifies the target language as English
    translation = translator.translate(text, src='el', dest='en')

    # Return the translated text
    return translation.text


df_comments['date'] = df_comments['date'].apply(translate_greek_to_english)
print(df_comments['date'])

0         11 days ago
1          7 days ago
2         11 days ago
3         11 days ago
4         10 days ago
            ...      
288    before 4 hours
289        1 hour ago
290        1 hour ago
291        1 hour ago
292        1 hour ago
Name: date, Length: 293, dtype: object


Then transform to coded format:

In [19]:
from datetime import datetime, timedelta
import re

def convert_relative_date(relative_date_str):
    # Extract the number of days from the input string using regular expression
    days_ago = int(re.search(r'\d+', relative_date_str).group())

    # Calculate the actual date by subtracting the timedelta
    actual_date = datetime.now() - timedelta(days=days_ago)

    # Format the actual date as a string with only the date part
    actual_date_str = actual_date.strftime('%Y-%m-%d')

    return actual_date_str

# Use the function to 'data' column
df_comments['date']  = df_comments['date'].apply(convert_relative_date)
df_comments['date']

0      2023-11-18
1      2023-11-22
2      2023-11-18
3      2023-11-18
4      2023-11-19
          ...    
288    2023-11-25
289    2023-11-28
290    2023-11-28
291    2023-11-28
292    2023-11-28
Name: date, Length: 293, dtype: object

Unnamed: 0,page_title,comments,url,date
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
...,...,...,...,...
288,,,,
289,,,,
290,,,,
291,,,,


In [30]:
df_comments = df_comments.dropna()
print(f"Total NaN values in dataframe: {df_comments.isna().sum().sum()}")

Total NaN values in dataframe: 0


In [31]:
df_comments.to_csv("../csv/crawl.csv", index=False, encoding='utf-8')

(4):

In [32]:
df_comments.url.value_counts()


url
https://www.youtube.com/watch?v=X1ev3RxDnNI    175
https://www.youtube.com/watch?v=vSMMBjg_CKc     41
https://www.youtube.com/watch?v=avB5OtMKQRc     38
https://www.youtube.com/watch?v=gf2BG6ZFe2A     27
Name: count, dtype: int64

In [33]:
df_comments.page_title.value_counts()

page_title
ΑΜΕΤΑΝΟΗΤΟΣ Μπέος βγαίνει στον ΑΝΤ1 και τα ΞΑΝΑΧΩΝΕΙ στου πάντες χωρίς να θέλει να προσβάλει ΚΑΝΕΝΑΝ    175
Το ΜΑΚΕΛΕΙΟ Προσπαθεί Να Μπλέξει Και Τον HAYATE?                                                         41
Σταθακόπουλος: Τα «Γλυπτά του Παρθενώνα» ήταν μόνο η αφορμή                                              38
ΕΚΑΨΕ ΕΓΚΕΦΑΛΟ ο Τρύφωνος, με τύπο που ακούει τον εαυτό του αλλά δεν του λέει ΤΙΠΟΤΑ!                    27
Name: count, dtype: int64