
Commit ab83470

Merge pull request avinashkranjan#791 from XZANATOL/Youtube_Trending_Feed_Scrapper
Added Youtube Trending Feed Scrapper
2 parents 34645f1 + 11aef5d commit ab83470

File tree

3 files changed: +262 -0 lines changed

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
# Youtube Trending Feed Scrapper

This is a pair of scripts that scrape and read the first 10 trending videos on YouTube from any of its available categories. Be it what's happening right ``Now``, in ``Gaming``, in ``Music``, or in ``Movies``, you will get it on your local machine.

# Installation

* Install the following Python libraries:

> ``pip3 install selenium pymongo mongoengine pandas``

* Place ChromeDriver in the same directory as the script. You can download it from [here](https://sites.google.com/a/chromium.org/chromedriver/downloads). <br>
(Note: Download the version that matches your Chrome browser.)

* Install MongoDB Community Server on your machine. You can refer to the installation guide [here](https://docs.mongodb.com/manual/administration/install-community/).

# Usage

The scripts allow you to save the scraped content using 2 methods:

1) A MongoDB database called ``Youtube``, with the data saved in a collection called ``trending``.
2) A CSV file called ``Youtube.csv``.

You can save using either method or both; it's up to you. The same goes for ``scrap_reader.py``: it can read from either MongoDB or the CSV file.

* For saving-to/reading-from MongoDB, pass the ``-m`` argument.
* For saving-to/reading-from a CSV file, pass the ``-c`` argument.

Example invocations are shown below.
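
For illustration, assuming the scraper script is saved as ``scrapper.py`` (the actual filename may differ):

> ``python3 scrapper.py -m -c`` (scrape and save to both MongoDB and ``Youtube.csv``)

> ``python3 scrap_reader.py -c`` (read the saved feed back from ``Youtube.csv``)

Note that ``scrap_reader.py`` expects exactly one of ``-m``/``-c``, while the scraper accepts either or both.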
# Output

Whichever save method is used, each record will contain these video attributes:

1) Video Section
2) Video Title
3) Video Link
4) Video Channel
5) Video Views
6) Video Date
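
In the CSV case, ``Youtube.csv`` uses the column order set in ``save_to_csv()``. An illustrative (hypothetical) row:

> ``section,title,channel,link,views,date``

> ``Gaming,Some Video Title,Some Channel,https://www.youtube.com/watch?v=...,1.2M views,2 days ago``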

# Authors

Written by [XZANATOL](https://www.github.com/XZANATOL).

The project was built as a contribution during [GSSOC'21](https://gssoc.girlscript.tech/).
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Youtube Trending Feed Reader
# Written by XZANATOL
from optparse import OptionParser
from pymongo import MongoClient
import pandas as pd
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit
-c, --csv Reads data from "Youtube.csv" file
-m, --mongo Reads data from MongoDB
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Reads data from 'Youtube.csv' file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Reads data from MongoDB.")


def read_mongo():
    # Connect to the local MongoDB service
    client = MongoClient("127.0.0.1")
    # Get a handle to the "trending" collection of the "Youtube" database
    db = client.Youtube.trending
    return db.find()  # Return all records


def read_csv():
    # Read database
    df = pd.read_csv("Youtube.csv")
    data = []
    for index, row in df.iterrows():
        data.append(row)  # Append each row to the list
    return data  # Return all records


def display(data):
    i = 0
    for card in data:
        # Every 10 cards marks the start of a new section
        if i % 10 == 0:
            c = input("Show Section? [y/n] > ")
            if c.lower() == "y":
                print("***********************************")
                print(f"""{card["section"]} section""")
                print("***********************************")
            else:
                sys.exit()  # Had enough of reading
        i += 1  # Increment
        print("Title:", card["title"])
        print("Link:", card["link"])
        print("Channel:", card["channel"])
        print("Views:", card["views"])
        print("Time:", card["date"])
        print("==============================================")


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: exactly one of the two must be set
    if not (bool(csv) ^ bool(mongo)):  # XNOR gate
        print(usage)
        sys.exit()

    if mongo:
        data = read_mongo()
    else:
        data = read_csv()
    display(data)
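
A quick way to sanity-check that the scraper actually wrote documents before running this reader is to count the records in the same collection. A minimal sketch, assuming the same localhost MongoDB used above (illustrative snippet, not part of the commit):

from pymongo import MongoClient

client = MongoClient("127.0.0.1")
# Count the documents the scraper saved to the Youtube.trending collection
print(client.Youtube.trending.count_documents({}))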
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# Youtube Trending Feed Scrapper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
import pandas as pd
import mongoengine
import pymongo
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit.
-c, --csv Saves extracted contents to a CSV file.
-m, --mongo Saves extracted contents to a MongoDB.
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Saves extracted contents to a MongoDB.")

# Defined DataFrame to avoid check errors
df = pd.DataFrame()

# MongoDB Collection (Table) Template
class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}


def load_driver():
    """Load Chrome webdriver."""
    driver = webdriver.Chrome("chromedriver.exe")
    return driver


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Make sure the whole page is loaded
        # Extract the first 10 entries
        cards = driver.find_elements_by_tag_name("ytd-video-renderer")[:10]
        links = driver.find_elements_by_id("video-title")[:10]
        meta_data = driver.find_elements_by_tag_name("ytd-video-meta-block")[:10]
        for i in range(10):
            # Split meta data that will be saved
            meta_splitted = meta_data[i].text.split("\n")
            # Sometimes this character is extracted for unknown reasons
            try:
                meta_splitted.remove("•")
            except ValueError:
                pass
            section = sections[num]  # Scraped from which section?
            link = links[i].get_attribute("href")  # Video link
            title = links[i].text  # Video title
            channel = meta_splitted[0]  # Channel name
            views = meta_splitted[1]  # Video views
            date = meta_splitted[2]  # Release date

            # Argument validation is better than a separate scraping routine for each format
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping '{sections[num]}' section!")

    # Last validation for csv
    if csv:
        save_to_csv()


def save_to_db(section, title, channel, link, views, date):
    """Saves a record to the database."""
    # Create object
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save record
    record.save()


def append_to_df(section, title, channel, link, views, date):
    """Appends a record to the dataframe."""
    global df
    df = df.append({"section": section,
                    "title": title,
                    "channel": channel,
                    "link": link,
                    "views": views,
                    "date": date}, ignore_index=True)


def save_to_csv():
    """Exports the dataframe to a CSV file."""
    global df
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
# Function end (eye-friendly comment to separate the end of the function)


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one of the two must be set
    if not (bool(csv) or bool(mongo)):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load driver
    page_scrap(driver)  # Start scraping
    print("[+] Done!")
    # End session
    driver.quit()
    sys.exit()
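
A portability note: this script targets the Selenium 3 and pre-2.0 pandas APIs. Selenium 4 removed the ``find_elements_by_*`` helpers and the positional driver path, and pandas 2.0 removed ``DataFrame.append``. A minimal sketch of the modern equivalents, assuming Selenium 4+ and pandas 2+ (illustrative, not part of the commit):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

# Selenium 4: pass the driver path through a Service object
driver = webdriver.Chrome(service=Service("chromedriver.exe"))
driver.get("https://www.youtube.com/feed/trending")
cards = driver.find_elements(By.TAG_NAME, "ytd-video-renderer")[:10]
links = driver.find_elements(By.ID, "video-title")[:10]
driver.quit()

# pandas 2.x: concat a one-row frame instead of DataFrame.append
df = pd.DataFrame()
row = {"section": "Now", "title": "example", "channel": "example",
       "link": "https://example", "views": "1M views", "date": "1 day ago"}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)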
