
Commit ab83470

Merge pull request avinashkranjan#791 from XZANATOL/Youtube_Trending_Feed_Scrapper
Added Youtube Trending Feed Scrapper
2 parents 34645f1 + 11aef5d commit ab83470

File tree

3 files changed: +262 -0 lines changed

Lines changed: 42 additions & 0 deletions
@@ -0,0 +1,42 @@
# Youtube Trending Feed Scrapper

This is a pair of scripts that scrape and read the first 10 trending videos on YouTube from any of its available categories. Be it what's happening right ``Now``, in ``Gaming``, in ``Music``, or in ``Movies``, you will get it on your local machine.

# Installation

* Install the following Python libraries:

> ``pip3 install selenium pymongo mongoengine pandas``

* Place ChromeDriver in the same directory as the script. You can download it from [here](https://sites.google.com/a/chromium.org/chromedriver/downloads). <br>
(Note: Download the version that matches your Chrome browser.)

* Install MongoDB Community Server on your machine. You can refer to the installation guide [here](https://docs.mongodb.com/manual/administration/install-community/).

# Usage

The scripts allow you to save the scraped content using 2 methods:

1) A MongoDB database called ``Youtube``, with the data saved in a collection called ``trending``.
2) A CSV file called ``Youtube.csv``.

You can save using either method or both; it's up to you. The same goes for ``scrap_reader.py``: it can read from either MongoDB or the CSV file.

* For saving-to/reading-from MongoDB, pass the ``-m`` argument.
* For saving-to/reading-from a CSV file, pass the ``-c`` argument.

Example invocations are shown below.
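
For illustration, assuming the scraper script is saved as ``scrapper.py`` (the actual filename may differ):

> ``python3 scrapper.py -m -c`` (scrape and save to both MongoDB and ``Youtube.csv``)

> ``python3 scrap_reader.py -c`` (read the saved feed back from ``Youtube.csv``)

Note that ``scrap_reader.py`` expects exactly one of ``-m``/``-c``, while the scraper accepts either or both.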
# Output

Whichever save method is used, each record will contain these video attributes:

1) Video Section
2) Video Title
3) Video Link
4) Video Channel
5) Video Views
6) Video Date
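
In the CSV case, ``Youtube.csv`` uses the column order set in ``save_to_csv()``. An illustrative (hypothetical) row:

> ``section,title,channel,link,views,date``

> ``Gaming,Some Video Title,Some Channel,https://www.youtube.com/watch?v=...,1.2M views,2 days ago``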

# Authors

Written by [XZANATOL](https://www.github.com/XZANATOL).

The project was built as a contribution during [GSSOC'21](https://gssoc.girlscript.tech/).
Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
# Youtube Trending Feed Reader
# Written by XZANATOL
from optparse import OptionParser
from pymongo import MongoClient
import pandas as pd
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit
-c, --csv Reads data from "Youtube.csv" file
-m, --mongo Reads data from MongoDB
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Reads data from 'Youtube.csv' file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Reads data from MongoDB.")


def read_mongo():
    # Connect to the local MongoDB service
    client = MongoClient("127.0.0.1")
    # Get a handle to the "trending" collection of the "Youtube" database
    db = client.Youtube.trending
    return db.find()  # Return all records


def read_csv():
    # Read database
    df = pd.read_csv("Youtube.csv")
    data = []
    for index, row in df.iterrows():
        data.append(row)  # Append each row to the list
    return data  # Return all records


def display(data):
    i = 0
    for card in data:
        # Every 10 cards marks the start of a new section
        if i % 10 == 0:
            c = input("Show Section? [y/n] > ")
            if c.lower() == "y":
                print("***********************************")
                print(f"""{card["section"]} section""")
                print("***********************************")
            else:
                sys.exit()  # Had enough of reading
        i += 1  # Increment
        print("Title:", card["title"])
        print("Link:", card["link"])
        print("Channel:", card["channel"])
        print("Views:", card["views"])
        print("Time:", card["date"])
        print("==============================================")


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: exactly one of the two must be set
    if not (bool(csv) ^ bool(mongo)):  # XNOR gate
        print(usage)
        sys.exit()

    if mongo:
        data = read_mongo()
    else:
        data = read_csv()
    display(data)
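
A quick way to sanity-check that the scraper actually wrote documents before running this reader is to count the records in the same collection. A minimal sketch, assuming the same localhost MongoDB used above (illustrative snippet, not part of the commit):

from pymongo import MongoClient

client = MongoClient("127.0.0.1")
# Count the documents the scraper saved to the Youtube.trending collection
print(client.Youtube.trending.count_documents({}))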
Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
# Youtube Trending Feed Scrapper
# Written by XZANATOL
from optparse import OptionParser
from selenium import webdriver
import pandas as pd
import mongoengine
import pymongo
import time
import sys

# Help menu
usage = """
<Script> [Options]

[Options]
-h, --help Shows this help message and exit.
-c, --csv Saves extracted contents to a CSV file.
-m, --mongo Saves extracted contents to a MongoDB.
"""

# Load args
parser = OptionParser()
parser.add_option("-c", "--csv", action="store_true", dest="csv", help="Saves extracted contents to a CSV file.")
parser.add_option("-m", "--mongo", action="store_true", dest="mongo", help="Saves extracted contents to a MongoDB.")

# Defined DataFrame to avoid check errors
df = pd.DataFrame()

# MongoDB Collection (Table) Template
class Trending(mongoengine.Document):
    section = mongoengine.StringField(required=True)
    title = mongoengine.StringField(required=True)
    channel = mongoengine.StringField(required=True)
    link = mongoengine.StringField(required=True)
    views = mongoengine.StringField(required=True)
    date = mongoengine.StringField(required=True)

    meta = {"indexes": ["section"]}


def load_driver():
    """Load Chrome webdriver."""
    driver = webdriver.Chrome("chromedriver.exe")
    return driver


def page_scrap(driver):
    """Scrape the YouTube trending feed."""
    # Pages to be scraped: Now, Music, Gaming, Movies
    pages = ["https://www.youtube.com/feed/trending",
             "https://www.youtube.com/feed/trending?bp=4gINGgt5dG1hX2NoYXJ0cw%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIcGhpnYW1pbmdfY29ycHVzX21vc3RfcG9wdWxhcg%3D%3D",
             "https://www.youtube.com/feed/trending?bp=4gIKGgh0cmFpbGVycw%3D%3D"]
    sections = ["Now", "Music", "Gaming", "Movies"]

    for num in range(4):
        driver.get(pages[num])
        time.sleep(3)  # Make sure the whole page is loaded
        # Extract the first 10 entries
        cards = driver.find_elements_by_tag_name("ytd-video-renderer")[:10]
        links = driver.find_elements_by_id("video-title")[:10]
        meta_data = driver.find_elements_by_tag_name("ytd-video-meta-block")[:10]
        for i in range(10):
            # Split meta data that will be saved
            meta_splitted = meta_data[i].text.split("\n")
            # Sometimes this character is extracted for unknown reasons
            try:
                meta_splitted.remove("•")
            except ValueError:
                pass
            section = sections[num]  # Scraped from which section?
            link = links[i].get_attribute("href")  # Video link
            title = links[i].text  # Video title
            channel = meta_splitted[0]  # Channel name
            views = meta_splitted[1]  # Video views
            date = meta_splitted[2]  # Release date

            # Argument validation is better than a separate scraping routine for each format
            if mongo:
                save_to_db(section, title, channel, link, views, date)
            if csv:
                append_to_df(section, title, channel, link, views, date)

        print(f"[+] Finished scraping '{sections[num]}' section!")

    # Last validation for csv
    if csv:
        save_to_csv()


def save_to_db(section, title, channel, link, views, date):
    """Saves a record to the database."""
    # Create object
    record = Trending(
        section=section,
        title=title,
        channel=channel,
        link=link,
        views=views,
        date=date)
    # Save record
    record.save()


def append_to_df(section, title, channel, link, views, date):
    """Appends a record to the dataframe."""
    global df
    df = df.append({"section": section,
                    "title": title,
                    "channel": channel,
                    "link": link,
                    "views": views,
                    "date": date}, ignore_index=True)


def save_to_csv():
    """Exports the dataframe to a CSV file."""
    global df
    df.to_csv("Youtube.csv", index=False, columns=["section", "title",
                                                   "channel", "link",
                                                   "views", "date"])
# Function end (eye-friendly comment to separate the end of the function)


if __name__ == "__main__":
    (options, args) = parser.parse_args()

    # Flags
    csv = options.csv
    mongo = options.mongo
    # Validate flags: at least one of the two must be set
    if not (bool(csv) or bool(mongo)):
        print(usage)
        sys.exit()

    if mongo:
        mongoengine.connect("Youtube")

    driver = load_driver()  # Load driver
    page_scrap(driver)  # Start scraping
    print("[+] Done!")
    # End session
    driver.quit()
    sys.exit()
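
A portability note: this script targets the Selenium 3 and pre-2.0 pandas APIs. Selenium 4 removed the ``find_elements_by_*`` helpers and the positional driver path, and pandas 2.0 removed ``DataFrame.append``. A minimal sketch of the modern equivalents, assuming Selenium 4+ and pandas 2+ (illustrative, not part of the commit):

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
import pandas as pd

# Selenium 4: pass the driver path through a Service object
driver = webdriver.Chrome(service=Service("chromedriver.exe"))
driver.get("https://www.youtube.com/feed/trending")
cards = driver.find_elements(By.TAG_NAME, "ytd-video-renderer")[:10]
links = driver.find_elements(By.ID, "video-title")[:10]
driver.quit()

# pandas 2.x: concat a one-row frame instead of DataFrame.append
df = pd.DataFrame()
row = {"section": "Now", "title": "example", "channel": "example",
       "link": "https://example", "views": "1M views", "date": "1 day ago"}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)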
