In [47]:
import pandas as pd
import numpy as np
from apiclient.discovery import build
from pymongo import MongoClient
import os
from dotenv import load_dotenv
import string

# Set up the DB and YouTube Data API

In [12]:
# Connect to the Mongo cloud DB named by MONGO_URI and select the database.

load_dotenv()  # load variables from a local .env file into the environment

uri = os.getenv('MONGO_URI')
if not uri:
    # os.getenv returns None when the variable is missing, and MongoClient(None)
    # falls back to its default host instead of failing. Stop here so data is
    # never written to an unintended server.
    raise RuntimeError("MONGO_URI is not set; add it to the environment or the .env file")

mongo_client = MongoClient(uri)

db = mongo_client['youtube-db']

In [13]:
# Handles to the two collections used below: training and test documents.
train_col, test_col = db['training_col'], db['test_col']

In [14]:
# Set up the YouTube Data API v3 client.
key = os.getenv('GOOGLE_API_KEY')
if not key:
    # os.getenv returns None when the variable is missing. Fail here with a
    # clear message rather than building a key-less client whose requests
    # would presumably be rejected later — TODO confirm.
    raise RuntimeError("GOOGLE_API_KEY is not set; add it to the environment or the .env file")

client = build('youtube', 'v3', developerKey=key)

# Collecting Data

In [15]:
def to_binary_category(c_id):
    """Map a category id to a binary label.

    Returns 1 (educational) when ``c_id`` is one of 27, 28, 25 or 35,
    and 0 (non educational) for any other value.
    """
    educational_ids = (27, 28, 25, 35)
    return 1 if c_id in educational_ids else 0

def remove_punctuation(text):
    """Return ``text`` lower-cased with punctuation removed.

    Every character found in ``string.punctuation`` is dropped; all other
    characters are kept and lower-cased one at a time.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch.lower())
    return "".join(kept)

In [6]:
# Search queries, one per channel, used to collect candidate videos.
queries = [
    "TED-Ed - Lessons Worth Sharing",
    "SmarterEveryDay - To teach you something new every day!",
    "Vsauce",
    "AsapSCIENCE",
    "National Geographic | Science, Exploration And Adventure",
    "CrashCourse | Educational Videos",
    "Numberphile",
    "Computerphile",
    "In a nutshell - Kurzegesagt",
    "Ted Talks",
    "Veritasium",
    "Vox - Explain the news",
    "Khan academy english",
    "The Backyard Scientist",
    "Big think - Get smarter, faster, for success in the knowledge economy",
    "MIT OpenCourseWare",
    "Science Channel | Science Videos",
    "minuteearth",
    "3Blue1Brown",
    "Washington Post",
    "The Organic Chemistry Tutor",
    "tecmath | Math Videos",
    "BBC Earth Lab",
    "Stanford University",
    "Astrum | YouTube",
    "Philosophy Tube | Philosophical YouTube Channel",
    "BrainCraft",
    "OUlearn",
    "BSI Academy",
]

In [None]:
# Fetch and insert data for multiple queries (educational).
# For each query: search for videos, look up their snippets, clean the text,
# label the category and store the records in the training collection.

for q in queries:
    print(q)
    # Run the search query (videos only, up to 50 results).
    search = client.search().list(q=q, part='snippet', type='video', maxResults=50).execute()

    # Extract the video ids from the search results.
    videoIds = [v['id']['videoId'] for v in search['items']]
    if not videoIds:
        # Nothing matched this query, so there is nothing to look up or insert.
        continue

    # Fetch the snippet (title, description, categoryId) for all video ids.
    data = client.videos().list(id=videoIds, part="snippet").execute()
    cleaned_data = [
        {
            "video_id": e['id'],
            'title': remove_punctuation(e['snippet']['title']),
            'description': remove_punctuation(e['snippet']['description']),
            # categoryId is converted with int() before labelling.
            'category_id': to_binary_category(int(e['snippet']['categoryId'])),
        }
        for e in data['items']
    ]

    # insert_many rejects an empty document list, so only insert when the
    # lookup actually returned something; one empty query must not stop the run.
    if cleaned_data:
        train_col.insert_many(cleaned_data)

In [10]:
# Load every training document labelled educational (category_id == 1).
data = list(train_col.find({"category_id": 1}))
# The recorded output of this cell is a count, but an assignment alone displays
# nothing; end the cell with the length so a fresh run shows it again.
len(data)

435

In [16]:
# Load the trending-videos CSV from the local data directory.
df = pd.read_csv(filepath_or_buffer="./data/US_youtube_trending_data.csv")

In [25]:
# Turn the trending-videos frame into cleaned records ready for insertion.
# Missing titles/descriptions become "" up front so remove_punctuation always
# receives a string, and columns are walked with zip instead of iterrows(),
# which avoids building (and mutating) a Series per row.
titles = df["title"].fillna("")
descriptions = df["description"].fillna("")

data = [
    {
        "video_id": video_id,
        "title": remove_punctuation(title),
        "description": remove_punctuation(description),
        "category_id": to_binary_category(category_id),
    }
    for video_id, title, description, category_id in zip(
        df["video_id"], titles, descriptions, df["categoryId"]
    )
]

# Show the first record as a sanity check.
data[0]

{'video_id': '3C66w5Z0ixs',
 'title': 'i asked her to be my girlfriend',
 'description': 'subscribe to brawadis ▶ httpbitlysubscribetobrawadis\r\rfollow me on social\r▶ twitter httpstwittercombrawadis\r▶ instagram httpswwwinstagramcombrawadis\r▶ snapchat brawadis\r\rhi i’m brandon awadis and i like to make dope vlogs pranks reactions challenges and basketball videos don’t forget to subscribe and come be a part of the brawadsquad',
 'category_id': 0}

In [26]:
# Number of records currently held in `data`.
len(data)

131190

In [57]:
# Add rows 45,000-55,000 of `data` to the training collection.
# Intended training split per the original note: rows 0-20,000 and
# 25,000-55,000 (the note reads "25.000 and 55.000"; presumably a range).
# NOTE(review): this cell inserts only one slice; the other training rows were
# presumably inserted by earlier runs with different bounds — confirm.
train_batch = data[45000:55000]
train_col.insert_many(train_batch)

<pymongo.results.InsertManyResult at 0x7fce2c7e4980>

In [48]:
# Collect the training documents labelled educational (category_id == 1).
d = [doc for doc in train_col.find({"category_id": 1})]

In [49]:
# Number of documents currently held in `d`.
len(d)

2772

In [50]:
# Count the training documents labelled non educational (category_id == 0).
d = [doc for doc in train_col.find({"category_id": 0})]
len(d)

17778

In [58]:
# Add rows 55,000-60,000 of `data` to the test collection.
# Intended test split per the original note: rows 20,000-25,000 and 55,000-60,000.
# NOTE(review): this cell inserts only the second slice; the first was
# presumably inserted by an earlier run with different bounds — confirm.
test_batch = data[55000:60000]
test_col.insert_many(test_batch)

<pymongo.results.InsertManyResult at 0x7fce2c64b0c0>

In [54]:
# Count the test documents labelled educational (category_id == 1).
d = [doc for doc in test_col.find({"category_id": 1})]
len(d)

140