

### Goal: build an ML model that takes a video's features (title and description) and classifies the video as **educational** or **non-educational**

## 1. Collecting training data

### Set up the DB, the YouTube Data API client and helper functions

In [None]:
# Connect to MongoDB; the connection string is read from the MONGO_URI env var.
# NOTE(review): os.getenv returns None when the variable is unset, so a missing
# MONGO_URI is passed straight to MongoClient -- confirm that is acceptable.
uri = os.getenv('MONGO_URI')
mongo_client = MongoClient(uri)
db = mongo_client['youtube-db']

# Handles to the two collections. train_col receives the records inserted in
# the cells below; test_col is not used in the cells visible here.
train_col = db['training_col']
test_col = db['test_col']

# Set up the YouTube Data API v3 client; the API key comes from GOOGLE_API_KEY
# (None if unset -- see the note above on os.getenv).
key = os.getenv('GOOGLE_API_KEY')

client = build('youtube', 'v3', developerKey=key)

# Categories to 0 (non educational) or 1 (educational)
# Category ids treated as educational; every other id maps to 0.
EDUCATIONAL_CATEGORY_IDS = (27, 28, 25, 35)


def to_binary_category(c_id):
    """Map a YouTube category id to a binary label.

    Args:
        c_id: Category id as an integer, or as a numeric string such as
            "27" (the search loop in this notebook receives the id in a
            form it has to cast with int()).

    Returns:
        1 if the id is one of EDUCATIONAL_CATEGORY_IDS (educational),
        otherwise 0 (non educational). A string that is not a valid
        integer also yields 0.
    """
    if isinstance(c_id, str):
        # Without this coercion "27" never equals 27 and the video would be
        # silently labelled non educational.
        try:
            c_id = int(c_id)
        except ValueError:
            return 0
    return 1 if c_id in EDUCATIONAL_CATEGORY_IDS else 0

# Normalising text: drop punctuation characters and lowercase the rest
def remove_punctuation(text):
    """Return `text` lowercased, without any character found in `string.punctuation`.

    Characters outside `string.punctuation` (for example typographic quotes)
    are kept.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch.lower())
    return "".join(kept)

### a) Collecting data using popular educational videos

In [None]:
# Search strings used to collect videos from popular educational channels.
# Each entry is sent verbatim as the `q` parameter of a search request.
queries = [
    "TED-Ed - Lessons Worth Sharing",
    "SmarterEveryDay - To teach you something new every day!",
    "Vsauce",
    "AsapSCIENCE",
    "National Geographic | Science, Exploration And Adventure",
    "CrashCourse | Educational Videos",
    "Numberphile",
    "Computerphile",
    "In a nutshell - Kurzegesagt",
    "Ted Talks",
    "Veritasium",
    "Vox - Explain the news",
    "Khan academy english",
    "The Backyard Scientist",
    "Big think - Get smarter, faster, for success in the knowledge economy",
    "MIT OpenCourseWare",
    "Science Channel | Science Videos",
    "minuteearth",
    "3Blue1Brown",
    "Washington Post",
    "The Organic Chemistry Tutor",
    "tecmath | Math Videos",
    "BBC Earth Lab",
    "Stanford University",
    "Astrum | YouTube",
    "Philosophy Tube | Philosophical YouTube Channel",
    "BrainCraft",
    "OUlearn",
    "BSI Academy",
]

In [None]:
# Fetch and insert data for multiple queries (educational)

for q in queries:
    print(q)
    # Search for up to 50 videos matching the query.
    search = client.search().list(q=q, part='snippet', type='video', maxResults=50).execute()

    # Keep only the video ids from the search hits.
    videoIds = [v['id']['videoId'] for v in search['items']]
    if not videoIds:
        # No hits: there is nothing to look up or insert for this query.
        continue

    # Fetch the snippet (title, description, categoryId) for those ids.
    data = client.videos().list(id=videoIds, part="snippet").execute()
    cleaned_data = [
        {
            "video_id": e['id'],
            'title': remove_punctuation(e['snippet']['title']),
            'description': remove_punctuation(e['snippet']['description']),
            # Stored under 'category_id', but the value is the binary
            # educational / non educational label, not the raw category.
            'category_id': to_binary_category(int(e['snippet']['categoryId'])),
        }
        for e in data['items']
    ]

    # insert_many rejects an empty document list, so only insert when the
    # lookup actually returned videos.
    if cleaned_data:
        train_col.insert_many(cleaned_data)

### b) Collecting data using a dataset from Kaggle (better option)

In [None]:
# Load the Kaggle trending-videos CSV from the local ./data directory.
# The cells below read its video_id, title, description and categoryId columns.
df = pd.read_csv("./data/US_youtube_trending_data.csv")

In [None]:
# Turn every CSV row into a cleaned record with a binary label.
data = []
columns = zip(df["video_id"], df["title"], df["description"], df["categoryId"])
for video_id, title, description, category in columns:
    # A missing description is treated as empty text.
    if pd.isna(description):
        description = ""
    record = {
        "video_id": video_id,
        "title": remove_punctuation(title),
        "description": remove_punctuation(description),
        "category_id": to_binary_category(category),
    }
    data.append(record)

# Show the first record as a sanity check.
data[0]

{'video_id': '3C66w5Z0ixs',
 'title': 'i asked her to be my girlfriend',
 'description': 'subscribe to brawadis ▶ httpbitlysubscribetobrawadis\r\rfollow me on social\r▶ twitter httpstwittercombrawadis\r▶ instagram httpswwwinstagramcombrawadis\r▶ snapchat brawadis\r\rhi i’m brandon awadis and i like to make dope vlogs pranks reactions challenges and basketball videos don’t forget to subscribe and come be a part of the brawadsquad',
 'category_id': 0}

In [None]:
# Insert one slice of the Kaggle records into the training collection.
# NOTE(review): the original note read "index 0 to 20.000 and 25.000 and 55.000 is
# train data" -- presumably rows 0-20,000 and 25,000-55,000 were inserted by running
# this cell several times with different slices. Confirm before re-running: each run
# inserts the slice again.
train_col.insert_many(data[45000:55000]) # this run: rows 45,000 to 54,999

<pymongo.results.InsertManyResult at 0x7fce2c7e4980>

## 2. Building and testing ML models using scikit-learn

### a) SGD Classifier

In [None]:
# Building ML Pipeline: TF-IDF over unigrams and bigrams, then a linear
# classifier trained with SGD.
# NOTE(review): X_train / y_train are not defined in the cells visible here.

sgd_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    (
        'clf',
        SGDClassifier(
            loss='hinge',
            penalty='l2',
            alpha=1e-3,
            random_state=42,  # fixed seed
            max_iter=1000,
            tol=None,
        ),
    ),
])

sgd_clf.fit(X_train, y_train)

In [None]:
# Evaluating metrics on test data
# classification_report takes (y_true, y_pred). The true labels must come
# first, otherwise precision and recall are swapped and the support column
# counts predictions instead of true samples.

predicted = sgd_clf.predict(X_test)
print("Metrics on test data from split:")
print(metrics.classification_report(y_test, predicted))

# Evaluating clf against completely new random data

predicted_new = sgd_clf.predict(X_new)
print("Metrics on new random data:")
print(metrics.classification_report(y_new, predicted_new))

Metrics on test data from split:
              precision    recall  f1-score   support

           0       1.00      0.90      0.95      7970
           1       0.15      0.99      0.27       140

    accuracy                           0.91      8110
   macro avg       0.58      0.95      0.61      8110
weighted avg       0.99      0.91      0.94      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96      4948
           1       0.13      1.00      0.23        52

    accuracy                           0.93      5000
   macro avg       0.57      0.97      0.60      5000
weighted avg       0.99      0.93      0.96      5000



### b) Random Forest Classifier

In [None]:
# Building ML Pipeline: TF-IDF over unigrams and bigrams, then a random forest.
# NOTE(review): X_train / y_train are not defined in the cells visible here.

rf_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    # Seeded like the SGD pipeline so that a re-run reproduces the same forest.
    ('clf', RandomForestClassifier(random_state=42)),
    ])

rf_clf.fit(X_train, y_train)

In [None]:
# Evaluating metrics on test data
# Both metric helpers take (y_true, y_pred). The true labels must come first,
# otherwise precision and recall are swapped and the confusion matrix is
# transposed.

predicted = rf_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

# Evaluating clf against completely new random data

predicted_new = rf_clf.predict(X_new)
print("Metrics on new random data:")
print(metrics.classification_report(y_new, predicted_new))
# Rows are true labels, columns are predicted labels.
print(metrics.confusion_matrix(y_new, predicted_new))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7185
           1       0.98      1.00      0.99       925

    accuracy                           1.00      8110
   macro avg       0.99      1.00      0.99      8110
weighted avg       1.00      1.00      1.00      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.97      0.98      4743
           1       0.63      0.98      0.77       257

    accuracy                           0.97      5000
   macro avg       0.81      0.97      0.88      5000
weighted avg       0.98      0.97      0.97      5000

[[4596  147]
 [   6  251]]


### c) Multi-Layer Perceptron Classifier (best option)

In [None]:
# Building ML Pipeline: TF-IDF over unigrams and bigrams, then a multi-layer
# perceptron.
# NOTE(review): X_train / y_train are not defined in the cells visible here.

mlp_clf = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
    # Seeded like the SGD pipeline so that a re-run reproduces the same model.
    # NOTE(review): the step name 'cclf' looks like a typo for 'clf'; it is kept
    # because renaming it would change the parameter path (cclf__...) and any
    # lookup of the step by name.
    ('cclf', MLPClassifier(verbose=True, random_state=42))
    ])

mlp_clf.fit(X_train, y_train)

In [None]:
# Evaluating metrics on test data
# classification_report takes (y_true, y_pred). The true labels must come
# first, otherwise precision and recall are swapped and the support column
# counts predictions instead of true samples.
predicted = mlp_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))

# Evaluating clf against completely new random data
predicted_new = mlp_clf.predict(X_new)
print("Metrics on new random data:")
print(metrics.classification_report(y_new, predicted_new))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7176
           1       0.99      1.00      0.99       934

    accuracy                           1.00      8110
   macro avg       0.99      1.00      1.00      8110
weighted avg       1.00      1.00      1.00      8110

Metrics on new random data:
              precision    recall  f1-score   support

           0       1.00      0.97      0.99      9366
           1       0.71      0.97      0.82       634

    accuracy                           0.97     10000
   macro avg       0.85      0.97      0.90     10000
weighted avg       0.98      0.97      0.97     10000



# Next steps and challenges

### - optimize the features being analyzed

### - optimize the MLP parameters

### - integrate the created model in a web service using Flask

### - add feedback functionality