In [3]:
!pip install scikit-learn


Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp311-cp311-macosx_10_9_x86_64.whl (12.1 MB)
Downloading scipy-1.15.2-cp311-cp311-macosx_10_13_x86_64.whl (38.7 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m38.7/38.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0mm[36m0:00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Stop-word list: scikit-learn's built-in English words followed by link
# fragments that would otherwise show up as keywords.
custom_stop_words = [
    *ENGLISH_STOP_WORDS,
    "http", "https", "com", "www", "youtu",
    "youtube", "bit", "ly", "watch", "video",
]

# Input tables. Column notes are the author's:
#   youtube_data        -> Title, Views, Likes, CategoryId, Published Date, Channel Name, Id
#   df_description_tags -> Tags, Description
youtube_data = pd.read_csv("../data/youtube_data.csv")
df_description_tags = pd.read_csv("../data/youtube_description_and_tags.csv")

# Column groups that extract_tfidf() recognises; any other column is skipped.
columns_with_stop_words = ["Description"]
columns_without_stop_words = ["Title", "Tags"]

# Shared accumulator that extract_tfidf() fills: column name -> keywords.
keyword_results = dict()

def extract_tfidf(df, max_features=20):
    """Collect the vocabulary a TfidfVectorizer keeps for each text column of ``df``.

    Only columns named in ``columns_with_stop_words`` or
    ``columns_without_stop_words`` are processed; every other column is
    skipped. For each processed column a ``TfidfVectorizer`` is fitted on the
    column's non-null values and the resulting feature names are stored in the
    module-level ``keyword_results`` dict under the column name.

    Args:
        df: DataFrame whose text columns should be analysed.
        max_features: Vocabulary size cap for columns that are not listed in
            ``columns_with_stop_words`` (default 20). Columns listed in
            ``columns_with_stop_words`` always use a cap of 50.

    Returns:
        The shared ``keyword_results`` dict, mapping column name to the array
        of feature names (or ``[]`` when the column has no non-null values).
        Every call returns this same dict object, so results from successive
        calls accumulate in it.
    """
    text_columns = set(columns_with_stop_words) | set(columns_without_stop_words)

    for col in df.columns:
        if col not in text_columns:
            continue  # Skip irrelevant columns

        # Drop missing cells and coerce the rest to str so that a stray
        # numeric value does not crash the vectorizer's text preprocessing.
        text_data = df[col].dropna().astype(str).tolist()

        if not text_data:  # Nothing to fit on: record an empty keyword list
            keyword_results[col] = []
            continue

        # Columns in columns_with_stop_words get a larger vocabulary; all other
        # columns honour the caller's max_features (previously a hard-coded 20
        # silently overrode the parameter).
        max_feats = 50 if col in columns_with_stop_words else max_features

        # The custom stop words are applied to every processed column, which is
        # what the original code did in practice: its per-column `stop_words`
        # variable was computed but never passed to the vectorizer.
        vectorizer = TfidfVectorizer(stop_words=custom_stop_words, max_features=max_feats)

        # Only the learned vocabulary is needed, so fit without building the
        # TF-IDF matrix that the original computed and discarded.
        vectorizer.fit(text_data)

        keyword_results[col] = vectorizer.get_feature_names_out()

    return keyword_results

# Extract keywords from the video table.
print("\n Processing Video Data:")
video_keywords = extract_tfidf(df=youtube_data)

# Extract keywords from the description/tags table.
print("\n Processing Description & Tags Data:")
description_tags_keywords = extract_tfidf(df=df_description_tags)

# Report the keywords found for each text column.
for heading, terms in (
    ("\n Title Keywords:", video_keywords.get("Title")),
    ("\n Description Keywords:", description_tags_keywords.get("Description")),
    ("\n Tags Keywords:", description_tags_keywords.get("Tags")),
):
    print(heading, terms)



 Processing Video Data:

 Processing Description & Tags Data:

 Title Keywords: ['12' 'advice' 'best' 'doing' 'future' 'jordan' 'life' 'matter' 'meaning'
 'monster' 'motivation' 'motivational' 'negative' 'peterson' 'reminder'
 'rules' 'screwed' 'speeches' 'treat' 'worth']

 Description Keywords: ['30' 'adventure' 'advice' 'amzn' 'books' 'chriswillx' 'contact' 'content'
 'discount' 'don' 'educational' 'free' 'going' 'idea' 'instagram' 'jordan'
 'jordanbpeterson' 'jordanpetersonvideos' 'just' 'know' 'life' 'like'
 'modernwisdom' 'motivational' 'mulliganbrothers' 'order' 'owner'
 'patreon' 'people' 'peterson' 'purpose' 'really' 'rules' 'say' 'share'
 'speaker' 'tell' 'thing' 'things' 'think' 'try' 'twitter' 'use' 'used'
 'user' 've' 'videos' 'want' 'way' 'wordtothewise']

 Tags Keywords: ['2018' 'advice' 'best' 'chris' 'interview' 'jordan' 'life' 'modern'
 'motivation' 'motivational' 'peterson' 'podcast' 'powerful' 'productive'
 'rules' 'self' 'speech' 'success' 'williamson' 'wisdom']


##### Before I added the custom stop words, my output was:


 Processing Video Data:

 Processing Description & Tags Data:

 Title Keywords: ['12' '4k' 'advice' 'amazing' 'anything' 'best' 'change' 'choice'
 'control' 'discipline' 'doing' 'jordan' 'life' 'motivation' 'peterson'
 'to' 'worth' 'you' 'your' 'yourself']

 Description Keywords: ['30' 'advice' 'amzn' 'bit' 'chriswillx' 'com' 'contact' 'content' 'don'
 'educational' 'free' 'http' 'https' 'idea' 'instagram' 'jordan'
 'jordanbpeterson' 'just' 'know' 'life' 'like' 'ly' 'modernwisdom'
 'motivational' 'order' 'owner' 'patreon' 'people' 'peterson' 'purpose'
 'really' 'rules' 'say' 'tell' 'thing' 'things' 'think' 'try' 'twitter'
 'used' 've' 'video' 'videos' 'want' 'watch' 'way' 'wordtothewise' 'www'
 'youtu' 'youtube']

 Tags Keywords: ['2018' 'advice' 'be' 'best' 'chris' 'for' 'jordan' 'life' 'modern'
 'motivation' 'motivational' 'peterson' 'podcast' 'powerful' 'speech' 'to'
 'video' 'williamson' 'wisdom' 'your']

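### → So I added custom stop words: link fragments (`http`, `https`, `com`, `www`, `youtu`, `youtube`, `bit`, `ly`, `watch`, `video`) on top of scikit-learn's built-in English stop-word list, applied to every column.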

Note: Sentiment analysis will not be needed for comments

In [3]:
# Inspect the shared dict filled by extract_tfidf(): column name -> keywords.
print(keyword_results)

{'Title': array(['12', 'advice', 'best', 'doing', 'future', 'jordan', 'life',
       'matter', 'meaning', 'monster', 'motivation', 'motivational',
       'negative', 'peterson', 'reminder', 'rules', 'screwed', 'speeches',
       'treat', 'worth'], dtype=object), 'Tags': array(['2018', 'advice', 'best', 'chris', 'interview', 'jordan', 'life',
       'modern', 'motivation', 'motivational', 'peterson', 'podcast',
       'powerful', 'productive', 'rules', 'self', 'speech', 'success',
       'williamson', 'wisdom'], dtype=object), 'Description': array(['30', 'adventure', 'advice', 'amzn', 'books', 'chriswillx',
       'contact', 'content', 'discount', 'don', 'educational', 'free',
       'going', 'idea', 'instagram', 'jordan', 'jordanbpeterson',
       'jordanpetersonvideos', 'just', 'know', 'life', 'like',
       'modernwisdom', 'motivational', 'mulliganbrothers', 'order',
       'owner', 'patreon', 'people', 'peterson', 'purpose', 'really',
       'rules', 'say', 'share', 'speaker', 'tell

In [7]:
# Build one row per text field from the results dict, then flip the frame so
# each text field becomes a column of keywords.
keyword_df = pd.DataFrame.from_dict(keyword_results, orient="index").T

In [11]:
# Plain-text view of the keyword table (one column per text field).
print(keyword_df)

           Title          Tags           Description
0             12          2018                    30
1         advice        advice             adventure
2           best          best                advice
3          doing         chris                  amzn
4         future     interview                 books
5         jordan        jordan            chriswillx
6           life          life               contact
7         matter        modern               content
8        meaning    motivation              discount
9        monster  motivational                   don
10    motivation      peterson           educational
11  motivational       podcast                  free
12      negative      powerful                 going
13      peterson    productive                  idea
14      reminder         rules             instagram
15         rules          self                jordan
16       screwed        speech       jordanbpeterson
17      speeches       success  jordanpeterson

In [19]:
# Replace missing cells with a readable placeholder; keyword_df itself is left
# unchanged because fillna returns a new frame.
keyword_df_filled = keyword_df.fillna('N/A')

# Show the filled frame. The original cell printed keyword_df here, so the
# result of the fill was never displayed.
print(keyword_df_filled)

           Title          Tags           Description
0             12          2018                    30
1         advice        advice             adventure
2           best          best                advice
3          doing         chris                  amzn
4         future     interview                 books
5         jordan        jordan            chriswillx
6           life          life               contact
7         matter        modern               content
8        meaning    motivation              discount
9        monster  motivational                   don
10    motivation      peterson           educational
11  motivational       podcast                  free
12      negative      powerful                 going
13      peterson    productive                  idea
14      reminder         rules             instagram
15         rules          self                jordan
16       screwed        speech       jordanbpeterson
17      speeches       success  jordanpeterson

In [17]:
# Save the keyword table to CSV without the row index.
# NOTE(review): this writes keyword_df, not keyword_df_filled, so the 'N/A'
# placeholder is not written to the file — confirm that is intended.
keyword_df.to_csv("../data/keywords_tags_titles_descriptions.csv", index=False)