# **1. Import Libraries and Read CSVs**
> - import general, visualization, nlp, and machine learning modules
> - read US YouTube Trending Data; <br> a dataset containing videos that trended in the United States

In [None]:
# General
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import pyLDAvis
import pyLDAvis.lda_model

# Text Preprocessing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Sentiment Analysis & Annotation
from textblob import TextBlob

# ML Model Requisites
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# ML Model Topic Clustering
from sklearn.decomposition import LatentDirichletAllocation

# ML Model Pipeline for Sentiment Analysis & Annotation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# ML Model Evaluation
from sklearn.metrics import classification_report #this prints accuracy, precision & recall
from sklearn.metrics import confusion_matrix

# ML Model Hypertuning
from sklearn.model_selection import GridSearchCV

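# Text Classification (Sentiment Analysis & Annotation) via ChatGPT
# NOTE: never store the OpenAI API key in this file. Load it at runtime (e.g. getpass.getpass() or the OPENAI_API_KEY environment variable); the key that was previously pasted here must be revoked.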
import os
import getpass
import openai
from skllm.config import SKLLMConfig
from skllm import ZeroShotGPTClassifier
from skllm import MultiLabelZeroShotGPTClassifier
from skllm import FewShotGPTClassifier
from skllm import DynamicFewShotGPTClassifier

# Text Summarization
from skllm.preprocessing import GPTSummarizer

# Text Translation
from skllm.preprocessing import GPTTranslator
from langdetect import detect

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Load the full US YouTube trending dataset from the current working directory.
# NOTE(review): the df.info() output recorded in this notebook lists an 'Unnamed: 0' column,
# which looks like a previously saved index — confirm, and consider index_col=0 if so.
df_concat = pd.read_csv("US_youtube_trending_data_full.csv")

# **2. Data Preparation**
> - series of EDA and Data Cleaning procedures

## **EDA 1**
> - get general information on the dataset







In [None]:
# 1. Copy df_concat to df and display info
df = df_concat.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237187 entries, 0 to 237186
Data columns (total 17 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Unnamed: 0         237187 non-null  int64 
 1   video_id           237187 non-null  object
 2   title              237187 non-null  object
 3   publishedAt        237187 non-null  object
 4   channelId          237187 non-null  object
 5   channelTitle       237187 non-null  object
 6   categoryId         237187 non-null  int64 
 7   trending_date      237187 non-null  object
 8   tags               237187 non-null  object
 9   view_count         237187 non-null  int64 
 10  likes              237187 non-null  int64 
 11  dislikes           237187 non-null  int64 
 12  comment_count      237187 non-null  int64 
 13  thumbnail_link     237187 non-null  object
 14  comments_disabled  237187 non-null  bool  
 15  ratings_disabled   237187 non-null  bool  
 16  description        2

## **Data Cleaning 1**
> - rename columns
> - map category labels
> - rearrange columns
> - convert columns to appropriate dtype
> - drop columns

In [None]:
# 1. Rename Columns
# Raw column name -> snake_case name used throughout the rest of the notebook.
column_rename_mapping = dict(
    title='video_title',
    publishedAt='publish_date',
    channelId='channel_id',
    channelTitle='channel_name',
    categoryId='category_id',
    tags='video_tags',
    view_count='views',
    comment_count='comments',
    description='video_description',
)
df = df.rename(columns=column_rename_mapping)

In [None]:
# 2. Map category_id with appropriate label
# Lookup table from the numeric category id to a human-readable category label.
# Note that ids 23 and 34 both carry the label 'Comedy'.
category_id_mapping = {
    1: 'Film & Animation',
    2: 'Autos & Vehicles',
    10: 'Music',
    15: 'Pets & Animals',
    17: 'Sports',
    18: 'Short Movies',
    19: 'Travel & Events',
    20: 'Gaming',
    21: 'Videoblogging',
    22: 'People & Blogs',
    23: 'Comedy',
    24: 'Entertainment',
    25: 'News & Politics',
    26: 'Howto & Style',
    27: 'Education',
    28: 'Science & Technology',
    29: 'Nonprofits & Activism',
    30: 'Movies',
    31: 'Anime/Animation',
    32: 'Action/Adventure',
    33: 'Classics',
    34: 'Comedy',
    35: 'Documentary',
    36: 'Drama',
    37: 'Family',
    38: 'Foreign',
    39: 'Horror',
    40: 'Sci-Fi/Fantasy',
    41: 'Thriller',
    42: 'Shorts',
    43: 'Shows',
    44: 'Trailers'
}
# Series.map leaves NaN for any category_id that is not a key of the table above.
df['category_name'] = df['category_id'].map(category_id_mapping)

In [None]:
# 3. Rearrange Columns
# Keep only the columns listed here, in this order; any column not listed is dropped.
desired_column_order = [
    'video_id', 'video_title', 'video_tags', 'video_description',
    'views', 'likes', 'dislikes', 'comments',
    'category_id', 'category_name',
    'channel_id', 'channel_name',
    'publish_date', 'trending_date',
    'thumbnail_link', 'comments_disabled', 'ratings_disabled'
]
# .copy() makes df an independent frame, so later column assignments do not
# write into a slice of the previous frame (SettingWithCopyWarning).
df = df[desired_column_order].copy()

In [None]:
# 4. Convert Columns to appropriate dtype
# astype(str) turns a missing value into the literal text 'nan', which would then be
# treated as a real word by the text processing further down. Blank out any missing
# descriptions first. assign() returns a new frame, so the assignments below never
# write into a slice of another frame.
df = df.assign(video_description=df['video_description'].fillna(''))

text_columns = [
    'video_id', 'video_title', 'video_tags', 'video_description',
    'category_id', 'category_name', 'channel_id', 'channel_name',
]
df = df.astype({column: str for column in text_columns})

# Parse both date columns into pandas datetimes.
for date_column in ('publish_date', 'trending_date'):
    df[date_column] = pd.to_datetime(df[date_column])

In [None]:
# 5. Drop Columns
# Engagement counters, the raw category id and thumbnail/flag columns are not used later on.
columns_to_drop = [
    'likes', 'dislikes', 'comments',
    'category_id', 'thumbnail_link',
    'comments_disabled', 'ratings_disabled',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# 6. Copy df to df_cleaned1
df_cleaned1 = df.copy(deep=True)

## **EDA 2**
> - get total number of unique Videos
> - get total number of unique videos per category
> - compare total number of unique videos vs. sum of total number of unique videos per category <br> to see if some videos are categorized differently on different trending dates

In [None]:
# 1. Copy df_cleaned1 to df and display info
df = df_cleaned1.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 237187 entries, 0 to 237186
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   video_id           237187 non-null  object             
 1   video_title        237187 non-null  object             
 2   video_tags         237187 non-null  object             
 3   video_description  237187 non-null  object             
 4   views              237187 non-null  int64              
 5   category_name      237187 non-null  object             
 6   channel_id         237187 non-null  object             
 7   channel_name       237187 non-null  object             
 8   publish_date       237187 non-null  datetime64[ns, UTC]
 9   trending_date      237187 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(1), object(7)
memory usage: 18.1+ MB


In [None]:
# 2. Get Total Number of Unique Videos
nunique_video = df['video_id'].nunique()
nunique_video

42088

In [None]:
# 3. Get Total Number of Unique Videos per Category
# One row per category with its distinct video_id count, largest category first.
nunique_video_per_category = (
    df.groupby('category_name')['video_id']
    .nunique()
    .reset_index()
    .sort_values(by='video_id', ascending=False)
    .reset_index(drop=True)
)
nunique_video_per_category

Unnamed: 0,category_name,video_id
0,Gaming,8399
1,Entertainment,8197
2,Music,6678
3,Sports,5392
4,People & Blogs,3624
5,Comedy,2052
6,Film & Animation,1630
7,News & Politics,1523
8,Science & Technology,1268
9,Howto & Style,1086


In [None]:
# 4. Compare Total Number of Unique Videos vs. Sum of Total Number of Unique Videos per Category to see if some videos are categorized differently on different trending dates
print(f'                      Total Number of Unique Video: {nunique_video}')
print(f' Sum of Total Number of Unique Videos per Category: {nunique_video_per_category.video_id.sum()}\n')

                      Total Number of Unique Video: 42088
 Sum of Total Number of Unique Videos per Category: 42128



## **Data Cleaning 2**
> - determine videos that have inconsistent category name assignments on different trending dates <br> and exclude them from the dataset

In [None]:
# 1. Determine videos that have inconsistent category name assignments on different trending dates and exclude them from the dataset

# Number of distinct category names seen for every video_id
category_counts = df.groupby('video_id')['category_name'].nunique()

# video_ids that appear under more than one category name
video_id_multiple_categories = category_counts[category_counts > 1].index
has_multiple_categories = df['video_id'].isin(video_id_multiple_categories)
videos_with_multiple_categories = df[has_multiple_categories]
unique_video_ids_different_categories = videos_with_multiple_categories['video_id'].unique()

# Keep only the rows whose video_id is not in that list
is_inconsistent = df['video_id'].isin(unique_video_ids_different_categories)
df = df[~is_inconsistent]

In [None]:
# 2. Copy df to df_cleaned2
df_cleaned2 = df.copy(deep=True)

## **EDA 3**
> - get total number of unique Videos
> - get total number of unique Videos per category
> - compare total number of unique videos vs. sum of total number of unique videos per category <br> to see if videos that have inconsistent category name assignments are excluded

In [None]:
# 1. Copy df_cleaned2 to df and display info
df = df_cleaned2.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 234330 entries, 0 to 237186
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   video_id           234330 non-null  object             
 1   video_title        234330 non-null  object             
 2   video_tags         234330 non-null  object             
 3   video_description  234330 non-null  object             
 4   views              234330 non-null  int64              
 5   category_name      234330 non-null  object             
 6   channel_id         234330 non-null  object             
 7   channel_name       234330 non-null  object             
 8   publish_date       234330 non-null  datetime64[ns, UTC]
 9   trending_date      234330 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(1), object(7)
memory usage: 19.7+ MB


In [None]:
# 2. Get Total Number of Unique Videos
nunique_video = df['video_id'].nunique()
nunique_video

42060

In [None]:
# 3. Get Total Number of Unique Videos per Category
# Recount distinct video_ids per category on the cleaned data, largest category first.
nunique_video_per_category = (
    df.groupby('category_name')['video_id']
    .nunique()
    .reset_index()
    .sort_values(by='video_id', ascending=False)
    .reset_index(drop=True)
)
nunique_video_per_category

Unnamed: 0,category_name,video_id
0,Gaming,8395
1,Entertainment,8185
2,Music,6664
3,Sports,5386
4,People & Blogs,3616
5,Comedy,2048
6,Film & Animation,1626
7,News & Politics,1519
8,Science & Technology,1265
9,Howto & Style,1083


In [None]:
# 4. Compare Total Number of Unique Videos vs. Sum of Total Number of Unique Videos per Category to see if videos that have inconsistent category name assignments are excluded
print(f'                      Total Number of Unique Video: {nunique_video}')
print(f' Sum of Total Number of Unique Videos per Category: {nunique_video_per_category.video_id.sum()}\n')

                      Total Number of Unique Video: 42060
 Sum of Total Number of Unique Videos per Category: 42060



## **EDA 4**

> - sort by trending date, then remove duplicates of each unique video <br> and keep the first occurrence (the one with the earliest trending date)
> - get number of days between the trending_date day and publish_date day inclusive of the trending_date


In [None]:
# 1. Copy df_cleaned2 to df and display info
df = df_cleaned2.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 234330 entries, 0 to 237186
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   video_id           234330 non-null  object             
 1   video_title        234330 non-null  object             
 2   video_tags         234330 non-null  object             
 3   video_description  234330 non-null  object             
 4   views              234330 non-null  int64              
 5   category_name      234330 non-null  object             
 6   channel_id         234330 non-null  object             
 7   channel_name       234330 non-null  object             
 8   publish_date       234330 non-null  datetime64[ns, UTC]
 9   trending_date      234330 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](2), int64(1), object(7)
memory usage: 19.7+ MB


In [None]:
# 2. Sort by trending_date then Remove duplicate rows based on 'video_id' and keep the first occurrence with the earliest trending_date
# After sorting, the first row seen for a video_id is the one with its earliest trending_date.
df = (
    df.sort_values(by='trending_date')
    .drop_duplicates(subset='video_id', keep='first')
)

In [None]:
# 3. Get Number of Days between the trending_date day and publish_date day inclusive of the trending_date
# .dt.days keeps only the whole-day component of the difference (rounded down), then 1 is added.
# In the sample rows shown in this notebook trending_date is a midnight timestamp while publish_date
# carries a time of day, e.g. published 2020-08-11 19:20 / trending 2020-08-12 00:00 -> 0 + 1 = 1.
# A video published later on the same calendar day than its trending timestamp gets -1 + 1 = 0.
# NOTE(review): a video published at exactly 00:00:00 would get one more day than one published
# a second later — confirm this edge case is acceptable.
df['days_to_trend'] = (df['trending_date'] - df['publish_date']).dt.days + 1

In [None]:
# 4. Copy df to df_eda4
df_eda4 = df.copy(deep=True)

## **EDA 5**
> - get total number of unique videos per category
> - pick a category to focus on

In [None]:
# 1. Copy df_eda4 to df and display info
df = df_eda4.copy(deep=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42060 entries, 0 to 237056
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype              
---  ------             --------------  -----              
 0   video_id           42060 non-null  object             
 1   video_title        42060 non-null  object             
 2   video_tags         42060 non-null  object             
 3   video_description  42060 non-null  object             
 4   views              42060 non-null  int64              
 5   category_name      42060 non-null  object             
 6   channel_id         42060 non-null  object             
 7   channel_name       42060 non-null  object             
 8   publish_date       42060 non-null  datetime64[ns, UTC]
 9   trending_date      42060 non-null  datetime64[ns, UTC]
 10  days_to_trend      42060 non-null  int64              
dtypes: datetime64[ns, UTC](2), int64(2), object(7)
memory usage: 3.9+ MB


In [None]:
# 2. Get Total Number of Unique Videos per Category
df['category_name'].value_counts()

category_name
Gaming                   8395
Entertainment            8185
Music                    6664
Sports                   5386
People & Blogs           3616
Comedy                   2048
Film & Animation         1626
News & Politics          1519
Science & Technology     1265
Howto & Style            1083
Education                 996
Autos & Vehicles          832
Travel & Events           241
Pets & Animals            185
Nonprofits & Activism      19
Name: count, dtype: int64

In [None]:
# 3. Pick a Category to focus on
# Boolean row mask for the chosen category; everything else is discarded.
is_people_and_blogs = df['category_name'] == 'People & Blogs'
df = df[is_people_and_blogs]

In [None]:
# 4. How long does it take for People & Blogs content to trend?
# NOTE: most People & Blogs videos trend within 5 days of being published
# Frequency of each days_to_trend value, most common first.
df['days_to_trend'].value_counts().sort_values(ascending=False)

days_to_trend
2     1686
1     1684
3      133
0       40
4       33
5       17
6        6
27       3
9        3
7        2
13       2
10       2
11       2
12       1
15       1
23       1
Name: count, dtype: int64

In [None]:
# 5. Copy df to df_eda5
df_eda5 = df.copy(deep=True)

In [None]:
df.head()

Unnamed: 0,video_id,video_title,video_tags,video_description,views,category_name,channel_id,channel_name,publish_date,trending_date,days_to_trend
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,brawadis|prank|basketball|skits|ghost|funny vi...,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...,1514614,People & Blogs,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,2020-08-11 19:20:14+00:00,2020-08-12 00:00:00+00:00,1
132,EukpxRuDUzA,Prepping My Apartment for My Boyfriend to Move...,blind|blind girl|retinitis pigmentosa|motivati...,Go to Squarespace.com/mollyburke for 10% off y...,379425,People & Blogs,UCwf9TcLyS5KDoLRLjke41Hg,Molly Burke,2020-08-08 19:30:03+00:00,2020-08-12 00:00:00+00:00,4
134,Idp1xvmLPHo,LaBrant Family Baby Boy Name Reveal.,cole and sav|the labrant fam|labrant fam|the l...,Please continue to keep our baby in your praye...,5849640,People & Blogs,UC4-CH0epzZpD_ARhxCx6LaQ,The LaBrant Fam,2020-08-08 17:26:41+00:00,2020-08-12 00:00:00+00:00,4
145,Of_C9g9O9KY,BABY BOY IS HERE! *OFFICIAL LABOR AND DELIVERY*,kyler and mad|taytum and oakley|the fishfam|ne...,We are thrilled! Our baby boy is here! This is...,1577287,People & Blogs,UCJTyunmsBLj20wyguh6uMig,The Fishfam,2020-08-07 18:30:06+00:00,2020-08-12 00:00:00+00:00,5
110,qn515i8xr-4,Surprising Thomas with a Motorcycle!!,surprise|friend surprise|best friend|best frie...,Thank you to Indian Motorcycle for helping Amm...,176552,People & Blogs,UCTd7KzdwnFE3lm6LCfYDmUQ,Yes Theory PLUS,2020-08-09 16:00:13+00:00,2020-08-12 00:00:00+00:00,3


In [None]:
len(df)

3616

In [None]:
# Write the People & Blogs subset to disk using "|" as the field separator.
# NOTE(review): "|" is also the separator inside video_tags (see the sample rows in this
# notebook); anything reading this file must pass sep="|" and honour quoting — confirm consumers do.
df.to_csv("main_df.csv", sep="|")

# **3. NLP**

In [None]:
# 1. Copy df_eda5 to df and display info
df = df_eda5.copy(deep=True)
df.info()

## scratch

In [None]:
##### D3N1
# Count the bigrams that occur in video titles, ignoring English stopwords and
# tokens that are not purely alphabetic.

# Build the stopword set once, instead of calling stopwords.words() and scanning
# the resulting list for every single token.
english_stopwords = set(stopwords.words('english'))

tokens = []
bigrams = []
for title in df['video_title']:
    # Compare the lowercased word with the (lowercase) stopword list so that
    # capitalised stopwords such as "The" or "My" are removed as well.
    title_tokens = [
        word.lower()
        for word in nltk.word_tokenize(title)
        if word.isalpha() and word.lower() not in english_stopwords
    ]
    tokens.extend(title_tokens)
    # Pair words inside one title only, so that no bigram is made of the last word
    # of one title and the first word of the next, unrelated title.
    bigrams.extend(nltk.ngrams(title_tokens, 2))

bigram_counts = nltk.FreqDist(bigrams)

print("\nTop 30 bigrams by frequency\n")
for (word1, word2), freq in bigram_counts.most_common(30):
    print(f"{word1} {word2}: {freq}")

In [None]:
##### D2N1
# Horizontal bar chart of the 30 most frequent title bigrams, most frequent at the top.
top_30_bigrams = bigram_counts.most_common(30)

bigram_words = [' '.join(pair) for pair, _ in top_30_bigrams]
bigram_frequencies = [count for _, count in top_30_bigrams]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bigram_words, bigram_frequencies, color='skyblue')
ax.set_xlabel('Frequency')
ax.set_ylabel('Bigrams')
ax.set_title('People & Blogs: Top 30 Bigrams by Frequency')
# barh draws the first item at the bottom; flip the axis so it ends up on top
ax.invert_yaxis()
plt.show()

In [None]:
##### D2N1
# Word cloud of all title bigrams, sized by how often each bigram occurs.
bigram_dict = {' '.join(bigram): count
               for bigram, count in bigram_counts.items()}

# No stopwords argument: WordCloud ignores it when the cloud is built with
# generate_from_frequencies, and bigram_counts is built from stopword-filtered tokens.
wordcloud = WordCloud(width=800, height=800,
                      background_color='white',
                      min_font_size=10).generate_from_frequencies(bigram_dict)


plt.figure(figsize = (6, 6), facecolor = None)
plt.imshow(wordcloud)
# The cloud is built from every bigram, not only the 30 most frequent ones,
# so the title must not claim "Top 30".
plt.title('People & Blogs: Bigram Word Cloud')
plt.axis("off")
plt.tight_layout(pad = 0)
plt.show()

# How does engagement change over time? Does it have some kind of seasonality?

In [None]:
df.head(5)

In [None]:
# Views per video together with the date on which the video trended.
# .copy() gives df_ts its own data, so that re-indexing it and dropping columns in place
# later on does not operate on a slice of df (SettingWithCopyWarning).
df_ts = df[["trending_date", "views"]].copy()
df_ts.head()

In [None]:
# Use trending_date as the index (the column itself is removed), leaving only views.
df_ts = df_ts.set_index("trending_date")
df_ts

In [None]:
# Total views per calendar day; days whose total is not positive are discarded.
daily_views = df_ts.resample("D").sum()
df_ts = daily_views[daily_views["views"] > 0]
df_ts

In [None]:
df_ts.plot.line()

In [None]:
df_ts_desc = df_ts.sort_values("views", ascending=False)

In [None]:
df_ts_desc.head(20)

In [None]:
# Days whose total views exceed the peak threshold (30 million).
peak_views_threshold = 30_000_000
is_peak_day = df_ts_desc["views"] > peak_views_threshold
df_ts_peaks = df_ts_desc[is_peak_day]
df_ts_peaks

In [None]:
df_ts_peaks.index.date.tolist()

In [None]:
df["trending_date_only"] = df['trending_date'].dt.date
df.head(5)

In [None]:
# Videos whose trending date falls on one of the peak days.
peak_dates = df_ts_peaks.index.date.tolist()
on_peak_day = df["trending_date_only"].isin(peak_dates)
df_ts_peak_days = df[on_peak_day]
df_ts_peak_days

In [None]:
df_ts_peak_days["views"].value_counts()

In [None]:
# NOTE(review): despite its name, this list holds the distinct view counts (the index of
# value_counts()) of the videos that trended on peak days — not dates.
peak_trending_days = df_ts_peak_days["views"].value_counts().index.tolist()
peak_trending_days

In [None]:
# Average views of the videos that trended on peak days. Averaging the views column
# itself counts every video (including videos that share a view count) and gives NaN,
# rather than a ZeroDivisionError, when no day qualifies as a peak.
print(df_ts_peak_days["views"].mean())

# How do tags behave over time? Are there certain tags that are popular in certain periods?

In [None]:
# Trending date, views and tags of every video whose tag field is not the "[None]" placeholder.
has_tags = df["video_tags"] != "[None]"
df_ts_tags = df.loc[has_tags, ["trending_date", "views", "video_tags"]]
df_ts_tags

In [None]:
# Index the tagged videos by trending date so that they can be resampled per calendar day.
# set_index returns a new frame, so df_ts_tags itself is left untouched.
df_ts_tags_concat = df_ts_tags.set_index("trending_date")

# Per day: total views, and all tag strings joined with the "|" tag separator.
# The aggregation is named as the string "sum" because passing the builtin sum
# to agg is deprecated in recent pandas releases.
df_ts_tags_concat = df_ts_tags_concat.resample("D").agg({"views": "sum", "video_tags": '|'.join})
# Keep only the days with a positive view total
df_ts_tags_concat = df_ts_tags_concat[df_ts_tags_concat["views"] > 0]
df_ts_tags_concat

In [None]:
# Turn the "|" tag separators into spaces. regex=False forces a literal match on every
# pandas version; read as a regular expression, "|" is an alternation of two empty patterns.
df_ts_tags_concat["video_tags"] = df_ts_tags_concat["video_tags"].str.replace("|", " ", regex=False)
df_ts_tags_concat["video_tags"]

In [None]:
# Word cloud over the tags of all days. The daily strings are joined explicitly:
# str() of a NumPy array adds brackets and quotes and abbreviates long arrays
# with "...", which would silently drop most days from the cloud.
all_tags_text = " ".join(df_ts_tags_concat["video_tags"])
wordcloud = WordCloud().generate(all_tags_text)
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
df_ts_tags_concat.index = df_ts_tags_concat.index.date
df_ts_tags_concat

In [None]:
# One word cloud per day for the first days in the table. head(10) yields at most
# 10 rows, so the loop also works when fewer than 10 days are available.
for day, day_tags in df_ts_tags_concat["video_tags"].head(10).items():
    wordcloud = WordCloud().generate(day_tags)
    plt.imshow(wordcloud)
    # Label each cloud with its day so the clouds can be compared over time
    plt.title(str(day))
    plt.axis("off")
    plt.show()


# Create groups where we can pull out samples from for the LLM

In [None]:
# Tag strings of all videos that have tags, with the "|" separators turned into spaces.
# The result is a Series (one document per video), indexed like df.
df_lda = df[df["video_tags"] != "[None]"]
# regex=False forces a literal match on every pandas version; read as a regular
# expression, "|" is an alternation of two empty patterns.
df_lda = df_lda["video_tags"].str.replace("|", " ", regex=False)
df_lda

In [None]:
from nltk.tokenize import RegexpTokenizer

In [None]:
# Plain Python list of the tag documents, in the same order as df_lda.
documents_list = df_lda.tolist()
documents_list

In [None]:
len(documents_list)

In [None]:
# Initialize regex tokenizer (runs of word characters)
tokenizer = RegexpTokenizer(r'\w+')

# Vectorize document using TF-IDF
# token_pattern=None states explicitly that the custom tokenizer replaces the default
# token pattern, which avoids scikit-learn's "token_pattern will not be used" warning.
# NOTE(review): scikit-learn's LDA examples are fitted on raw term counts
# (CountVectorizer); confirm that TF-IDF weights are intended as LDA input here.
tfidf = TfidfVectorizer(lowercase=True,
                        stop_words='english',
                        ngram_range=(1, 1),
                        tokenizer=tokenizer.tokenize,
                        token_pattern=None)

# Fit and Transform the documents
train_data = tfidf.fit_transform(documents_list)

In [None]:
train_data

In [None]:
# Number of topics to extract
num_components=10

# Create LDA object
model=LatentDirichletAllocation(n_components=num_components, random_state=0)

# Fit the LDA model on the data, then transform the same data with the fitted model
model.fit(train_data)
lda_matrix = model.transform(train_data)

# Get Components (one row of term weights per topic)
lda_components=model.components_

In [None]:
# Print the topics with their terms
# For every topic, rank the vocabulary by the topic's weight and show the 7 heaviest terms.
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    ranked_terms = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _ in ranked_terms[:7]]
    print("Topic "+str(index)+": ",top_terms_list)

In [None]:
lda_matrix

In [None]:
lda_matrix_df = pd.DataFrame(lda_matrix)
lda_matrix_df

In [None]:
# Give every document the topic with the highest weight in its row. The value is
# read from the lda_matrix array rather than from lda_matrix_df so that the cell can be
# re-run safely: once the "Topic" column exists, a row-wise idxmax over the frame
# would compare that label column with the topic weights.
lda_matrix_df["Topic"] = lda_matrix.argmax(axis=1)
lda_matrix_df

In [None]:
lda_matrix_df["Topic"].value_counts()

In [None]:
model.score(train_data)

In [None]:
model.perplexity(train_data)

In [None]:
# Compare several topic counts by score and perplexity on the training data.
n_topics = [2, 3, 4, 5, 10, 15, 20, 25]
for n in n_topics:
    # A separate name is used so the already fitted `model` is not overwritten by
    # the last candidate of this sweep.
    candidate = LatentDirichletAllocation(n_components=n, random_state=0)

    # Fit the LDA model on the data
    candidate.fit(train_data)
    print(f"n_components={n}: "
          f"score={candidate.score(train_data)}, "
          f"perplexity={candidate.perplexity(train_data)}")

### Optimal n_components

In [None]:
# Final LDA run with the chosen number of topics.
num_components = 10

# Build the LDA model, fit it, and transform the same data with the fitted model
model = LatentDirichletAllocation(n_components=num_components, random_state=0)
lda_matrix = model.fit(train_data).transform(train_data)

# One row of term weights per topic
lda_components = model.components_

# Show the 7 heaviest terms of every topic
terms = tfidf.get_feature_names_out()

for index, component in enumerate(lda_components):
    ranked_terms = sorted(zip(terms, component), key=lambda pair: pair[1], reverse=True)
    top_terms_list = [term for term, _ in ranked_terms[:7]]
    print("Topic "+str(index)+": ",top_terms_list)

# Per-document topic weights plus the label of the heaviest topic
lda_matrix_df = pd.DataFrame(lda_matrix)
lda_matrix_df["Tag Topic"] = lda_matrix_df.idxmax(axis=1)

In [None]:
lda_matrix_df.index = df_lda.index

In [None]:
len(df_lda)

In [None]:
df_lda.head()

In [None]:
len(lda_matrix_df)

In [None]:
lda_matrix_df.head()

In [None]:
# df_lda is a Series of tag strings. Convert a copy of it into a one-column DataFrame
# so that a topic column can be added next to the tags; item assignment on a Series
# addresses a row label, not a column.
lda_tag_topics = df_lda.copy().to_frame(name="video_tags")
lda_tag_topics

In [None]:
lda_tag_topics["Tag Topic"] = lda_matrix_df["Tag Topic"].values

In [None]:
len(lda_tag_topics)

In [None]:
lda_tag_topics.head()

In [None]:
lda_tag_topics.to_csv("lda_tag_topics.csv")

In [None]:
lda_matrix_df.to_csv("lda_tag_matrix.csv")

### LDA for title