# In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# 1. Load the data
# NOTE: this is an absolute, machine-specific path; adjust it when running
# the script anywhere else.
file_name = "G:\\infosys_internship\\yt_embedding\\cleaned_youtube_details.csv"
df = pd.read_csv(file_name)

# 2. Prepare and Combine Text
# Fill missing transcripts (NaN) with an empty string to prevent errors
df['transcript'] = df['transcript'].fillna('')

# Combine 'title' and 'transcript' into a new column 'combined_text'.
# A [SEP] (separator) token is added for clarity between the two text fields.
# Missing titles are filled with '' *before* the str conversion: otherwise
# `astype(str)` turns NaN into the literal text "nan", which TF-IDF would then
# treat as a genuine word. The 'title' column itself is left untouched.
df['combined_text'] = (
    df['title'].fillna('').astype(str)
    + " [SEP] "
    + df['transcript'].astype(str)
)

# 3. Generate Embeddings using TF-IDF and Truncated SVD
# Step 3a: Build the TF-IDF matrix
# English stop words are dropped and the vocabulary is capped at 5000 terms
# (max_features) to keep the matrix small.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['combined_text'])

# Step 3b: Reduce Dimensionality using Truncated SVD
# The target embedding size is 100 dimensions. TruncatedSVD rejects an
# n_components larger than the number of input features, so on a small corpus
# (fewer than 100 vocabulary terms) the size is clamped to the vocabulary size
# instead of raising ValueError. For larger vocabularies this is exactly 100.
n_components = min(100, tfidf_matrix.shape[1])
# random_state is fixed so repeated runs produce the same embeddings.
svd_model = TruncatedSVD(n_components=n_components, random_state=42)
embedding_matrix = svd_model.fit_transform(tfidf_matrix)

# 4. Save the Embedding as a New Column
# Store one plain Python list of floats per row. `ndarray.tolist()` is used
# rather than `list(vec)` because the latter keeps NumPy scalar objects; with
# NumPy >= 2 those are rendered as "np.float64(...)" when the list is written
# to CSV, which makes the column impossible to parse back with json or
# ast.literal_eval.
df['text_embedding'] = [vec.tolist() for vec in embedding_matrix]

# 5. Save the Updated DataFrame to CSV
# index=False keeps the DataFrame's row index out of the output file.
# NOTE: absolute, machine-specific output path.
output_file_name = "G:\\infosys_internship\\yt_embedding\\youtube_details_with_embeddings.csv"
df.to_csv(output_file_name, index=False)