In [3]:
import os
import sys
import pandas as pd

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
if project_root not in sys.path:
    sys.path.append(project_root)

from src.youtube_api import YouTubeAPI
from src.data_preprocessor import DataPreprocessor

def get_video_ids():
    video_ids = input("Enter YouTube video IDs separated by commas: ")
    return [vid.strip() for vid in video_ids.split(',')]

video_ids = get_video_ids()
yt_api = YouTubeAPI()
videos_df = yt_api.get_video_details(video_ids)

print("Fetched Data:")
print(videos_df.head())

preprocessor = DataPreprocessor(videos_df)
cleaned_data = preprocessor.clean_data()

feature_data = preprocessor.extract_features()

print("Processed Data:")
print(feature_data.head())

output_path = '../data/processed_videos.csv'
preprocessor.save_data(output_path)

print(f"Processed data saved to {output_path}")


Enter YouTube video IDs separated by commas: cvQEZJtfNDE , 0fStWP79Z5A
Fetched Data:
      video_id                         title  \
0  cvQEZJtfNDE  Saali Left College For Pizza   
1  0fStWP79Z5A  Gryffin (DJ Set) - Kauai, HI   

                                         description  \
0  Klaudia is here, and today Sylwia decided to m...   
1  Listen to my new song "Dreams" here! https://g...   

                                                tags category_id  view_count  \
0  [Indian, food vlogs, family vlogs, family, dai...          22        4647   
1                                                 []          10     1267484   

   like_count  comment_count  
0         763             55  
1       19349            843  
Processed Data:
      video_id                         title  \
0  cvQEZJtfNDE  Saali Left College For Pizza   
1  0fStWP79Z5A  Gryffin (DJ Set) - Kauai, HI   

                                         description  \
0  Klaudia is here, and today Sylwia decided to m.

In [4]:
# notebooks/data_preprocessing.ipynb

# Import necessary libraries
import os
import sys
import pandas as pd

# Add the parent directory to the system path to import modules from src
sys.path.append(os.path.abspath(os.path.join('..')))

from src.youtube_api import YouTubeAPI
from src.data_preprocessor import DataPreprocessor

# Function to take video IDs dynamically
def get_video_ids():
    video_ids = input("Enter YouTube video IDs separated by commas: ")
    return [vid.strip() for vid in video_ids.split(',')]

# Initialize YouTube API and fetch video details
video_ids = get_video_ids()  # Get video IDs dynamically from user input
yt_api = YouTubeAPI()
videos_df = yt_api.get_video_details(video_ids)

# Display fetched data
print("Fetched Data:")
print(videos_df.head())

# Initialize DataPreprocessor and clean data
preprocessor = DataPreprocessor(videos_df)
cleaned_data = preprocessor.clean_data()

# Transform text data (apply TF-IDF)
transformed_data = preprocessor.transform_text_data()

# Extract other features
feature_data = preprocessor.extract_features()

# Display processed data
print("Processed Data:")
print(transformed_data.head())

# Save processed data to a CSV file
output_path = '../data/processed_videos.csv'
preprocessor.save_data(output_path)

print(f"Processed data saved to {output_path}")


Enter YouTube video IDs separated by commas: khL1Gkp6vnM, kj14Tk8UZRc, ukzFI9rgwfU, GwIo3gDZCVQ, LvC68w9JS4Y, bmmQA8A-yUA, i_LwzRVP7bg, GwIo3gDZCVQ&t=93s, i_LwzRVP7bg&t=25s, LvC68w9JS4Y&t=34s, 7IgVGSaQPaw, 7eh4d6sabA0, 4RixMPF4xis
Fetched Data:
      video_id                                              title  \
0  khL1Gkp6vnM  3rd ODI | Hindi | Highlights | India Tour Of S...   
1  kj14Tk8UZRc  Aan-Men At Work (HD) | Bollywood Action Movie ...   
2  ukzFI9rgwfU  Machine Learning | What Is Machine Learning? |...   
3  GwIo3gDZCVQ  Machine Learning Full Course - Learn Machine L...   
4  LvC68w9JS4Y  Machine Learning FULL Course with Practical (1...   

                                         description  \
0  Click here to subscribe to Sony LIV Channel: h...   
1  DCP Hari Om Patnaik, along with his reformed f...   
2  ðŸ”¥Professional Certificate Course In AI And Mac...   
3  ðŸ”¥ Machine Learning Engineer Masters Program (U...   
4  Machine Learning Full Course for Beginners (20...  

In [7]:
# notebooks/data_preprocessing.ipynb

import pandas as pd
from src.youtube_api import YouTubeAPI
from src.data_preprocessor import DataPreprocessor

# Function to take video IDs dynamically
def get_video_ids():
    video_ids = input("Enter YouTube video IDs separated by commas: ")
    return [vid.strip() for vid in video_ids.split(',')]

# Initialize YouTube API and fetch video details
video_ids = get_video_ids()  # Get video IDs dynamically from user input
yt_api = YouTubeAPI()
videos_df = yt_api.get_video_details(video_ids)

# Display fetched data
print("Fetched Data:")
print(videos_df.head())
print("Columns in DataFrame:", videos_df.columns)

# Initialize DataPreprocessor and clean data
preprocessor = DataPreprocessor(videos_df)
cleaned_data = preprocessor.clean_data()

# Check if 'description' column exists and print its data type
print("Data types before TF-IDF transformation:")
print(cleaned_data.dtypes)

# Process text data (TF-IDF transformation)
try:
    transformed_data = preprocessor.transform_text_data()
    print("TF-IDF transformation successful.")
except KeyError as e:
    print(f"KeyError: {e} - Check if 'description' column is correctly referenced.")
    print("Available columns:", cleaned_data.columns)

# Extract relevant numeric features
feature_data = preprocessor.extract_features()

# Check data types to confirm that no strings remain where floats/ints are expected
print("Data types after feature extraction:")
print(feature_data.dtypes)


Enter YouTube video IDs separated by commas: khL1Gkp6vnM, kj14Tk8UZRc, ukzFI9rgwfU, GwIo3gDZCVQ, LvC68w9JS4Y, bmmQA8A-yUA, i_LwzRVP7bg, GwIo3gDZCVQ&t=93s, i_LwzRVP7bg&t=25s, LvC68w9JS4Y&t=34s, 7IgVGSaQPaw, 7eh4d6sabA0, 4RixMPF4xis, dcXqhMqhZUo
Fetched Data:
      video_id                                              title  \
0  khL1Gkp6vnM  3rd ODI | Hindi | Highlights | India Tour Of S...   
1  kj14Tk8UZRc  Aan-Men At Work (HD) | Bollywood Action Movie ...   
2  ukzFI9rgwfU  Machine Learning | What Is Machine Learning? |...   
3  GwIo3gDZCVQ  Machine Learning Full Course - Learn Machine L...   
4  LvC68w9JS4Y  Machine Learning FULL Course with Practical (1...   

                                         description  \
0  Click here to subscribe to Sony LIV Channel: h...   
1  DCP Hari Om Patnaik, along with his reformed f...   
2  ðŸ”¥Professional Certificate Course In AI And Mac...   
3  ðŸ”¥ Machine Learning Engineer Masters Program (U...   
4  Machine Learning Full Course for Begin

In [8]:
print("Columns in DataFrame:")
print(self.videos_df.columns)


Columns in DataFrame:


NameError: name 'self' is not defined