In [31]:

# Import necessary libraries
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from data_preparation import load_and_clean_data
from labeling_and_prompt import create_labels
from feature_extraction import extract_features_bert
from model_training import train_advanced_model, evaluate_model

# Set device
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load and clean data from multiple files
file_paths = [
    r"C:\Users\nates\Youtube-app\backend\scraper\output\24.28.06_GB_videos.csv",
    r"C:\Users\nates\Youtube-app\backend\scraper\output\24.28.06_US_videos.csv"
]

data = pd.concat([load_and_clean_data(file_path) for file_path in file_paths], ignore_index=True)

# Display the first few rows of the data to verify it loaded correctly
print(data.head())


      video_id                  comment_id                    comment_text  \
0  hbcGx4MGUMg  Ugy9ifkobHcfAlzB63J4AaABAg               Lets go M BLNKSSS   
1  hbcGx4MGUMg  UgxLAtkHCtZkWVWtvll4AaABAg  The beat is giving descendants   
2  hbcGx4MGUMg  UgxmRwAsVRctJFCqoEh4AaABAg                   Im a Rockstar   
3  hbcGx4MGUMg  UgwVvwl16UV5bx6sFzp4AaABAg        she devoured this i fear   
4  hbcGx4MGUMg  Ugy0VgSzMFUnjQ2Bmgh4AaABAg                                   

               author          comment_date  \
0  @GuluMelkova-vb1jw  2024-06-28T22:03:56Z   
1     @Hope-MariePyne  2024-06-28T22:03:53Z   
2    @Mariavargas-z7h  2024-06-28T22:03:53Z   
3       @sunnyskies..  2024-06-28T22:03:49Z   
4          @edand6474  2024-06-28T22:03:47Z   

                                    title           publishedAt  \
0  LISA - ROCKSTAR (Official Music Video)  2024-06-28T00:00:07Z   
1  LISA - ROCKSTAR (Official Music Video)  2024-06-28T00:00:07Z   
2  LISA - ROCKSTAR (Official Music Video)  2024

In [32]:
# Create labels
data = create_labels(data)

# Display the first few rows to check labels
data.head()



Unnamed: 0,video_id,comment_id,comment_text,author,comment_date,title,publishedAt,channelId,channelTitle,categoryId,...,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description,label
0,hbcGx4MGUMg,Ugy9ifkobHcfAlzB63J4AaABAg,Lets go M BLNKSSS,@GuluMelkova-vb1jw,2024-06-28T22:03:56Z,LISA - ROCKSTAR (Official Music Video),2024-06-28T00:00:07Z,UC6-BgjsBa5R3PZQ_kZ8hKPg,LLOUD Official,22,...,Blackpink|Lisa|Music|Fashion|K-Pop|kpop|LLoud|...,31662719,3400959,0,316096,https://i.ytimg.com/vi/hbcGx4MGUMg/hqdefault.jpg,False,False,Stream LISA's single 'Rockstar' now: http://LI...,2
1,hbcGx4MGUMg,UgxLAtkHCtZkWVWtvll4AaABAg,The beat is giving descendants,@Hope-MariePyne,2024-06-28T22:03:53Z,LISA - ROCKSTAR (Official Music Video),2024-06-28T00:00:07Z,UC6-BgjsBa5R3PZQ_kZ8hKPg,LLOUD Official,22,...,Blackpink|Lisa|Music|Fashion|K-Pop|kpop|LLoud|...,31662719,3400959,0,316096,https://i.ytimg.com/vi/hbcGx4MGUMg/hqdefault.jpg,False,False,Stream LISA's single 'Rockstar' now: http://LI...,2
2,hbcGx4MGUMg,UgxmRwAsVRctJFCqoEh4AaABAg,Im a Rockstar,@Mariavargas-z7h,2024-06-28T22:03:53Z,LISA - ROCKSTAR (Official Music Video),2024-06-28T00:00:07Z,UC6-BgjsBa5R3PZQ_kZ8hKPg,LLOUD Official,22,...,Blackpink|Lisa|Music|Fashion|K-Pop|kpop|LLoud|...,31662719,3400959,0,316096,https://i.ytimg.com/vi/hbcGx4MGUMg/hqdefault.jpg,False,False,Stream LISA's single 'Rockstar' now: http://LI...,2
3,hbcGx4MGUMg,UgwVvwl16UV5bx6sFzp4AaABAg,she devoured this i fear,@sunnyskies..,2024-06-28T22:03:49Z,LISA - ROCKSTAR (Official Music Video),2024-06-28T00:00:07Z,UC6-BgjsBa5R3PZQ_kZ8hKPg,LLOUD Official,22,...,Blackpink|Lisa|Music|Fashion|K-Pop|kpop|LLoud|...,31662719,3400959,0,316096,https://i.ytimg.com/vi/hbcGx4MGUMg/hqdefault.jpg,False,False,Stream LISA's single 'Rockstar' now: http://LI...,2
4,hbcGx4MGUMg,Ugy0VgSzMFUnjQ2Bmgh4AaABAg,,@edand6474,2024-06-28T22:03:47Z,LISA - ROCKSTAR (Official Music Video),2024-06-28T00:00:07Z,UC6-BgjsBa5R3PZQ_kZ8hKPg,LLOUD Official,22,...,Blackpink|Lisa|Music|Fashion|K-Pop|kpop|LLoud|...,31662719,3400959,0,316096,https://i.ytimg.com/vi/hbcGx4MGUMg/hqdefault.jpg,False,False,Stream LISA's single 'Rockstar' now: http://LI...,2


In [33]:
# Split data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Extract features for training data
X_train = extract_features_bert(train_data, device=device)
y_train = torch.tensor(train_data['label'].values)

# Extract features for testing data
X_test = extract_features_bert(test_data, device=device)
y_test = torch.tensor(test_data['label'].values)

In [34]:
# Verify the shapes of the extracted features
print("Shapes before split:")
print("X_train shape:", X_train.shape)
print("train_masks shape:", train_masks.shape)
print("y_train shape:", y_train.shape)
print("X_test shape:", X_test.shape)
print("test_masks shape:", test_masks.shape)
print("y_test shape:", y_test.shape)

Shapes before split:


AttributeError: 'tuple' object has no attribute 'shape'

In [None]:
# Further split train data into train and validation sets
X_train, X_val, train_masks, val_masks, y_train, y_val = train_test_split(
    X_train, train_masks, y_train, test_size=0.1, random_state=42
)

# Verify the shapes of the train and validation sets
print("Shapes after split:")
print("X_train shape:", X_train.shape)
print("train_masks shape:", train_masks.shape)
print("y_train shape:", y_train.shape)
print("X_val shape:", X_val.shape)
print("val_masks shape:", val_masks.shape)
print("y_val shape:", y_val.shape)

# Train model

In [None]:
## Train model
model = train_advanced_model(X_train, y_train, X_val, y_val, device=device)

In [None]:
# Evaluate model
evaluate_model(model, (torch.tensor(X_test[0]), torch.tensor(X_test[1])), y_test, device=device)
