<a href="https://colab.research.google.com/github/Abdulxmannan/Machine-learning/blob/main/senti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install youtube-transcript-api google-api-python-client pandas
!pip install datasets

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl.metadata (15 kB)
Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2
Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (fr

In [18]:


import csv
import os
import re

import pandas as pd
from googleapiclient.discovery import build
from youtube_transcript_api import YouTubeTranscriptApi

# YouTube Data API key, read from the YOUTUBE_API_KEY environment variable so
# that the credential is never stored in (or committed with) the notebook.
# Set it before running this cell, e.g. from a secrets manager or with
# os.environ["YOUTUBE_API_KEY"] = getpass.getpass("API key: ").
# NOTE(review): a literal key used to be hard-coded here; since it was
# published with the notebook it should be revoked and replaced.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')

# Regexes tried in order; the first capture group of the first match is the id.
_VIDEO_ID_PATTERNS = (
    r'(?:^|/)shorts/([a-zA-Z0-9_-]+)',   # https://www.youtube.com/shorts/<id>
    r'(?:^|[?&])v=([a-zA-Z0-9_-]+)',     # ...watch?v=<id> (anchored to a real query parameter)
    r'youtu\.be/([a-zA-Z0-9_-]+)',       # https://youtu.be/<id>
    r'/embed/([a-zA-Z0-9_-]+)',          # https://www.youtube.com/embed/<id>
    r'/live/([a-zA-Z0-9_-]+)',           # https://www.youtube.com/live/<id>
)


def get_video_id(url):
    """Extract the YouTube video id from a video URL.

    Supports shorts, regular ``watch?v=`` links, ``youtu.be`` short links and
    ``/embed/`` / ``/live/`` paths.

    The URL shape is decided by regex match rather than by a substring test,
    so a watch URL that merely contains the word "shorts" somewhere (for
    example in another query parameter) is still resolved through ``v=``.
    The ``v=`` pattern is anchored to ``?``/``&`` so it cannot match inside a
    different parameter name such as ``rev=``.

    Args:
        url: The video URL as a string.

    Returns:
        The video id string, or None when ``url`` is not a string or no
        supported pattern matches.
    """
    if not isinstance(url, str):
        return None

    for pattern in _VIDEO_ID_PATTERNS:
        match = re.search(pattern, url)
        if match:
            return match.group(1)
    return None

def get_transcript(video_id):
    """Fetch a video's transcript and flatten it into a single string.

    Best-effort: any exception raised while fetching or joining the transcript
    is reported on stdout and turned into a None result instead of propagating.

    Args:
        video_id: The YouTube video id to look up.

    Returns:
        The 'text' fields of all transcript entries joined by single spaces,
        or None when the transcript could not be retrieved.
    """
    try:
        segments = YouTubeTranscriptApi.get_transcript(video_id)
        pieces = (segment['text'] for segment in segments)
        # Joined inside the try so a malformed entry is handled like a fetch failure.
        return " ".join(pieces)
    except Exception as err:
        print(f"Could not retrieve transcript for {video_id}: {err}")
        return None

def get_video_metadata(video_id):
    """Look up a video's title and description via the YouTube Data API v3.

    A new API client is built on every call using the module-level API_KEY,
    then the video's "snippet" part is requested.

    Args:
        video_id: The YouTube video id to look up.

    Returns:
        A ``(title, description)`` tuple taken from the first returned item,
        or ``(None, None)`` when the response contains no items.
    """
    client = build('youtube', 'v3', developerKey=API_KEY)
    response = client.videos().list(part="snippet", id=video_id).execute()

    items = response['items']
    if not items:
        return None, None

    details = items[0]['snippet']
    return details['title'], details['description']

def _build_video_record(url):
    """Return the record dict for one URL, or None when the video is skipped."""
    video_id = get_video_id(url)
    if not video_id:
        print(f"Invalid URL: {url}")
        return None

    transcript = get_transcript(video_id)
    if not transcript:
        # Nothing is printed here; get_transcript reports its own failures.
        return None

    title, description = get_video_metadata(video_id)
    if not (title and description):
        print(f"Could not retrieve metadata for {video_id}")
        return None

    return dict(
        video_id=video_id,
        url=url,
        title=title,
        description=description,
        transcript=transcript,
    )


def download_video_data(urls):
    """Collect id, URL, title, description and transcript for each video URL.

    URLs are processed in order. A URL is skipped when no video id can be
    extracted from it, when its transcript is missing or empty, or when its
    title or description is missing or empty.

    Args:
        urls: Iterable of YouTube video URLs.

    Returns:
        A list with one dict per successfully processed video, with the keys
        "video_id", "url", "title", "description" and "transcript".
    """
    candidates = (_build_video_record(url) for url in urls)
    return [record for record in candidates if record is not None]

def save_data_to_csv(data, filename):
    """Write a list of record dicts to a UTF-8 CSV file with a header row.

    The column order is taken from the keys of the first record. When `data`
    is empty a message is printed and no file is written.

    Args:
        data: List of dicts, one per row.
        filename: Path of the CSV file to create (overwritten if it exists).
    """
    if data:
        columns = list(data[0].keys())
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=columns)
            writer.writeheader()
            for row in data:
                writer.writerow(row)
    else:
        print("No data to save.")

# Driver: scrape every listed video and store the results in 'video_data.csv'.
# `urls` and `data` are created at notebook (module) scope by this cell.
if __name__ == "__main__":
    # Videos to scrape; both /shorts/ and watch?v= links appear in this list.
    urls = [
        'https://www.youtube.com/shorts/pMkew_jYT7I',
        'https://www.youtube.com/watch?v=_1OfB3DGwpA',
        'https://www.youtube.com/watch?v=4SNThp0YiU4',
        'https://www.youtube.com/watch?v=uq_EsHtmfy4',
        # Add more video URLs here
    ]

    # Videos that fail any step are skipped, so `data` may hold fewer rows than `urls`.
    data = download_video_data(urls)
    save_data_to_csv(data, 'video_data.csv')


In [19]:
def preprocess_text(text):
    """Normalize text: lowercase it and reduce non-word characters to spaces.

    Every non-word character (anything outside letters, digits and
    underscore) becomes a space, then each run of whitespace is collapsed to
    a single space. Leading/trailing spaces produced this way are kept.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    cleaned = text.lower()
    # Applied in order: blank out symbols first, then squeeze the whitespace.
    for pattern in (r'\W', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned

In [20]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Fixed seed so the train/test split is the same on every run.
SEED = 42
LABELED_CSV = 'labeled_video_data.csv'

# Load labeled data. The scraping cell writes 'video_data.csv'; this cell needs
# a CSV with 'title', 'description', 'transcript' and 'label' columns
# (presumably the scraped file with labels added by hand -- confirm).
try:
    data = pd.read_csv(LABELED_CSV)
except FileNotFoundError as exc:
    raise FileNotFoundError(
        f"{LABELED_CSV} not found: create it with 'title', 'description', "
        "'transcript' and 'label' (positive/neutral/negative) columns before training."
    ) from exc

# Combine title, description, and transcript into one text field.
# Empty CSV cells are read as NaN and NaN + str is NaN, which would make the
# whole text NaN and break tokenization, so blank them out (and force str) first.
text_columns = data[['title', 'description', 'transcript']].fillna('').astype(str)
data['text'] = text_columns['title'] + " " + text_columns['description'] + " " + text_columns['transcript']

# Map sentiment labels to numerical values
label_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
data['label'] = data['label'].map(label_mapping)

# Drop rows whose label is missing or is not one of the three known values
data = data.dropna(subset=['label'])

# Keep only 'text' and 'label'. The mapped column is float whenever any label
# was unmapped, so cast back to int; reset the index so the filtered index is
# not carried into the Dataset as an extra column.
data = data[['text', 'label']].astype({'label': int}).reset_index(drop=True)

if data.empty:
    raise ValueError(
        f"No usable rows in {LABELED_CSV}: labels must be one of {sorted(label_mapping)}."
    )

# Debugging: Check the original dataset size and content
print(f"Original dataset size: {len(data)}")
print(data.head())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize a batch of examples, padding/truncating every text to 512 tokens
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)

# Tokenize the dataset
encoded_dataset = dataset.map(preprocess_function, batched=True)

# Debugging: Check dataset size after preprocessing
print(f"Encoded dataset size: {len(encoded_dataset)}")
print(encoded_dataset)

# Copy each example's 'label' into the 'labels' field
def convert_labels_to_tensor(example):
    return {'labels': torch.tensor(example['label'], dtype=torch.long)}

# Add the 'labels' column
encoded_dataset = encoded_dataset.map(convert_labels_to_tensor)

# Debugging: Check dataset content after converting labels
print(f"Dataset after converting labels: {encoded_dataset}")

# Split into train and test datasets (seeded for reproducibility)
train_test_split = encoded_dataset.train_test_split(test_size=0.2, seed=SEED)
train_dataset = train_test_split['train']
test_dataset = train_test_split['test']

# Debugging: Check dataset sizes after splitting
print(f"Train dataset size: {len(train_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# Spot-check a few training samples. Only a text preview and the label are
# shown; printing whole examples would dump 512-element token arrays.
for i in range(min(5, len(train_dataset))):
    sample = train_dataset[i]
    print({'text': sample['text'][:80], 'labels': sample['labels']})

# Load pre-trained model with a 3-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",  # Use 'eval_strategy' instead of 'evaluation_strategy'
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()


FileNotFoundError: [Errno 2] No such file or directory: 'labeled_video_data.csv'

In [None]:
def predict_sentiment(video_url):
    """Predict the sentiment of a YouTube video with the fine-tuned model.

    The video's title, description and transcript are fetched and joined in
    that order with single spaces -- the same raw-text layout the training
    cell builds -- so the model sees inference input shaped like its training
    input (the text is truncated to 512 tokens, so the order matters).

    Args:
        video_url: URL of the YouTube video.

    Returns:
        One of 'negative', 'neutral' or 'positive'.

    Raises:
        ValueError: If no video id can be extracted from `video_url`, or no
            transcript is available for the video.
    """
    # The helper functions work on video ids, not URLs.
    video_id = get_video_id(video_url)
    if not video_id:
        raise ValueError(f"Invalid URL: {video_url}")

    transcript = get_transcript(video_id)
    if not transcript:
        raise ValueError(f"No transcript available for {video_id}")

    # Metadata lookup yields (None, None) when the video is not found;
    # fall back to empty strings so the text can still be assembled.
    title, description = get_video_metadata(video_id)
    text = " ".join([title or "", description or "", transcript])

    # Tokenize and encode the text
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding='max_length', max_length=512)

    # After training the model may live on a GPU; inputs must be on the same device.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Inference only: disable dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=1)

    # Inverse of the label mapping used for training
    sentiment_labels = {0: 'negative', 1: 'neutral', 2: 'positive'}
    return sentiment_labels[predictions.item()]


In [None]:
# Example usage. The URL below is a placeholder: replace "your_video_id" with
# a real video id before running. This cell also relies on the names defined by
# the earlier cells (helper functions, tokenizer, model), so run those first.
video_url = "https://www.youtube.com/watch?v=your_video_id"
sentiment = predict_sentiment(video_url)
print(f"The sentiment of the video is: {sentiment}")