In [None]:
# computer
pip install python-dotenv

api_key = os.getenv('YOUTUBE_API_KEY')


In [None]:
# for colab
# Reads the value stored under the name 'YOUTUBE_API_KEY' through
# google.colab's `userdata` helper.
# NOTE: this rebinds the same `api_key` name that the local ("computer") cell
# assigns, so run only the cell that matches your environment.

from google.colab import userdata
api_key = userdata.get('YOUTUBE_API_KEY')

In [None]:
pip install google-api-python-client
pip install transformers
pip install matplotlib
pip install wordcloud
pip install seaborn
pip install nltk
pip install isodate

In [None]:
import os
import re
import json
import datetime
from collections import Counter, defaultdict
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import pandas as pd
from dotenv import load_dotenv
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from googleapiclient.discovery import build
import isodate
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources used below: tokenizer models and stop-word lists.
# Recent NLTK releases load the `punkt_tab` resource inside word_tokenize,
# while older releases load `punkt`, so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load environment variables from a .env file, if one is present
load_dotenv()

# Models for sentiment and emotion classification.
# NOTE: the emotion pipeline is built with return_all_scores=True; the comment
# analysis code indexes its result with [0] to get the per-label score list.
sentiment_analyzer = pipeline("sentiment-analysis", model="cardiffnlp/twitter-roberta-base-sentiment")
emotion_analyzer = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Token budget per comment; used as the default limit of truncate_text
MAX_TOKENS = 512

def get_youtube_service():
    """Create and return a YouTube Data API v3 client.

    The client is built with the module-level ``api_key``.

    Raises:
        ValueError: If ``api_key`` is empty or ``None``.
    """
    if api_key:
        return build('youtube', 'v3', developerKey=api_key)
    raise ValueError("Error: YouTube API key not found in environment variables")

def extract_video_id(youtube_url):
    """Extract the 11-character video ID from a YouTube URL.

    Recognised inputs:
      * watch URLs carrying the ID in a ``v=`` query parameter, wherever that
        parameter appears in the query string;
      * short links of the form ``youtu.be/<id>``;
      * path-style links: ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>``
        and ``/v/<id>``;
      * a bare 11-character ID.

    Leading and trailing whitespace is ignored.

    Args:
        youtube_url: The URL (or bare ID) typed by the user.

    Returns:
        The 11-character video ID.

    Raises:
        ValueError: If no video ID can be located in the input.
    """
    candidate = youtube_url.strip()

    # Each pattern is anchored to a marker that really introduces a video ID.
    # Matching "any slash followed by 11 id characters" would also accept
    # channel IDs, playlist paths and unrelated URLs. The negative lookahead
    # rejects longer tokens instead of silently cutting them to 11 characters.
    regex_patterns = [
        r'^([0-9A-Za-z_-]{11})$',
        r'[?&]v=([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
        r'youtu\.be\/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
        r'\/(?:embed|shorts|live|v)\/([0-9A-Za-z_-]{11})(?![0-9A-Za-z_-])',
    ]
    for pattern in regex_patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    raise ValueError("Geçersiz YouTube URL'si. Lütfen doğru bir URL girin.")

def get_video_metadata(video_id, youtube):
    """Fetch an extended metadata record for one video.

    Args:
        video_id: ID of the video to look up.
        youtube: API client exposing ``videos().list(...).execute()``.

    Returns:
        A dict with title, description, tags, channel details, publish date,
        thumbnail URL, view/like/comment counts, duration in seconds, status
        flags, category, topic categories, live start time and default
        language. Optional fields fall back to ``[]``, ``0``, ``False`` or
        ``None`` when the API response omits them.

    Raises:
        ValueError: If the response contains no items for ``video_id``.
    """
    request = youtube.videos().list(
        part="snippet,contentDetails,statistics,status,topicDetails,liveStreamingDetails",
        id=video_id
    )
    response = request.execute()

    items = response['items']
    if not items:
        raise ValueError("Bu ID ile eşleşen video bulunamadı.")

    item = items[0]

    # The duration arrives as an ISO 8601 string; convert it to seconds.
    content = item['contentDetails']
    duration = isodate.parse_duration(content['duration']).total_seconds()

    # Fields are added in a fixed order so the resulting dict (and the JSON
    # written from it) keeps the same key order.
    snippet = item['snippet']
    metadata = {
        'video_id': video_id,
        'title': snippet['title'],
        'description': snippet['description'],
        'tags': snippet.get('tags', []),
        'channel_title': snippet['channelTitle'],
        'channel_id': snippet['channelId'],
        'publish_date': snippet['publishedAt'],
        'thumbnail_url': snippet['thumbnails']['high']['url'],
    }

    # Counters are converted with int(); a missing counter becomes 0.
    stats = item['statistics']
    for field, source_key in (
        ('view_count', 'viewCount'),
        ('like_count', 'likeCount'),
        ('comment_count', 'commentCount'),
    ):
        metadata[field] = int(stats.get(source_key, 0))

    metadata['duration_seconds'] = duration
    metadata['licensed_content'] = content.get('licensedContent', False)

    status = item['status']
    metadata['privacy_status'] = status['privacyStatus']
    metadata['embeddable'] = status['embeddable']
    metadata['public_stats_viewable'] = status['publicStatsViewable']

    metadata['category_id'] = snippet.get('categoryId', None)
    metadata['topic_categories'] = item.get('topicDetails', {}).get('topicCategories', [])
    metadata['live_broadcast'] = item.get('liveStreamingDetails', {}).get('actualStartTime', None)
    metadata['default_language'] = snippet.get('defaultLanguage')
    return metadata

# Tokenizers are expensive to construct, so each one is built once and reused
# across calls instead of being reloaded for every comment.
_TOKENIZER_CACHE = {}

def _get_tokenizer(model_name):
    """Return the tokenizer for *model_name*, loading it on first use only."""
    tokenizer = _TOKENIZER_CACHE.get(model_name)
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        _TOKENIZER_CACHE[model_name] = tokenizer
    return tokenizer

def truncate_text(text, max_tokens=MAX_TOKENS):
    """Shorten *text* so it encodes to at most *max_tokens* tokens.

    The text is encoded with the sentiment model's tokenizer, with truncation
    enabled, and decoded back to a string with the special tokens removed.

    Args:
        text: The comment text to shorten.
        max_tokens: Maximum length passed to the tokenizer as ``max_length``.

    Returns:
        The decoded, possibly truncated text.
    """
    tokenizer = _get_tokenizer("cardiffnlp/twitter-roberta-base-sentiment")
    tokens = tokenizer.encode(text, truncation=True, max_length=max_tokens)
    return tokenizer.decode(tokens, skip_special_tokens=True)

def preprocess_text(text):
    """Clean a comment before it is analysed.

    URLs, ``@mentions`` and ``#hashtags`` are deleted, every run of
    whitespace is collapsed to a single space, and the result is stripped.

    Args:
        text: Raw comment text.

    Returns:
        The cleaned text.
    """
    removal_patterns = (
        r'http\S+',  # URLs
        r'@\w+',     # user mentions
        r'#\w+',     # hashtags
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)

    # Collapse whitespace last: the removals above leave gaps behind.
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def _parse_published_at(timestamp):
    """Parse a ``publishedAt`` string into a naive ``datetime``.

    Both ``YYYY-MM-DDTHH:MM:SSZ`` and the variant carrying fractional seconds
    (``YYYY-MM-DDTHH:MM:SS.fffZ``) are accepted.

    Raises:
        ValueError: If the string matches neither layout.
    """
    for layout in ("%Y-%m-%dT%H:%M:%SZ", "%Y-%m-%dT%H:%M:%S.%fZ"):
        try:
            return datetime.datetime.strptime(timestamp, layout)
        except ValueError:
            continue
    raise ValueError(f"Unrecognised publishedAt timestamp: {timestamp!r}")

def get_comment_sentiments(video_id, youtube, max_comments=1000):
    """Analyse a video's top-level comments.

    Comments are fetched page by page (at most 100 per request) until
    ``max_comments`` have been collected or no further page exists. Each
    comment is truncated, cleaned, classified for sentiment and dominant
    emotion, and mined for keywords.

    Args:
        video_id: ID of the video whose comment threads are read.
        youtube: API client exposing ``commentThreads().list(...).execute()``.
        max_comments: Upper bound on the number of comments processed.

    Returns:
        A 6-tuple ``(comments, sentiment_counts, emotion_counts,
        top_keywords, top_active_users, comment_times)`` where

        * ``comments`` is a list of dicts with the original text, author,
          like count, publish timestamp, sentiment label and emotion label;
        * ``sentiment_counts`` / ``emotion_counts`` map label -> count;
        * ``top_keywords`` holds the 20 most common ``(word, count)`` pairs;
        * ``top_active_users`` holds the 10 most common ``(author, count)``
          pairs;
        * ``comment_times`` is a list of naive ``datetime`` objects.

    Raises:
        ValueError: If a comment's ``publishedAt`` value cannot be parsed.

    Errors raised by the API client or the models are not caught here and
    propagate to the caller.
    """
    comments = []
    sentiment_counts = Counter()
    emotion_counts = Counter()
    active_users = Counter()
    key_phrases = Counter()
    comment_times = []

    stop_words = set(stopwords.words('english') + stopwords.words('turkish'))

    # truncate_text decodes the shortened token list back to a string, and
    # re-tokenising that string is not guaranteed to give the same number of
    # tokens. Asking the pipelines to truncate as well keeps the model input
    # within the limit.
    model_kwargs = {'truncation': True, 'max_length': MAX_TOKENS}

    next_page_token = None

    while len(comments) < max_comments:
        response = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            textFormat="plainText",
            # Never request more than is still needed, capped at 100 per page.
            maxResults=min(max_comments - len(comments), 100),
            pageToken=next_page_token
        ).execute()

        for item in response.get('items', []):
            comment_snippet = item['snippet']['topLevelComment']['snippet']
            comment_text = comment_snippet['textDisplay']
            truncated_text = truncate_text(comment_text)
            preprocessed_text = preprocess_text(truncated_text)
            author = comment_snippet['authorDisplayName']
            like_count = comment_snippet['likeCount']
            published_at = comment_snippet['publishedAt']
            comment_times.append(_parse_published_at(published_at))

            # Sentiment classification
            sentiment = sentiment_analyzer(preprocessed_text, **model_kwargs)[0]
            sentiment_counts[sentiment['label']] += 1

            # Emotion classification: [0] is the list of per-label scores;
            # keep the highest-scoring label.
            emotions = emotion_analyzer(preprocessed_text, **model_kwargs)[0]
            dominant_emotion = max(emotions, key=lambda x: x['score'])
            emotion_counts[dominant_emotion['label']] += 1

            # Keyword counting: alphabetic, non-stop-word tokens longer than
            # three characters, lower-cased.
            for word in word_tokenize(preprocessed_text):
                word = word.lower()
                if word.isalpha() and word not in stop_words and len(word) > 3:
                    key_phrases[word] += 1

            # Per-author comment count
            active_users[author] += 1

            # The stored record keeps the original (untruncated) comment text.
            comments.append({
                'comment_text': comment_text,
                'author': author,
                'like_count': like_count,
                'published_at': published_at,
                'sentiment': sentiment['label'],
                'emotion': dominant_emotion['label']
            })

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break

    # The 20 most frequent keywords
    top_keywords = key_phrases.most_common(20)
    # The 10 most active users
    top_active_users = active_users.most_common(10)

    return comments, dict(sentiment_counts), dict(emotion_counts), top_keywords, top_active_users, comment_times

def generate_wordcloud(key_phrases):
    """Render the keywords as a word cloud and save it to 'wordcloud.png'.

    Args:
        key_phrases: Word frequencies, given as anything ``dict()`` accepts
            (for example a list of ``(word, count)`` pairs).

    When there are no keywords an empty image is written, so that a video
    with no usable comments does not abort the run.
    """
    frequencies = dict(key_phrases)

    plt.figure(figsize=(15, 7.5))
    try:
        # WordCloud refuses to build a cloud from zero words, so the cloud is
        # only generated when there is something to draw.
        if frequencies:
            wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(frequencies)
            plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.savefig('wordcloud.png')
    finally:
        # Close the figure even if rendering or saving fails.
        plt.close()

def plot_sentiment_distribution(sentiment_counts):
    """Draw the sentiment counts as a bar chart.

    Args:
        sentiment_counts: Mapping of sentiment label -> number of comments.

    The chart is saved as 'sentiment_distribution.png'.
    """
    categories = []
    totals = []
    for category, total in sentiment_counts.items():
        categories.append(category)
        totals.append(total)

    figure = plt.figure(figsize=(10, 7))
    axes = sns.barplot(x=categories, y=totals)
    axes.set_title('Duygu Dağılımı')
    axes.set_xlabel('Duygu')
    axes.set_ylabel('Sayı')
    figure.savefig('sentiment_distribution.png')
    plt.close(figure)

def plot_emotion_distribution(emotion_counts):
    """Draw the dominant-emotion counts as a bar chart.

    Args:
        emotion_counts: Mapping of emotion label -> number of comments.

    The chart is saved as 'emotion_distribution.png'.
    """
    categories = []
    totals = []
    for category, total in emotion_counts.items():
        categories.append(category)
        totals.append(total)

    figure = plt.figure(figsize=(12, 8))
    axes = sns.barplot(x=categories, y=totals)
    axes.set_title('Duygu Yoğunluğu Dağılımı')
    axes.set_xlabel('Duygu')
    axes.set_ylabel('Sayı')
    figure.savefig('emotion_distribution.png')
    plt.close(figure)

def plot_comment_timeline(comment_times):
    """Plot the number of comments per day and save it to 'comment_timeline.png'.

    Args:
        comment_times: Sequence of ``datetime`` objects, one per comment.

    When ``comment_times`` is empty the chart is saved with its title and
    axis labels but no line, instead of failing while resampling.
    """
    plt.figure(figsize=(15, 7))
    try:
        # An empty list does not produce a datetime index, which resample()
        # requires, so the daily aggregation only runs when there is data.
        if len(comment_times) > 0:
            df = pd.DataFrame({'timestamp': comment_times})
            df.set_index('timestamp', inplace=True)
            df['count'] = 1
            # One row per calendar day; days without comments sum to 0.
            df = df.resample('D').sum()
            df['count'].plot()
        plt.title('Zaman İçinde Yorum Dağılımı')
        plt.xlabel('Tarih')
        plt.ylabel('Yorum Sayısı')
        plt.savefig('comment_timeline.png')
    finally:
        # Close the figure even if plotting or saving fails.
        plt.close()

def save_metadata_to_json(metadata):
    """Write the video metadata to 'video_metadata.json'.

    Args:
        metadata: JSON-serialisable metadata dict.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped.
    """
    with open('video_metadata.json', 'w', encoding='utf-8') as out_file:
        serialized = json.dumps(metadata, ensure_ascii=False, indent=4)
        out_file.write(serialized)

def save_sentiment_analysis_to_json(comments, sentiment_counts, emotion_counts, top_keywords, top_active_users):
    """Write the comment analysis results to 'sentiment_analysis.json'.

    Args:
        comments: Per-comment records.
        sentiment_counts: Mapping of sentiment label -> count.
        emotion_counts: Mapping of emotion label -> count.
        top_keywords: Most frequent keywords with their counts.
        top_active_users: Most active authors with their comment counts.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters unescaped.
    """
    report = {}
    report['comments'] = comments
    report['sentiment_distribution'] = sentiment_counts
    report['emotion_distribution'] = emotion_counts
    report['top_keywords'] = top_keywords
    report['top_active_users'] = top_active_users

    dump_options = {'ensure_ascii': False, 'indent': 4}
    with open('sentiment_analysis.json', 'w', encoding='utf-8') as out_file:
        json.dump(report, out_file, **dump_options)

def main():
    """Run the full analysis for one video entered by the user.

    Steps: prompt for the URL, build the API client, resolve the video ID,
    save the metadata, analyse up to 1000 comments and save the results, then
    render the four charts. A status line is printed after each output file.
    """
    youtube_url = input("YouTube Video URL'sini girin: ")
    youtube = get_youtube_service()
    video_id = extract_video_id(youtube_url)

    # 1. Video metadata
    save_metadata_to_json(get_video_metadata(video_id, youtube))
    print("Video metadata bilgileri 'video_metadata.json' dosyasına kaydedildi.")

    # 2. Comment sentiment analysis and the derived statistics
    (
        comments,
        sentiments,
        emotions,
        keywords,
        active_users,
        timestamps,
    ) = get_comment_sentiments(video_id, youtube, max_comments=1000)
    save_sentiment_analysis_to_json(comments, sentiments, emotions, keywords, active_users)
    print("Sentiment analiz sonuçları 'sentiment_analysis.json' dosyasına kaydedildi.")

    # 3. Visualisations: (renderer, its input, message printed afterwards)
    chart_steps = (
        (generate_wordcloud, keywords,
         "Kelime bulutu 'wordcloud.png' olarak kaydedildi."),
        (plot_sentiment_distribution, sentiments,
         "Duygu dağılım grafiği 'sentiment_distribution.png' olarak kaydedildi."),
        (plot_emotion_distribution, emotions,
         "Duygu yoğunluğu grafiği 'emotion_distribution.png' olarak kaydedildi."),
        (plot_comment_timeline, timestamps,
         "Yorum zaman çizelgesi 'comment_timeline.png' olarak kaydedildi."),
    )
    for render, payload, notice in chart_steps:
        render(payload)
        print(notice)

if __name__ == "__main__":
    main()
