# In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from collections import Counter
import networkx as nx
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Load dataset
comments_df = pd.read_csv("youtube_comments_dataset.csv")

# Text processing
# Empty CSV cells are read as float NaN and word_tokenize raises on
# non-string input, so coerce every comment to text before tokenizing.
# The 'comment_text' column itself is left untouched; a missing comment
# simply yields an empty token list.
# NOTE(review): word_tokenize / pos_tag need NLTK's tokenizer and tagger data
# to be installed (nltk.download) -- confirm the resource names for the NLTK
# version in use.
comment_texts = comments_df['comment_text'].fillna('').astype(str)
comments_df['tokenized_text'] = comment_texts.apply(word_tokenize)
comments_df['pos_tags'] = comments_df['tokenized_text'].apply(pos_tag)

# Identify interactions
# Build an undirected graph with one edge per reply row, linking the replying
# commenter to the commenter named as its parent.
reply_graph = nx.Graph()
is_reply = comments_df['reply'] == 'yes'
# Select the reply rows in one vectorized step instead of iterating with
# iterrows(); drop rows missing either name so they do not add a NaN node.
reply_edges = comments_df.loc[
    is_reply, ['commenter_name', 'parent_commenter_name']
].dropna()
reply_graph.add_edges_from(reply_edges.itertuples(index=False, name=None))

# Pronoun variation analysis
# Count, across all comments, how often each tracked personal pronoun occurs
# (case-insensitively) among tokens tagged 'PRP'.
pronouns = ['I', 'you', 'he', 'she', 'it', 'we', 'they']
# Tokens are lower-cased before the membership test, so the lookup must be
# lower-cased as well; testing against the list directly never matched 'I'.
pronoun_lookup = {pronoun.lower() for pronoun in pronouns}
pronoun_counts = Counter()
for tags in comments_df['pos_tags']:
    for word, tag in tags:
        if tag != 'PRP':
            continue
        lowered = word.lower()
        if lowered in pronoun_lookup:
            pronoun_counts[lowered] += 1

# Theme extraction using TF-IDF
# Vectorize the comments (English stop words removed) and fit a 5-topic LDA
# model on the resulting matrix.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
# TfidfVectorizer rejects NaN documents, which is what empty CSV cells are
# read as, so treat missing comments as empty strings. The row count of the
# matrix still matches comments_df.
tfidf_matrix = tfidf_vectorizer.fit_transform(
    comments_df['comment_text'].fillna('').astype(str)
)
# NOTE(review): scikit-learn's LDA examples fit on raw term counts
# (CountVectorizer) rather than TF-IDF weights -- confirm TF-IDF input is
# intended here.
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(tfidf_matrix)

# Speaker identification: bucket the comments by the name of their author.
speaker_groups = comments_df.groupby(by='commenter_name')

# Example output: the five most frequent pronouns together with their counts.
top_pronouns = pronoun_counts.most_common(5)
print("Top pronouns used:", top_pronouns, sep="\n")

# Example visualization: draw the reply network with node labels on the
# current matplotlib figure, then show it.
import matplotlib.pyplot as plt

draw_options = {"with_labels": True}
nx.draw(reply_graph, **draw_options)
plt.show()

