In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this script relies on; packages that are already
# present are simply reported as up to date.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases; 'punkt' is kept so older installations keep working.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words as a set, so the per-token membership test in clean_text is cheap.
stop_words = set(stopwords.words('english'))
# Single lemmatizer instance shared by every clean_text call.
lemmatizer = WordNetLemmatizer()

# === Clean Text Function ===
# Patterns used by clean_text, compiled once because the function runs per row.
# The URL pattern is anchored on a word boundary and a real URL prefix so that
# ordinary words containing "www" or "http" (e.g. "awwww") are left intact.
_URL_RE = re.compile(r'\b(?:https?://|www\.)\S+')
_MENTION_RE = re.compile(r'@\w+')
_HASHTAG_RE = re.compile(r'#\w+')
_NON_ASCII_RE = re.compile(r'[^\x00-\x7F]+')
_PUNCTUATION_RE = re.compile(r'[^\w\s]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(text):
    """Normalize free text into a space-joined string of lemmatized tokens.

    Processing order: lower-case; remove URLs, @mentions and #hashtags;
    replace non-ASCII runs and punctuation with a space; delete digits;
    collapse whitespace; tokenize with NLTK; drop English stop words and
    tokens of two characters or fewer; lemmatize the remaining tokens.

    Args:
        text: Value to clean. Null values (as judged by ``pd.isnull``) give
            ``""``; anything else is converted with ``str`` first.

    Returns:
        The surviving tokens joined by single spaces; may be ``""``.
    """
    if pd.isnull(text):
        return ""

    cleaned = str(text).lower()
    cleaned = _URL_RE.sub('', cleaned)
    cleaned = _MENTION_RE.sub('', cleaned)
    cleaned = _HASHTAG_RE.sub('', cleaned)
    cleaned = _NON_ASCII_RE.sub(' ', cleaned)
    # Punctuation becomes a space rather than nothing, so words separated only
    # by punctuation ("great,video") are not fused into one token.
    cleaned = _PUNCTUATION_RE.sub(' ', cleaned)
    cleaned = _DIGITS_RE.sub('', cleaned)
    cleaned = _WHITESPACE_RE.sub(' ', cleaned).strip()

    kept = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(cleaned)
        # Stop words and very short tokens are filtered before lemmatizing.
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(kept)

# === Load Data ===
# Read the four combined CSV exports, in order, from the working directory.
videos, comments, replies, edges = (
    pd.read_csv(csv_path)
    for csv_path in (
        'videos_combined.csv',
        'comments_combined.csv',
        'replies_combined.csv',
        'edges_combined.csv',
    )
)

# === Drop Rows with Essential Nulls, then Duplicates ===
# Null filtering runs first. drop_duplicates keeps the first row per key, so
# de-duplicating videos beforehand could keep a row whose description is null
# and then lose that video entirely, even if a later duplicate row had one.
videos = videos.dropna(subset=['description']).drop_duplicates(subset=['video_id'])

# Comments and replies are de-duplicated on the full row, so the order of the
# two steps does not change which rows survive for them.
comments = comments.dropna(subset=['text', 'author']).drop_duplicates()
replies = replies.dropna(subset=['text', 'author']).drop_duplicates()

# NOTE(review): dropna() without a subset removes an edge when ANY column is
# null, not only the channel-ID columns — confirm this is intended.
edges = edges.dropna().drop_duplicates()

# === Convert Date Columns to Datetime ===
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
for frame, date_col in (
    (videos, 'published'),
    (comments, 'published_at'),
    (replies, 'published_at'),
):
    frame[date_col] = pd.to_datetime(frame[date_col], errors='coerce')

# === Clean Text Columns ===
# For each frame: announce the step, then store clean_text's result for the
# raw text column in a new "cleaned" column (the raw column is left as is).
for message, frame, raw_col, cleaned_col in (
    ("Cleaning video descriptions...", videos, 'description', 'cleaned_description'),
    (" Cleaning comment text...", comments, 'text', 'cleaned_text'),
    (" Cleaning reply text...", replies, 'text', 'cleaned_text'),
):
    print(message)
    frame[cleaned_col] = frame[raw_col].apply(clean_text)

# === Clean Tags, Mentions, Hashtags in videos.csv ===
# Only columns actually present in the frame are processed.
list_like_cols = [
    name for name in ('tags', 'mentions', 'hashtags') if name in videos.columns
]
for name in list_like_cols:
    # Missing values become empty strings before lower-casing.
    lowered = videos[name].fillna('').astype(str).str.lower()
    # Keep only lower-case letters, digits, commas and spaces.
    videos[name] = lowered.str.replace(r'[^a-z0-9, ]', '', regex=True)

# === Clean edges.csv columns ===
print("🧼 Cleaning edges data...")

# Channel IDs: cast to string, trim surrounding whitespace, lower-case.
# NOTE(review): videos['channelId'] is not lower-cased anywhere in this script,
# so these IDs may differ from it in case — confirm downstream joins expect that.
for id_col in ('source_channelId', 'target_channelId'):
    edges[id_col] = edges[id_col].astype(str).str.strip().str.lower()

# video_ids is trimmed but keeps its original case.
edges['video_ids'] = edges['video_ids'].astype(str).str.strip()

# Counters: values that fail numeric parsing become 0, then everything is cast to int.
for count_col in ('mention_count', 'total_views'):
    as_number = pd.to_numeric(edges[count_col], errors='coerce')
    edges[count_col] = as_number.fillna(0).astype(int)

# === Drop Empty Cleaned Rows ===
# Keep only rows whose cleaned text is non-empty after trimming whitespace.
videos = videos.loc[videos['cleaned_description'].str.strip().ne('')]
comments = comments.loc[comments['cleaned_text'].str.strip().ne('')]
replies = replies.loc[replies['cleaned_text'].str.strip().ne('')]

# === Save Cleaned Files ===
# Each frame is written to the working directory without its index column.
outputs = (
    (videos, 'videos_cleaned_text.csv'),
    (comments, 'comments_cleaned_text.csv'),
    (replies, 'replies_cleaned_text.csv'),
    (edges, 'edges_cleaned.csv'),
)
for frame, out_path in outputs:
    frame.to_csv(out_path, index=False)

# Summary of the files that were just written.
print("\n Preprocessing complete. Cleaned files saved:")
for _, out_path in outputs:
    print(f"   - {out_path}")

# Row counts remaining in each frame after all filtering steps.
print("\nRows after cleaning:")
for label, frame in (
    ("Videos:   ", videos),
    ("Comments: ", comments),
    ("Replies:  ", replies),
    ("Edges:    ", edges),
):
    print(f"{label}{len(frame)}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


🧼 Cleaning video descriptions...
🧼 Cleaning comment text...
🧼 Cleaning reply text...
🧼 Cleaning edges data...

✅ Preprocessing complete. Cleaned files saved:
   - videos_cleaned_text.csv
   - comments_cleaned_text.csv
   - replies_cleaned_text.csv
   - edges_cleaned.csv

📊 Rows after cleaning:
Videos:   981
Comments: 78785
Replies:  13883
Edges:    3581


In [None]:
# === Print Column Names ===
# For each cleaned frame, print a heading followed by its column names as a list.
for heading, frame in (
    ("Column names in videos_cleaned_text.csv:", videos),
    ("\nColumn names in comments_cleaned_text.csv:", comments),
    ("\nColumn names in replies_cleaned_text.csv:", replies),
    ("\nColumn names in edges_cleaned.csv:", edges),
):
    print(heading)
    print(frame.columns.tolist())


📁 Column names in videos_cleaned_text.csv:
['video_id', 'title', 'channel', 'channelId', 'published', 'description', 'tags', 'mentions', 'hashtags', 'views', 'source_folder', 'cleaned_description']

📁 Column names in comments_cleaned_text.csv:
['comment_id', 'video_id', 'author', 'published_at', 'text', 'source_folder', 'cleaned_text']

📁 Column names in replies_cleaned_text.csv:
['video_id', 'in_reply_to', 'parent_author', 'author', 'published_at', 'text', 'source_folder', 'cleaned_text']

📁 Column names in edges_cleaned.csv:
['source_channelId', 'target_channelId', 'mention_count', 'total_views', 'video_ids', 'source_channelId_name', 'target_channelId_name', 'source_folder']
