<a href="https://colab.research.google.com/github/BrendaG04/Google1D/blob/main/notebooks/RU_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Libraries

In [9]:
%pip install pandas numpy matplotlib seaborn wordcloud deep-translator

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression


Collecting wordcloud
  Downloading wordcloud-1.9.4-cp313-cp313-macosx_11_0_arm64.whl.metadata (3.4 kB)
Collecting deep-translator
  Downloading deep_translator-1.11.4-py3-none-any.whl.metadata (30 kB)
Downloading wordcloud-1.9.4-cp313-cp313-macosx_11_0_arm64.whl (167 kB)
Downloading deep_translator-1.11.4-py3-none-any.whl (42 kB)
Installing collected packages: deep-translator, wordcloud
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [wordcloud]
[1A[2KSuccessfully installed deep-translator-1.11.4 wordcloud-1.9.4
Note: you may need to restart the kernel to use updated packages.


# Exploratory Data Analysis (EDA) Plan for Russia YouTube Trending Dataset

This notebook follows the same EDA pipeline used for other country datasets (e.g., US, KR, etc.).

Steps included:
1. **Data Overview:** Load and inspect the data.
2. **Feature Understanding:** Review columns, unique values, and data types.
3. **Univariate Analysis:** Examine numeric distributions (views, likes, etc.).
4. **Bivariate/Multivariate Analysis:** Study relationships between features.
5. **Temporal Analysis:** Analyze publish time and trending patterns.
6. **Text Analysis:** Visualize frequent words and tags.
7. **Outlier Detection:** Identify unusual records.
8. **Summary:** Summarize insights.


In [10]:
# Data Overview and Centralized Cleaning for Russia Dataset
data_path = '../data-local/raw/RU_youtube_trending.csv'
youtube_data_ru = pd.read_csv(data_path)

# ---- CLEANING ----
youtube_data_ru_clean = youtube_data_ru.drop_duplicates().copy()
if 'description' in youtube_data_ru_clean.columns:
    youtube_data_ru_clean['description'] = youtube_data_ru_clean['description'].fillna('')

# Inspect data
print("Shape after duplicate removal:", youtube_data_ru_clean.shape)
print("\nData Types:\n", youtube_data_ru_clean.dtypes.head(10))
print("\nMissing Values in key columns:\n", youtube_data_ru_clean[['view_count','likes','dislikes','comment_count']].isnull().sum())

display(youtube_data_ru_clean.head(3))


FileNotFoundError: [Errno 2] No such file or directory: '../data-local/raw/RU_youtube_trending.csv'

In [5]:
# ==== 2) FEATURE UNDERSTANDING ====
num_cols_guess = ["views","likes","dislikes","comment_count","comments","commentCount"]
cat_cols_guess = ["channel_title","channelTitle","category_id","categoryId"]
time_cols_guess = ["publish_time","publishedAt","trending_date","trending_date_time","trending_date_time_utc"]

# Normalize a few common column names (creates new standard columns if needed)
def copy_if_exists(src, dst):
    if src in df.columns and dst not in df.columns:
        df[dst] = df[src]

copy_if_exists("comment_count", "comments")
copy_if_exists("commentCount", "comments")
copy_if_exists("channelTitle", "channel_title")
copy_if_exists("categoryId", "category_id")
copy_if_exists("videoId", "video_id")

# Numeric columns (present in data)
num_cols = [c for c in ["views","likes","dislikes","comments"] if c in df.columns]
cat_cols = [c for c in ["channel_title","category_id"] if c in df.columns]
time_cols = [c for c in time_cols_guess if c in df.columns]

print("Numeric cols:", num_cols)
print("Categorical cols:", cat_cols)
print("Time cols:", time_cols)

# Create engagement ratios if possible
if set(["likes","views"]).issubset(df.columns):
    df["like_ratio"] = df["likes"] / (df["views"].replace(0, np.nan))
if set(["comments","views"]).issubset(df.columns):
    df["comment_ratio"] = df["comments"] / (df["views"].replace(0, np.nan))

for c in ["like_ratio","comment_ratio"]:
    if c in df.columns:
        num_cols.append(c)

# Basic uniques snapshot
summary = []
for c in df.columns:
    uniq = df[c].nunique(dropna=True)
    summary.append((c, str(df[c].dtype), uniq))
pd.DataFrame(summary, columns=["column","dtype","n_unique"]).sort_values("n_unique").head(15)


Numeric cols: ['likes', 'dislikes', 'comments']
Categorical cols: ['channel_title', 'category_id']
Time cols: ['publishedAt', 'trending_date']


Unnamed: 0,column,dtype,n_unique
14,ratings_disabled,bool,2
13,comments_disabled,bool,2
18,category_id,int64,15
5,categoryId,int64,15
6,trending_date,object,1323
3,channelId,object,8058
4,channelTitle,object,8482
17,channel_title,object,8482
10,dislikes,int64,9084
16,comments,int64,20151
