# Data Exploration

In [1]:
import pandas as pd
import json

# Load the raw trending-videos table and report its dimensions.
videos = pd.read_csv('../raw_data/INvideos.csv')
print("Dataset Shape:", videos.shape)

# Only the final expression of a cell is echoed automatically, so results
# produced mid-cell are shown explicitly; as bare expressions they would be
# computed and then silently discarded.
display(videos.head())

# Column dtypes and non-null counts (info() prints by itself), followed by
# the number of missing values per column.
videos.info()
display(videos.isnull().sum())

# Read the category lookup file. The encoding is pinned so the result does
# not depend on the platform's default locale encoding.
with open('../raw_data/IN_category_id.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Map each integer category id to its title taken from the item's snippet.
categories = {int(item['id']): item['snippet']['title'] for item in data['items']}
print("Category Mapping Example:", list(categories.items())[:5])


Dataset Shape: (37352, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37352 entries, 0 to 37351
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                37352 non-null  object
 1   trending_date           37352 non-null  object
 2   title                   37352 non-null  object
 3   channel_title           37352 non-null  object
 4   category_id             37352 non-null  int64 
 5   publish_time            37352 non-null  object
 6   tags                    37352 non-null  object
 7   views                   37352 non-null  int64 
 8   likes                   37352 non-null  int64 
 9   dislikes                37352 non-null  int64 
 10  comment_count           37352 non-null  int64 
 11  thumbnail_link          37352 non-null  object
 12  comments_disabled       37352 non-null  bool  
 13  ratings_disabled        37352 non-null  bool  
 14  video_error_or_removed  373

# Data Cleaning & Transformation

In [4]:
import pandas as pd
import json

# Start from the raw CSV export.
videos = pd.read_csv('../raw_data/INvideos.csv')

# Remove exact duplicate rows and give missing descriptions a placeholder.
# Reassignment is used instead of inplace=True so each step yields a new frame.
videos = videos.drop_duplicates()
videos = videos.fillna({'description': 'No description'})

# trending_date is parsed as two-digit year, day, month separated by dots;
# with errors='coerce', values that cannot be parsed become NaT instead of raising.
videos['trending_date'] = pd.to_datetime(videos['trending_date'], format='%y.%d.%m', errors='coerce')
videos['publish_time'] = pd.to_datetime(videos['publish_time'], errors='coerce')

# Read the category lookup file. The encoding is pinned so the result does
# not depend on the platform's default locale encoding.
with open('../raw_data/IN_category_id.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Translate numeric category ids to titles; ids missing from the mapping
# come out of .map() as NaN.
category_map = {int(item['id']): item['snippet']['title'] for item in data['items']}
videos['category_name'] = videos['category_id'].map(category_map)

# Engagement ratios per view. Rows with zero views are masked to NaN before
# dividing: a plain division would yield inf for a non-zero numerator, and
# fillna(0) only replaces NaN, so inf would leak into the saved file.
views_nonzero = videos['views'].where(videos['views'] != 0)
videos['like_ratio'] = (videos['likes'] / views_nonzero).fillna(0)
videos['comment_ratio'] = (videos['comment_count'] / views_nonzero).fillna(0)

# Persist the cleaned frame without the index column.
videos.to_csv('../raw_data/cleaned_INvideos.csv', index=False)
print("✅ Cleaned data saved successfully!")


✅ Cleaned data saved successfully!
