In [35]:
import pandas as pd

## Datasets & Cleaning

### YouTube Viewership Data

In [36]:
# Read the raw YouTube CSV export into a DataFrame
youtube_data = pd.read_csv(filepath_or_buffer="../assets/raw/yt_counter_strike_monthly_top50.csv")

In [37]:
# Columns kept for the analysis; "month" is derived from "publishedAt" below.
youtube_fields = ['month', 'title', 'description', 'viewCount', 'likeCount', 'commentCount', 'favoriteCount', 'categoryId'] # Fields of interest

# This cell rebinds `youtube_data`, so it has to start from the freshly loaded frame.
# Without this guard, a second run (or an input lacking `publishedAt`) would fail
# further down with the much less helpful KeyError "['month'] not in index".
if "publishedAt" not in youtube_data.columns:
    raise KeyError(
        "'publishedAt' column not found in youtube_data; "
        "re-run the loading cell before running this cleaning cell"
    )

youtube_data = (
    youtube_data
    # Discard any pre-existing "month" column so "publishedAt" can take over that name
    .drop(columns=["month"], errors="ignore")
    .rename(columns={"publishedAt": "month"})
    .loc[:, youtube_fields]
    # `.assign` operates on a copy, so overwriting "month" here does not trigger the
    # SettingWithCopyWarning that assigning into a column-subset frame would.
    # Parse to datetime, drop timezone info, then snap every value to its month start.
    .assign(
        month=lambda frame: (
            pd.to_datetime(frame["month"])
            .dt.tz_localize(None)
            .dt.to_period("M")
            .dt.to_timestamp()
        )
    )
    .sort_values(by="viewCount", ascending=False)
    # Nullable integer dtype keeps missing view counts as <NA> instead of forcing floats
    .astype({"viewCount": "Int64"})
    # Positionally drop the first row after the descending sort (treated as an outlier)
    .iloc[1:]
)

In [38]:
# Ascending-by-viewCount version of the data; `youtube_data` itself is not modified
youtube_data_sorted_by_viewCount = youtube_data.sort_values(by="viewCount", ascending=True)

In [39]:
# One row per month, with views, likes, comments and favorites summed across that month's rows
youtube_data_views_by_month = (
    youtube_data
    .groupby("month", as_index=False)[["viewCount", "likeCount", "commentCount", "favoriteCount"]]
    .sum()
    .reset_index(drop=True)
)

In [40]:
# Persist the monthly aggregates as a UTF-8 CSV with a header row and no index column
youtube_data_views_by_month.to_csv(
    "../assets/clean/Youtube_Counter-Strike_Clean.csv",
    index=False,
    header=True,
    encoding="utf-8",
)