## Import

In [56]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import isodate

## Read raw data from file

In [57]:
raw_df = pd.read_csv("../data/raw/video_data_raw.csv")
raw_df.head()

Unnamed: 0,video_id,channelTitle,title,description,tags,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration,definition,caption
0,WkqM0ndr42c,Data School,My top 50 scikit-learn tips,"If you already know the basics of scikit-learn, but you want to be more efficient and get up-to-...","['python', 'data science', 'machine learning', 'scikit-learn']",2023-04-20T14:56:43Z,8210.0,355.0,,45.0,PT2H47M31S,hd,False
1,tWFQqaRtSQA,Data School,21 more pandas tricks,"You're about to learn 21 tricks that will help you to work faster, write better pandas code, and...","['python', 'pandas', 'data analysis', 'data science']",2022-05-12T16:20:41Z,43576.0,1379.0,,67.0,PT24M40S,hd,False
2,gd-TZut-oto,Data School,Adapt this pattern to solve many Machine Learning problems,Here's a simple pattern that can be adapted to solve many ML problems. It has plenty of shortcom...,"['python', 'machine learning', 'scikit-learn', 'data science']",2021-10-28T16:16:39Z,11707.0,435.0,,16.0,PT7M49S,hd,False
3,v2QpvCJ1ar8,Data School,Tune multiple models simultaneously with GridSearchCV,You can tune 2+ models using the same grid search! Here's how:\n1. Create multiple parameter dic...,"['python', 'machine learning', 'scikit-learn', 'data science']",2021-10-26T13:53:24Z,6453.0,198.0,,14.0,PT5M7S,hd,False
4,sMlsd2CnIf4,Data School,Access part of a Pipeline using slicing,Want to operate on part of a Pipeline (instead of the whole thing)? Slice it using Python's slic...,"['python', 'machine learning', 'scikit-learn', 'data science']",2021-10-21T14:49:56Z,2413.0,64.0,,4.0,PT3M38S,hd,False


- In here, we will change the format of `tags` string.

In [58]:
raw_df['tags'] = raw_df['tags'].apply(lambda x: '|'.join(x.strip("[]").replace("'", "").split(', ')) if isinstance(x, str) else x)
raw_df['tags']

0                                                          python|data science|machine learning|scikit-learn
1                                                                   python|pandas|data analysis|data science
2                                                          python|machine learning|scikit-learn|data science
3                                                          python|machine learning|scikit-learn|data science
4                                                          python|machine learning|scikit-learn|data science
                                                        ...                                                 
60027    free fme license|fme license key|fme home license|fme software|fme|safe software fme|fme data in...
60028    s3connector|api to s3 fme|httpcaller|fme data integration|fme software|safe software|api to s3|f...
60029    temppathnamecreater|FME Transformer Guide|temporay paths fme|fme software|fme software tutorial|...
60030    csv to sha

### How many rows and how many columns does the raw data have?

In [59]:
data_video_shape = raw_df.shape
print(f"Video data current shape: {data_video_shape}")

Video data current shape: (60032, 13)


### What is the meaning of each row?

- Answer: Based on the observations of the dataset, we suppose that each row represents a unique set of details regarding individual YouTube videos

### What does each column mean?


<table style="width: 100%; border-collapse: collapse;">
  <thead>
    <tr>
      <th style="min-width: 10px; width: 30px; background-color: #04B1CC; color: white; font-size :15px; font-weight: bold; text-align: left; padding: 8px;">No</th>
      <th style="background-color: #04B1CC; color: white; font-size :15px; font-weight: bold; text-align: left; padding: 8px;">Columns</th>
      <th style="background-color: #04B1CC; color: white; font-size :15px; font-weight: bold; text-align: left; padding: 8px;">Meaning</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td style="font-size: 14px; text-align: left;">1</td>
      <td style="font-size: 14px; text-align: left;">video_id</td>
      <td style="font-size: 14px; text-align: left;">Unique identifier for the YouTube video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">2</td>
      <td style="font-size: 14px; text-align: left;">channelTitle</td>
      <td style="font-size: 14px; text-align: left;">The name of the channel that uploaded the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">3</td>
      <td style="font-size: 14px; text-align: left;">title</td>
      <td style="font-size: 14px; text-align: left;">Title of the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">4</td>
      <td style="font-size: 14px; text-align: left;">description</td>
      <td style="font-size: 14px; text-align: left;">Description or summary of the video content.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">5</td>
      <td style="font-size: 14px; text-align: left;">tags</td>
      <td style="font-size: 14px; text-align: left;">Keywords or tags associated with the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">6</td>
      <td style="font-size: 14px; text-align: left;">publishedAt</td>
      <td style="font-size: 14px; text-align: left;">Date and time when the video was published.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">7</td>
      <td style="font-size: 14px; text-align: left;">viewCount</td>
      <td style="font-size: 14px; text-align: left;">Number of views the video has accumulated.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">8</td>
      <td style="font-size: 14px; text-align: left;">likeCount</td>
      <td style="font-size: 14px; text-align: left;">Count of likes received by the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">9</td>
      <td style="font-size: 14px; text-align: left;">favoriteCount</td>
      <td style="font-size: 14px; text-align: left;">Deprecated; used to track how many times viewers added the video to their favorites.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">10</td>
      <td style="font-size: 14px; text-align: left;">commentCount</td>
      <td style="font-size: 14px; text-align: left;">Number of comments posted on the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">11</td>
      <td style="font-size: 14px; text-align: left;">duration</td>
      <td style="font-size: 14px; text-align: left;">Length of the video.</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">12</td>
      <td style="font-size: 14px; text-align: left;">definition</td>
      <td style="font-size: 14px; text-align: left;">Video resolution or quality (e.g., HD, SD).</td>
    </tr>
    <tr>
      <td style="font-size: 14px; text-align: left;">13</td>
      <td style="font-size: 14px; text-align: left;">caption</td>
      <td style="font-size: 14px; text-align: left;"> Indicates whether closed captions are available for the video.</td>
    </tr>
  </tbody>
</table>

### Does the raw data have duplicate rows?

In [60]:
# retrieve the index
index = raw_df.index
# create a Pandas Series indicating whether each index is duplicated or not
deDupSeries = index.duplicated(keep='first')
# calculate the number of duplicated rows
num_duplicated_rows = deDupSeries.sum()

In [61]:
if num_duplicated_rows == 0:
    print(f"Raw data have no duplicated line !")
else:
    if num_duplicated_rows > 1:
        ext = "lines"
    else:
        ext = "line"
    print(f"Raw data have {num_duplicated_rows} duplicated " + ext)

Raw data have no duplicated line !


### What data type does each column currently have? Are there any columns having inappropriate data types?

In [62]:
raw_df.dtypes

video_id           object
channelTitle       object
title              object
description        object
tags               object
publishedAt        object
viewCount         float64
likeCount         float64
favouriteCount    float64
commentCount      float64
duration           object
definition         object
caption              bool
dtype: object

- We notice that the columns `publishedAt` and `duration` are currently of object type. Given that they represent time periods, it is advisable to convert `publishedAt`  to datetime type. As the 'duration' is in ISO 8601 format, we'll convert it into a float representing the total number of seconds

In [63]:
# convert publishedAt to datetime
raw_df["publishedAt"] = pd.to_datetime(raw_df["publishedAt"])
# convert duration to float
raw_df['duration'] = raw_df['duration'].apply(lambda x: isodate.parse_duration(x))
raw_df['duration'] = raw_df['duration'].dt.total_seconds()
raw_df['duration'] = raw_df['duration'].astype(np.int64)

Since the `publishedAt` column only carries general values, we can extract more useful information from this data. The special thing is that the extracted information will be categorical values.

In [64]:
if 'publishedAt' in raw_df.columns:
    # Extract time features from `publishedAt` column
    raw_df['hour'] = raw_df['publishedAt'].dt.hour
    raw_df['day_of_week'] = raw_df['publishedAt'].dt.dayofweek  # Monday = 0, Sunday = 6
    raw_df['month'] = raw_df['publishedAt'].dt.month

    # Change datatype of the above columns to categorical
    raw_df['hour'] = raw_df['hour'].astype('str')
    raw_df['day_of_week'] = raw_df['day_of_week'].astype('str')
    raw_df['month'] = raw_df['month'].astype('str')

In [65]:
# TEST
raw_df.dtypes

video_id                       object
channelTitle                   object
title                          object
description                    object
tags                           object
publishedAt       datetime64[ns, UTC]
viewCount                     float64
likeCount                     float64
favouriteCount                float64
commentCount                  float64
duration                        int64
definition                     object
caption                          bool
hour                           object
day_of_week                    object
month                          object
dtype: object

### With each numerical column, how are values distributed?

For columns with numeric data types, we will calculate some basic statistics:

* Percentage (from 0 to 100) of missing values
* The min
* The lower quartile (phân vị 25)
* The median (phân vị 50)
* The upper quartile (phân vị 75)
* The max
Then we will observe and comment to see if the results are unusual?

In [66]:
num_col_info_df = raw_df.select_dtypes(exclude=['object', 'bool'])

def missing_ratio(s):
    return (s.isna().mean() * 100)

def median(df):
    return (df.quantile(0.5))

def lower_quartile(df):
    return (df.quantile(0.25))

def upper_quartile(df):
    return (df.quantile(0.75))

num_col_info_df = num_col_info_df.agg([missing_ratio, "min", lower_quartile, median, upper_quartile, "max"])
num_col_info_df

Unnamed: 0,publishedAt,viewCount,likeCount,favouriteCount,commentCount,duration
missing_ratio,0.0,0.006663113,0.404784,100.0,1.279318,0.0
min,2006-10-25 10:28:09+00:00,0.0,0.0,,0.0,0.0
lower_quartile,2019-02-18 04:02:18+00:00,622.0,11.0,,0.0,293.0
median,2020-12-22 09:26:05.500000+00:00,3059.5,55.0,,5.0,695.0
upper_quartile,2022-07-05 06:04:02.500000+00:00,16021.75,334.0,,27.0,1786.0
max,2023-11-24 13:04:35+00:00,34476450.0,571358.0,,60054.0,92218.0


- We notice that `favouriteCount` does not have any value, so we can remove this feature.

In [67]:
# remove favouriteCount
if "favouriteCount" in raw_df.columns:
    raw_df = raw_df.drop('favouriteCount', axis=1)

-  `viewCount`, `likeCount` and `commentCount` have little missing value, we'll fill it with the median.

In [68]:
# fill missing value with median
missing_cols = ['viewCount', 'likeCount', 'commentCount']
for col in missing_cols:
    raw_df[col] = raw_df[col].fillna(raw_df[col].median().__round__(0))
    raw_df[col] = raw_df[col].astype(np.int64)

In [69]:
# TEST
raw_df.select_dtypes(exclude=['object', 'bool'])\
    .agg([missing_ratio, "min", lower_quartile, median, upper_quartile, "max"])

Unnamed: 0,publishedAt,viewCount,likeCount,commentCount,duration
missing_ratio,0.0,0.0,0.0,0.0,0.0
min,2006-10-25 10:28:09+00:00,0.0,0.0,0.0,0.0
lower_quartile,2019-02-18 04:02:18+00:00,622.0,11.0,1.0,293.0
median,2020-12-22 09:26:05.500000+00:00,3060.0,55.0,5.0,695.0
upper_quartile,2022-07-05 06:04:02.500000+00:00,16017.25,332.0,26.0,1786.0
max,2023-11-24 13:04:35+00:00,34476453.0,571358.0,60054.0,92218.0


### With each categorical column, how are values distributed?

For columns with non-numeric data types, we calculate:

* Percentage (from 0 to 100) of missing values
* Number of different values (and we do not consider missing values)
* Show a few values and percentage (from 0 to 100) of each value sorted by decreasing percentage (we do not consider missing values, the ratio is the ratio compared to the number of non-missing values)
  
Then we will observe and comment to see if the results are unusual?

In [70]:
pd.set_option('display.max_colwidth', 100) # For clearly
pd.set_option('display.max_columns', None) # For clearly

In [71]:
cat_col_info_df = raw_df.select_dtypes(include=['object', 'bool'])

def missing_ratio(s):
    return (s.isna().mean() * 100)

def num_values(s):
    s = s.astype('str').str.split(';')
    s = s.explode()
    return len(s.value_counts())

def value_ratios(s):
    s = s.astype('str').str.split(';')
    s = s.explode()
    totalCount = (~s.isna()).sum()
    return ((s.value_counts()/totalCount*100).round(1)).to_dict()

cat_col_info_df = cat_col_info_df.agg([missing_ratio, num_values, value_ratios])
cat_col_info_df

Unnamed: 0,video_id,channelTitle,title,description,tags,definition,caption,hour,day_of_week,month
missing_ratio,0.0,0.0,0.0,2.803505,18.956557,0.0,0.0,0.0,0.0,0.0
num_values,60032,160,59644,53351,35545,2,2,24,7,12
value_ratios,"{'WkqM0ndr42c': 0.0, 'ls8hEDjhKU0': 0.0, 'Fh0ArGT2_b0': 0.0, '_tHCoU5TZZg': 0.0, 'fXAvXLOUjlU': ...","{'itversity': 5.8, 'Databricks': 4.7, 'Great Learning': 3.1, 'Analytics India Magazine': 3.0, 'K...","{'TL': 0.1, 'Data Analyst MENTORSHIP - Q&A (while I drink coffee)': 0.1, 'Machine Learning Summ...","{'nan': 2.7, 'Connect with me or follow me at https://www.linkedin.com/in/durga0gadiraju https:/...","{'nan': 18.9, 'Databricks': 2.2, 'CBMM|Center for Brains Minds and Machines|Artificial Intellige...","{'hd': 95.5, 'sd': 4.5}","{'False': 88.6, 'True': 11.4}","{'15': 8.6, '13': 8.5, '16': 7.8, '14': 7.6, '12': 6.3, '17': 5.3, '4': 4.5, '11': 4.4, '18': 3....","{'2': 17.2, '3': 16.5, '0': 16.2, '1': 15.7, '4': 15.1, '5': 10.1, '6': 9.3}","{'10': 8.9, '7': 8.9, '3': 8.8, '4': 8.7, '6': 8.7, '11': 8.6, '1': 8.4, '8': 8.2, '5': 8.2, '9'..."


### Min? Max? Are they abnormal?

In [72]:
raw_df.describe()

Unnamed: 0,viewCount,likeCount,commentCount,duration
count,60032.0,60032.0,60032.0,60032.0
mean,48489.31,1120.36274,69.066015,1496.953575
std,358507.3,8272.3426,529.35471,2475.709199
min,0.0,0.0,0.0,0.0
25%,622.0,11.0,1.0,293.0
50%,3060.0,55.0,5.0,695.0
75%,16017.25,332.0,26.0,1786.0
max,34476450.0,571358.0,60054.0,92218.0


### Save the processed data

In [73]:
print(f"Total number of features: {raw_df.shape[1]}")
raw_df.dtypes

Total number of features: 15


video_id                     object
channelTitle                 object
title                        object
description                  object
tags                         object
publishedAt     datetime64[ns, UTC]
viewCount                     int64
likeCount                     int64
commentCount                  int64
duration                      int64
definition                   object
caption                        bool
hour                         object
day_of_week                  object
month                        object
dtype: object

In [74]:
# Save processed data to disk
raw_df.to_csv("../data/processed/" + "video_data_processed.csv", index=False)