In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [2]:
# Load the video metadata CSV into a DataFrame; the path is relative to the
# notebook's working directory.
df = pd.read_csv('youtube_final_data.csv')

# Inspecting the dataset

In [4]:
# Preview the first five rows to check the column layout and raw value formats.
df.head()

Unnamed: 0,Title,Video_length,Video_link,Likes,Views,Publish_date,Comments
0,Operation Sindoor: Brain + Bravery Based JEE Q...,10:12,https://www.youtube.com/watch?v=dVlf8zPZavQ&pp...,298,6268,"May 14, 2025",34
1,🔥 JEE Advanced 2025: Last-Minute Qualitative A...,04:53,https://www.youtube.com/watch?v=LlqfJW-aXC4,450,8986,"May 13, 2025",48
2,What to Do After CBSE 12th Result 2025? | Comp...,07:15,https://www.youtube.com/watch?v=x3UsrLDtTuM,1091,42181,"May 13, 2025",621
3,JEE ADVANCED ADMIT CARD 2025: MUST-KNOW RULES ...,05:37,https://www.youtube.com/watch?v=JQ6f_7m1Qcs,500,21052,"May 12, 2025",152
4,Important Lessons from NEET 2025 for Future JE...,18:57,https://www.youtube.com/watch?v=l_srV0ZI1Ws,447,9717,"May 6, 2025",106


In [5]:
# Preview the last five rows; fully empty trailing rows show up here.
df.tail()

Unnamed: 0,Title,Video_length,Video_link,Likes,Views,Publish_date,Comments
1275,,,,,,,
1276,,,,,,,
1277,,,,,,,
1278,,,,,,,
1279,,,,,,,


In [6]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1280 entries, 0 to 1279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Title         1248 non-null   object
 1   Video_length  1248 non-null   object
 2   Video_link    1248 non-null   object
 3   Likes         1246 non-null   object
 4   Views         1247 non-null   object
 5   Publish_date  1247 non-null   object
 6   Comments      1225 non-null   object
dtypes: object(7)
memory usage: 70.1+ KB


In [7]:
# Every column is still object dtype at this point, so describe() reports
# count/unique/top/freq rather than numeric statistics.
df.describe()

Unnamed: 0,Title,Video_length,Video_link,Likes,Views,Publish_date,Comments
count,1248,1248,1248,1246,1247,1247,1225
unique,1187,1018,1198,940,1189,833,360
top,JEE Main 2021 Question Paper Solutions | 20th ...,01:00,https://www.youtube.com/watch?v=STG1U-46UAQ,368,8263,"Jul 20, 2021",7
freq,36,11,6,8,6,39,17


In [8]:
# Count the missing values in each column of the raw frame.
df.isna().sum()

Title           32
Video_length    32
Video_link      32
Likes           34
Views           33
Publish_date    33
Comments        55
dtype: int64

# Data Cleaning and Preprocessing

In [10]:
# Keep only the rows that actually have a Title.
df = df.loc[df['Title'].notna()]

In [11]:
# Re-count missing values now that rows without a Title are gone.
df.isna().sum()

Title            0
Video_length     0
Video_link       0
Likes            2
Views            1
Publish_date     1
Comments        23
dtype: int64

In [12]:
# Keep only the rows where both Views and Likes are present.
df = df.loc[df[['Views', 'Likes']].notna().all(axis=1)]

In [13]:
# Treat a missing comment count as zero comments. The fill is limited to the
# Comments column so that text/date columns never receive an integer 0
# placeholder, and the result is re-assigned instead of mutating in place.
df = df.fillna({'Comments': 0})

In [14]:
# Confirm how many missing values remain in each column after cleaning.
df.isna().sum()

Title           0
Video_length    0
Video_link      0
Likes           0
Views           0
Publish_date    0
Comments        0
dtype: int64

In [15]:
# Inspect the current dtype of each column.
df.dtypes

Title           object
Video_length    object
Video_link      object
Likes           object
Views           object
Publish_date    object
Comments        object
dtype: object

In [16]:
# Likes, Views and Comments are stored as text; remove any ',' characters and
# cast to integers. One loop replaces three copies of the same expression, and
# regex=False makes the literal replacement independent of the pandas version.
for count_col in ['Likes', 'Views', 'Comments']:
    df[count_col] = (
        df[count_col].astype(str).str.replace(',', '', regex=False).astype(int)
    )

# Remove the 'Premiered ' label where present so every value matches the
# '%b %d, %Y' layout (e.g. "May 14, 2025") before parsing to datetime.
df['Publish_date'] = pd.to_datetime(
    df['Publish_date'].str.replace('Premiered ', '', regex=False),
    format='%b %d, %Y',
)

In [34]:
# Inspect dtypes again to verify the numeric and datetime conversions took effect.
df.dtypes

Title                   object
Video_length            object
Video_link              object
Likes                    int32
Views                    int32
Publish_date    datetime64[ns]
Comments                 int32
dtype: object

# Feature Engineering

In [41]:
def duration_in_seconds(duration):
    """Convert an 'MM:SS' or 'HH:MM:SS' duration string to total seconds.

    Parameters
    ----------
    duration : str
        Colon-separated duration such as '04:53' or '1:02:03'.

    Returns
    -------
    int
        Total number of seconds. 0 is returned for anything that cannot be
        parsed: non-string input (e.g. NaN or a numeric fill value),
        non-numeric or empty fields, or a field count other than two or three.
    """
    # Missing values arrive as floats/ints, which have no .split(); treat them
    # like any other unparseable duration instead of raising AttributeError.
    if not isinstance(duration, str):
        return 0

    try:
        parts = [int(field) for field in duration.split(':')]
    except ValueError:
        # A field was empty or not an integer.
        return 0

    if len(parts) == 3:
        hours, minutes, seconds = parts
    elif len(parts) == 2:
        hours = 0
        minutes, seconds = parts
    else:
        return 0
    return hours * 3600 + minutes * 60 + seconds

In [43]:
# Derive a numeric duration column by converting each Video_length value.
df['Video_length_seconds'] = df['Video_length'].map(duration_in_seconds)

In [45]:
# Verify that Video_length_seconds was added with an integer dtype.
df.dtypes

Title                           object
Video_length                    object
Video_link                      object
Likes                            int32
Views                            int32
Publish_date            datetime64[ns]
Comments                         int32
Video_length_seconds             int64
dtype: object

In [53]:
# Write the cleaned frame to disk; index=False keeps the row index out of the file.
df.to_csv('Final_data.csv', index=False)