In [1]:
import pandas as pd 
import numpy as np

#### Load the dataset 

In [5]:
# CSV to analyse. A bare file name, so it is resolved against the current
# working directory of the kernel.
file_name = 'malayalam_youtube_tech_data_final.csv'
df = pd.read_csv(filepath_or_buffer=file_name)

# Confirmation line followed by the frame's dimensions, emitted in one call.
print(
    f"\nSuccessfully loaded '{file_name}'",
    f"Your dataset has {len(df)} rows and {len(df.columns)} columns.",
    sep="\n",
)


Successfully loaded 'malayalam_youtube_tech_data_final.csv'
Your dataset has 16806 rows and 6 columns.


In [7]:
# Preview the first five rows. Leaving the frame as the cell's last
# expression lets Jupyter render it as a single table, whereas print() falls
# back to plain text that wraps wide frames into several column blocks.
df.head()

                                         video_title  view_count  like_count  \
0  Giant Chakra | Diwali Special | ഈ ചക്രം കറങ്ങു...      267981       11664   
1  Giant Flower Pot | Diwali Special | ആന മേശപ്പൂ...      318282       14182   
2  ഞങ്ങൾക്ക് ഒരു മകൾ ജനിച്ചു | Our First Baby | M...     1125747       28505   
3  Robo War | Red Vs Blue | Who Will Be The Winne...      582108       13896   
4  Giant Avoli Fish Grill | ആവോലിമീനെ ചുട്ടത് | M...      917272       24545   

   comment_count channel_name  subscriber_count  
0            305      M4 Tech          13700000  
1            271      M4 Tech          13700000  
2           1274      M4 Tech          13700000  
3            341      M4 Tech          13700000  
4            551      M4 Tech          13700000  


#### Inspect Data

In [10]:
# (rows, columns) of the loaded frame.
df.shape

(16806, 6)

In [12]:
# Column labels present in the loaded frame.
df.columns

Index(['video_title', 'view_count', 'like_count', 'comment_count',
       'channel_name', 'subscriber_count'],
      dtype='object')

In [14]:
# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16806 entries, 0 to 16805
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   video_title       16806 non-null  object
 1   view_count        16806 non-null  int64 
 2   like_count        16806 non-null  int64 
 3   comment_count     16806 non-null  int64 
 4   channel_name      16806 non-null  object
 5   subscriber_count  16806 non-null  int64 
dtypes: int64(4), object(2)
memory usage: 787.9+ KB


#### Check for Missing Values

In [17]:
# Tally null entries column by column; isna() is the same check as isnull().
missing_values = df.isna().sum()
print(missing_values)

video_title         0
view_count          0
like_count          0
comment_count       0
channel_name        0
subscriber_count    0
dtype: int64


#### Check for Duplicates

In [22]:
# Count rows that repeat an earlier row in every column.
duplicate_rows = df.duplicated(keep='first').sum()
print(f"Found {duplicate_rows} duplicate rows.")

if duplicate_rows == 0:
    print("Result: No duplicate rows found. Your data is unique.")
else:
    # Keep the first occurrence of each repeated row and discard the rest.
    df = df.drop_duplicates(keep='first')
    print(f"Removed {duplicate_rows} duplicates. Your dataset now has {len(df)} rows.")

Found 0 duplicate rows.
Result: No duplicate rows found. Your data is unique.


#### Get Descriptive Statistics

In [26]:
# Summary statistics for the numeric columns. Left as the cell's last
# expression so Jupyter renders it as a table rather than print()'s plain text.
df.describe()

         view_count    like_count  comment_count  subscriber_count
count  1.680600e+04  1.680600e+04   16806.000000      1.680600e+04
mean   4.968143e+05  2.287032e+04     444.499107      1.826776e+06
std    5.889492e+06  1.342751e+05    1136.138859      2.556994e+06
min    0.000000e+00  0.000000e+00       0.000000      3.250000e+03
25%    2.495475e+04  1.171000e+03      56.000000      1.020000e+06
50%    6.673950e+04  3.628500e+03     183.000000      1.270000e+06
75%    1.956605e+05  1.145325e+04     457.000000      1.820000e+06
max    4.190372e+08  9.036811e+06   45356.000000      1.370000e+07


#### Feature Engineering

In [29]:
# Engagement features derived from the raw counts. Each denominator is offset
# by 1 because the dataset contains videos with 0 views and 0 comments; the
# offset keeps every ratio finite instead of producing inf/NaN.
views_plus_one = df['view_count'] + 1
comments_plus_one = df['comment_count'] + 1

# 1. Like-to-View Ratio (Approval Score): likes per view.
df['like_ratio'] = df['like_count'] / views_plus_one

# 2. Comment-to-View Ratio (Engagement Score): comments per view.
df['comment_ratio'] = df['comment_count'] / views_plus_one

# 3. Like-to-Comment Ratio (Discussion Score): likes per comment.
df['discussion_ratio'] = df['like_count'] / comments_plus_one

print("Successfully created 3 new feature columns!")

# Show the first rows again, now including the new columns. The frame is the
# cell's last expression so Jupyter renders one table; print() wraps a frame
# this wide into several plain-text column blocks.
print("\n--- Data Head (with new 'ratio' columns) ---")
df.head()

Successfully created 3 new feature columns!

--- Data Head (with new 'ratio' columns) ---
                                         video_title  view_count  like_count  \
0  Giant Chakra | Diwali Special | ഈ ചക്രം കറങ്ങു...      267981       11664   
1  Giant Flower Pot | Diwali Special | ആന മേശപ്പൂ...      318282       14182   
2  ഞങ്ങൾക്ക് ഒരു മകൾ ജനിച്ചു | Our First Baby | M...     1125747       28505   
3  Robo War | Red Vs Blue | Who Will Be The Winne...      582108       13896   
4  Giant Avoli Fish Grill | ആവോലിമീനെ ചുട്ടത് | M...      917272       24545   

   comment_count channel_name  subscriber_count  like_ratio  comment_ratio  \
0            305      M4 Tech          13700000    0.043525       0.001138   
1            271      M4 Tech          13700000    0.044558       0.000851   
2           1274      M4 Tech          13700000    0.025321       0.001132   
3            341      M4 Tech          13700000    0.023872       0.000586   
4            551      M4 Tech          

In [31]:
# Re-display the first five rows to check the three ratio columns. A bare last
# expression renders as one table instead of print()'s wrapped plain text.
df.head()

                                         video_title  view_count  like_count  \
0  Giant Chakra | Diwali Special | ഈ ചക്രം കറങ്ങു...      267981       11664   
1  Giant Flower Pot | Diwali Special | ആന മേശപ്പൂ...      318282       14182   
2  ഞങ്ങൾക്ക് ഒരു മകൾ ജനിച്ചു | Our First Baby | M...     1125747       28505   
3  Robo War | Red Vs Blue | Who Will Be The Winne...      582108       13896   
4  Giant Avoli Fish Grill | ആവോലിമീനെ ചുട്ടത് | M...      917272       24545   

   comment_count channel_name  subscriber_count  like_ratio  comment_ratio  \
0            305      M4 Tech          13700000    0.043525       0.001138   
1            271      M4 Tech          13700000    0.044558       0.000851   
2           1274      M4 Tech          13700000    0.025321       0.001132   
3            341      M4 Tech          13700000    0.023872       0.000586   
4            551      M4 Tech          13700000    0.026759       0.000601   

   discussion_ratio  
0         38.117647  
1     

In [33]:
# Re-check the schema after feature engineering: the three ratio columns
# should be listed as float64 with no nulls.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16806 entries, 0 to 16805
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   video_title       16806 non-null  object 
 1   view_count        16806 non-null  int64  
 2   like_count        16806 non-null  int64  
 3   comment_count     16806 non-null  int64  
 4   channel_name      16806 non-null  object 
 5   subscriber_count  16806 non-null  int64  
 6   like_ratio        16806 non-null  float64
 7   comment_ratio     16806 non-null  float64
 8   discussion_ratio  16806 non-null  float64
dtypes: float64(3), int64(4), object(2)
memory usage: 1.2+ MB
