# 🔥 Viral Content Predictor - YouTube Edition
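This notebook uses YouTube trending-video data to predict whether a video is likely to go viral, defined here as landing in the top 10% of videos by view count. The model uses features such as title length, description length, tag count, title sentiment, and publish hour.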

In [3]:
pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
   ---------------------------------------- 0.0/624.3 kB ? eta -:--:--
   ---------------- ----------------------- 262.1/624.3 kB ? eta -:--:--
   ---------------------------------------- 624.3/624.3 kB 3.3 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [7]:
# 📦 Import Libraries

import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [15]:
# 📂 Load the Dataset
# The CSV location lives in one named constant so it is easy to find and change.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
DATA_PATH = r'C:\Users\hp\Documents\viralpredictor\USvideos.csv'

df = pd.read_csv(DATA_PATH)
# Last expression of the cell: render the first five rows as a sanity check.
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"last week tonight trump presidency|""last week ...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


In [17]:
# 🔍 Basic Exploration
# Print column names, non-null counts and dtypes (info() writes straight to stdout).
df.info()
# Summary statistics for the numeric columns; as the last expression of the
# cell, this table is what the notebook renders as the cell's output.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40949 entries, 0 to 40948
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   video_id                40949 non-null  object
 1   trending_date           40949 non-null  object
 2   title                   40949 non-null  object
 3   channel_title           40949 non-null  object
 4   category_id             40949 non-null  int64 
 5   publish_time            40949 non-null  object
 6   tags                    40949 non-null  object
 7   views                   40949 non-null  int64 
 8   likes                   40949 non-null  int64 
 9   dislikes                40949 non-null  int64 
 10  comment_count           40949 non-null  int64 
 11  thumbnail_link          40949 non-null  object
 12  comments_disabled       40949 non-null  bool  
 13  ratings_disabled        40949 non-null  bool  
 14  video_error_or_removed  40949 non-null  bool  
 15  de

Unnamed: 0,category_id,views,likes,dislikes,comment_count
count,40949.0,40949.0,40949.0,40949.0,40949.0
mean,19.972429,2360785.0,74266.7,3711.401,8446.804
std,7.568327,7394114.0,228885.3,29029.71,37430.49
min,1.0,549.0,0.0,0.0,0.0
25%,17.0,242329.0,5424.0,202.0,614.0
50%,24.0,681861.0,18091.0,631.0,1856.0
75%,25.0,1823157.0,55417.0,1938.0,5755.0
max,43.0,225211900.0,5613827.0,1674420.0,1361580.0


In [19]:
# 🧠 Feature Engineering
# Text-size features. A missing description is treated as empty text (length 0);
# the earlier len(str(x)) counted a NaN as the three characters of "nan", which
# also disagreed with len("") used for an empty description at prediction time.
df['title_length'] = df['title'].str.len()
df['desc_length'] = df['description'].fillna('').astype(str).str.len()

# Tags arrive as one '|'-separated string, so the number of pieces is the tag count.
df['tag_count'] = df['tags'].apply(lambda x: len(str(x).split('|')))

# TextBlob polarity of the title (negative to positive sentiment).
df['sentiment'] = df['title'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)

# Parse the timestamp once, then keep only the hour of day as a feature.
# pd.to_datetime is a no-op on an already-parsed column, so this cell can be re-run.
df['publish_time'] = pd.to_datetime(df['publish_time'])
df['publish_hour'] = df['publish_time'].dt.hour

# Likes plus comments per view. A row with zero views would produce inf or NaN,
# so both are mapped to 0.
# NOTE(review): this feature is derived from views, likes and comment_count,
# which only exist after a video is published, and the is_viral label is itself
# derived from views. predict_viral always supplies 0.0 for it, so the model
# sees a very different distribution at prediction time. Consider dropping it.
df['engagement_rate'] = (
    ((df['likes'] + df['comment_count']) / df['views'])
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

In [20]:
# 🎯 Define Viral Video (Top 10% by Views)
# The 90th-percentile view count is the cut-off; rows at or above it are labelled 1.
threshold = df['views'].quantile(0.90)
df['is_viral'] = df['views'].ge(threshold).astype(int)

# Class balance check: how many rows ended up in each class.
df['is_viral'].value_counts()

is_viral
0    36854
1     4095
Name: count, dtype: int64

In [29]:
# Preview the first rows again, now including the engineered feature columns
# and the is_viral label added by the previous cells.
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,...,ratings_disabled,video_error_or_removed,description,title_length,desc_length,tag_count,sentiment,publish_hour,engagement_rate,is_viral
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13 17:13:01+00:00,SHANtell martin,748374,57527,2966,...,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...,34,1410,1,0.0,17,0.098188,0
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13 07:30:00+00:00,"last week tonight trump presidency|""last week ...",2418783,97185,6146,...,False,False,"One year after the presidential election, John...",62,630,4,0.0,7,0.045431,0
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12 19:05:24+00:00,"racist superman|""rudy""|""mancuso""|""king""|""bach""...",3191434,146033,5339,...,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,53,1177,23,0.0,19,0.048321,0
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13 11:00:04+00:00,"rhett and link|""gmm""|""good mythical morning""|""...",343168,10172,666,...,False,False,Today we find out if Link is a Nickelback amat...,32,1403,27,-0.75,11,0.035895,0
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12 18:01:41+00:00,"ryan|""higa""|""higatv""|""nigahiga""|""i dare you""|""...",2095731,132235,1989,...,False,False,I know it's been a while since we did this sho...,24,636,14,0.0,18,0.071456,0


In [21]:
# 🏗️ Prepare Data for Modeling
# Columns fed to the classifier, in the order the model will see them.
# NOTE(review): 'engagement_rate' is computed from views, the same column the
# is_viral label is derived from, and predict_viral passes 0.0 for it. It likely
# inflates the evaluation scores; confirm whether it should stay.
features = [
    'title_length',
    'desc_length',
    'tag_count',
    'sentiment',
    'publish_hour',
    'engagement_rate',
]
X, y = df[features], df['is_viral']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# 🤖 Train a Random Forest Classifier
# 100 trees with a fixed seed for a repeatable fit. fit() returns the estimator
# itself, so construction and training can be chained.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard 0/1 predictions on the held-out rows, consumed by the evaluation cell.
y_pred = model.predict(X_test)

In [31]:
# 📊 Evaluation
# Confusion matrix first (rows are true classes, columns are predicted classes),
# followed by per-class precision, recall and F1 on the held-out split.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

[[7328   42]
 [  54  766]]
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7370
           1       0.95      0.93      0.94       820

    accuracy                           0.99      8190
   macro avg       0.97      0.96      0.97      8190
weighted avg       0.99      0.99      0.99      8190



In [34]:
from textblob import TextBlob
import numpy as np
import pandas as pd

def predict_viral(title, description, tags, publish_hour, threshold=0.7):
    """
    Predict whether a YouTube video will go viral.

    Builds the notebook's hand-crafted features for a single video and scores
    them with the fitted global ``model``.

    Parameters:
    - title: str
    - description: str (None is treated as empty text)
    - tags: comma-separated str (None is treated as empty text)
    - publish_hour: int (0–23)
    - threshold: float (default: 0.7), minimum probability to classify as viral

    Returns:
    - is_viral: bool
    - score: float, probability of the viral class rounded to 2 decimals

    Raises:
    - ValueError: if publish_hour is outside 0–23, or if the model was trained
      on a feature this function does not compute
    """
    # Normalise the text inputs so a missing value behaves like empty text
    # instead of raising inside len() or TextBlob.
    title = "" if title is None else str(title)
    description = "" if description is None else str(description)
    tags = "" if tags is None else str(tags)

    publish_hour = int(publish_hour)
    if not 0 <= publish_hour <= 23:
        raise ValueError(f"publish_hour must be between 0 and 23, got {publish_hour}")

    # Extract features
    features = {
        "title_length": len(title),
        "desc_length": len(description),
        "tag_count": len(tags.split(",")),
        "sentiment": TextBlob(title).sentiment.polarity,
        "publish_hour": publish_hour,
        # Placeholder: an unpublished video has no likes or comments yet. The
        # column is dropped below when the model was not trained on it.
        "engagement_rate": 0.0,
    }

    # Convert to DataFrame
    input_df = pd.DataFrame([features])

    # Align the frame with the columns (and column order) the model was fitted
    # on, so a model trained on a different feature list still gets a matching input.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None:
        trained_columns = list(trained_columns)
        missing = [name for name in trained_columns if name not in features]
        if missing:
            raise ValueError(f"model expects features that are not computed here: {missing}")
        input_df = input_df[trained_columns]

    # Predict probability. Look the viral class (label 1) up in classes_ rather
    # than assuming it is the second probability column.
    viral_column = list(model.classes_).index(1)
    score = float(model.predict_proba(input_df)[0][viral_column])

    # Cast to built-in types so the return values match the documented bool/float.
    is_viral = bool(score >= threshold)

    return is_viral, round(score, 2)


In [36]:
# Score one hypothetical video using the default 0.7 probability threshold.
demo_video = {
    "title": "10 Genius Kitchen Hacks 👩‍🍳",
    "description": "Save time and cook smarter with these hacks.",
    "tags": "kitchen,cooking,hacks",
    "publish_hour": 19,
}
is_viral, score = predict_viral(**demo_video)

# Turn the boolean verdict into a readable label before printing.
verdict = "🔥 Viral" if is_viral else "❌ Not Viral"
print("Prediction:", verdict)
print("Score:", score)


Prediction: ❌ Not Viral
Score: 0.21
