In [None]:
import pandas as pd

# Read the labelled headline dataset into a DataFrame.
df = pd.read_csv("/content/clickbait_data.csv")

# Only the last expression of a cell is rendered, so a bare `df.head()` in the
# middle of the cell is evaluated and thrown away. Display it explicitly so the
# preview actually appears above the schema summary.
display(df.head())
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32000 entries, 0 to 31999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   headline   32000 non-null  object
 1   clickbait  32000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 500.1+ KB


In [None]:
# Per-column count of missing values (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0,0
headline,0
clickbait,0


In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw headline strings; the target is the `clickbait` column.
X, y = df["headline"], df["clickbait"]

# Hold out 20% of the rows, keep the class ratio equal in both parts
# (stratify=y) and fix the seed so the split is repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Sanity check: sizes of the training and test partitions.
(X_train.shape, X_test.shape)

((25600,), (6400,))

In [None]:
import spacy

# The cleaning step only reads lexical flags (is_punct, is_space, like_num) and
# lemmas, so the dependency parser and the named-entity recogniser are never
# consulted. Disabling them avoids running those components on every headline.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Words that must survive stop-word filtering even though the loaded spaCy
# pipeline lists them as stop words by default.
keep_words = {
    "you", "your", "this", "that", "what", "how",
    "not", "no", "never",
}

# Build a fresh set from the pipeline defaults minus the words to keep; the
# set difference leaves `nlp.Defaults.stop_words` itself unmodified.
custom_stopwords = set(nlp.Defaults.stop_words) - keep_words

In [None]:
# Show the size and a short alphabetical sample instead of dumping every
# entry of the set into the cell output.
len(custom_stopwords), sorted(custom_stopwords)[:20]

{"'d",
 "'ll",
 "'m",
 "'re",
 "'s",
 "'ve",
 'a',
 'about',
 'above',
 'across',
 'after',
 'afterwards',
 'again',
 'against',
 'all',
 'almost',
 'alone',
 'along',
 'already',
 'also',
 'although',
 'always',
 'am',
 'among',
 'amongst',
 'amount',
 'an',
 'and',
 'another',
 'any',
 'anyhow',
 'anyone',
 'anything',
 'anyway',
 'anywhere',
 'are',
 'around',
 'as',
 'at',
 'back',
 'be',
 'became',
 'because',
 'become',
 'becomes',
 'becoming',
 'been',
 'before',
 'beforehand',
 'behind',
 'being',
 'below',
 'beside',
 'besides',
 'between',
 'beyond',
 'both',
 'bottom',
 'but',
 'by',
 'ca',
 'call',
 'can',
 'cannot',
 'could',
 'did',
 'do',
 'does',
 'doing',
 'done',
 'down',
 'due',
 'during',
 'each',
 'eight',
 'either',
 'eleven',
 'else',
 'elsewhere',
 'empty',
 'enough',
 'even',
 'ever',
 'every',
 'everyone',
 'everything',
 'everywhere',
 'except',
 'few',
 'fifteen',
 'fifty',
 'first',
 'five',
 'for',
 'former',
 'formerly',
 'forty',
 'four',
 'from',
 'fron

In [None]:
import re

def clean_text(text):
    """Normalise one headline into a space-separated string of tokens.

    The text is lower-cased, URL-like fragments (``http...`` / ``www...``) are
    removed, and the result is tokenised with the module-level spaCy pipeline
    ``nlp``. Each token is then handled as follows:

    * punctuation and whitespace tokens are dropped;
    * tokens whose text is in ``custom_stopwords`` are dropped;
    * number-like tokens are kept as their surface text;
    * every other token is replaced by its stripped lemma (skipped if empty).

    Args:
        text: The raw headline string.

    Returns:
        The kept tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)

    tokens = []
    for token in nlp(text):
        if token.is_punct or token.is_space:
            continue

        # NOTE(review): the sample run in this notebook cleans "Won’t" to
        # "will", i.e. the negation part of the contraction is lost even
        # though "not" is in keep_words -- presumably the contraction token is
        # still a stop word; confirm whether it should be kept as well.
        if token.text in custom_stopwords:
            continue

        # Numbers are kept verbatim rather than lemmatised. A second
        # `like_num` check used to follow this branch; it could never run
        # because this branch always continues, so it has been removed.
        if token.like_num:
            tokens.append(token.text)
            continue

        lemma = token.lemma_.strip()
        if lemma:
            tokens.append(lemma)

    return " ".join(tokens)

In [None]:
# Hand-written headlines used to eyeball what the cleaning step produces.
sample_headlines = [
    "10 Shocking Things You Won’t Believe Doctors Say!",
    "Government releases new economic growth data",
    "This simple trick can save you thousands",
]

separator = "-" * 50
for headline in sample_headlines:
    cleaned_headline = clean_text(headline)
    print("ORIGINAL:", headline)
    print("CLEANED :", cleaned_headline)
    print(separator)


ORIGINAL: 10 Shocking Things You Won’t Believe Doctors Say!
CLEANED : 10 shocking thing you will believe doctor
--------------------------------------------------
ORIGINAL: Government releases new economic growth data
CLEANED : government release new economic growth datum
--------------------------------------------------
ORIGINAL: This simple trick can save you thousands
CLEANED : this simple trick save you thousand
--------------------------------------------------


In [None]:
# Run the same cleaning function over every headline, training part first
# and test part second.
X_train_clean, X_test_clean = (
    part.apply(clean_text) for part in (X_train, X_test)
)

In [None]:
# Compare the first three training headlines before and after cleaning.
rule = "-" * 60
for row in range(3):
    print("ORIGINAL:", X_train.iloc[row])
    print("CLEANED :", X_train_clean.iloc[row])
    print(rule)

ORIGINAL: Can You Identify The Drink When Out Of Its Bottle
CLEANED : you identify drink bottle
------------------------------------------------------------
ORIGINAL: This 3-Year-Old Knows Way More About The Periodic Table Than You
CLEANED : this 3 year old know way periodic table you
------------------------------------------------------------
ORIGINAL: Ava DuVernay Would Like To Remind You Diversity Is Not That Hard
CLEANED : ava duvernay like remind you diversity not that hard
------------------------------------------------------------


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams. Terms seen in fewer than 2 documents
# (min_df=2) or in more than 95% of documents (max_df=0.95) are ignored.
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.95)

In [None]:
# The vocabulary and IDF weights are learned from the training text only;
# the test text is merely projected onto that fitted vocabulary.
X_train_tfidf = tfidf.fit_transform(X_train_clean)
X_test_tfidf = tfidf.transform(X_test_clean)


In [None]:
# Names of the learned TF-IDF features, in column order.
feature_names = tfidf.get_feature_names_out()

# Peek at the first 30 entries only.
first_features = feature_names[:30]
print(first_features)


['00' '00 disney' '00 girl' '00 kid' '00 pop' '00 teen' '00 that' '000'
 '000 job' '000 people' '000 strand' '000 troop' '000 year' '00s'
 '00s disney' '05' '08' '08 bundesliga' '08 uefa' '09' '10' '10 000'
 '10 2008' '10 celebrity' '10 day' '10 delicious' '10 insanely' '10 life'
 '10 million' '10 minute']


In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the TF-IDF features; the iteration cap is raised to
# 1000 and all available cores are requested.
model = LogisticRegression(max_iter=1000, n_jobs=-1)

# `fit` returns the estimator, so the fitted model is shown as the cell output.
model.fit(X_train_tfidf, y_train)


In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Predicted labels for the held-out TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy followed by the per-class precision / recall / F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)
print(classification_report(y_test, y_pred))


Accuracy: 0.95875
              precision    recall  f1-score   support

           0       0.94      0.97      0.96      3200
           1       0.97      0.94      0.96      3200

    accuracy                           0.96      6400
   macro avg       0.96      0.96      0.96      6400
weighted avg       0.96      0.96      0.96      6400



In [None]:
# Pair up the feature names with the first (and, for this two-class model,
# only) row of learned weights so they can be ranked in the following cells.
feature_names = tfidf.get_feature_names_out()
coefficients = model.coef_[0]


In [None]:
# Rank every (feature, weight) pair from the largest weight downwards and keep
# the 20 strongest ones.
ranked_desc = sorted(
    zip(feature_names, coefficients), key=lambda pair: pair[1], reverse=True
)
top_clickbait = ranked_desc[:20]

for term, weight in top_clickbait:
    print(f"{term:<25} {weight:.3f}")


you                       16.319
this                      9.871
your                      9.272
that                      6.806
people                    6.668
what                      6.584
thing                     6.227
how                       6.037
2015                      5.795
17                        5.514
21                        5.361
19                        4.749
actually                  4.524
photo                     4.061
23                        4.050
good                      4.005
guy                       3.927
18                        3.895
15                        3.740
time                      3.699


In [None]:
# Rank every (feature, weight) pair from the smallest (most negative) weight
# upwards and keep the first 20.
ranked_asc = sorted(zip(feature_names, coefficients), key=lambda pair: pair[1])
top_non_clickbait = ranked_asc[:20]

for term, weight in top_non_clickbait:
    print(f"{term:<25} {weight:.3f}")


kill                      -4.447
die                       -3.717
obama                     -3.504
uk                        -3.450
china                     -3.134
dead                      -3.002
report                    -2.984
court                     -2.938
win                       -2.936
iraq                      -2.934
australian                -2.863
crash                     -2.837
charge                    -2.786
president                 -2.775
police                    -2.693
arrest                    -2.606
launch                    -2.576
fire                      -2.536
leader                    -2.521
iran                      -2.430


In [None]:
def predict_clickbait(headline):
    """Classify a single headline with the fitted TF-IDF + model pipeline.

    The headline goes through ``clean_text`` and the already-fitted ``tfidf``
    vectoriser before being scored by ``model``.

    Args:
        headline: Raw headline text.

    Returns:
        A ``(label, probability)`` tuple. ``label`` is the model's predicted
        class; ``probability`` is the value in column index 1 of
        ``predict_proba``, i.e. the score of the model's second class rather
        than the confidence in whichever label was predicted.
    """
    features = tfidf.transform([clean_text(headline)])

    label = model.predict(features)[0]
    second_class_probability = model.predict_proba(features)[0][1]

    return label, second_class_probability


In [None]:
# Ad-hoc headlines for a quick manual check of the prediction helper.
tests = [
    "10 shocking things you won’t believe doctors say",
    "Government releases quarterly economic report",
    "This simple trick can save you money",
    "Prime Minister meets foreign delegation",
]

divider = "-" * 60
for headline in tests:
    label, prob = predict_clickbait(headline)
    print(headline)
    print(f"Clickbait: {label}, Confidence: {prob:.2f}")
    print(divider)


10 shocking things you won’t believe doctors say
Clickbait: 1, Confidence: 0.99
------------------------------------------------------------
Government releases quarterly economic report
Clickbait: 0, Confidence: 0.01
------------------------------------------------------------
This simple trick can save you money
Clickbait: 1, Confidence: 0.98
------------------------------------------------------------
Prime Minister meets foreign delegation
Clickbait: 0, Confidence: 0.11
------------------------------------------------------------


In [None]:
import numpy as np

# Boolean mask over the test set marking rows where the prediction is wrong.
misclassified_mask = y_test != y_pred
misclassified = X_test[misclassified_mask]

# Select the true labels with the SAME mask. Indexing `y_test.iloc[i]` directly
# walks the full test set by position, so it reports the label of the i-th
# test row instead of the label of the i-th misclassified headline.
misclassified_actual = y_test[misclassified_mask]

# `head(5)` also copes with fewer than five errors, where a fixed `range(5)`
# would raise an IndexError.
for headline, actual in zip(misclassified.head(5), misclassified_actual.head(5)):
    print("Headline:", headline)
    print("Actual:", actual)
    pred, prob = predict_clickbait(headline)
    print("Predicted:", pred, "Confidence:", prob)
    print("-" * 60)


Headline: Congrats, Denzel Washington! Actor To Receive The Cecil B. DeMille Award
Actual: 1
Predicted: 0 Confidence: 0.24263393496295363
------------------------------------------------------------
Headline: Kate Middleton Has A Strange Wave And It Needs To Be Addressed
Actual: 1
Predicted: 0 Confidence: 0.4958266345186503
------------------------------------------------------------
Headline: Salted Coffee: Is It Less Bitter
Actual: 1
Predicted: 0 Confidence: 0.36904531066872726
------------------------------------------------------------
Headline: Can Americans Pass The UK Driving Test
Actual: 1
Predicted: 0 Confidence: 0.11739413877264794
------------------------------------------------------------
Headline: Gwen Stefani Adds Fire To Those Blake Shelton Dating Rumors: "He's Hot"
Actual: 1
Predicted: 0 Confidence: 0.4583991957167225
------------------------------------------------------------
