<a href="https://colab.research.google.com/github/Anjal08/Compare-Vectorisation-method-/blob/main/CompareVectorisation_CountVectoriser_Tfidf_word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import random

# Pool of topics that generate_sentence() samples from (15 entries).
podcast_topics = [
    "technology",
    "health",
    "startup journey",
    "motivation",
    "science",
    "relationships",
    "productivity",
    "education",
    "business",
    "finance",
    "mental health",
    "travel",
    "sports",
    "history",
    "self improvement",
]

def generate_sentence(rng=None, topics=None):
    """Build one synthetic podcast-style sentence.

    The result has the shape ``"<subject> <topic>, <detail> <ending>"``; each
    of the four parts is picked with ``choice`` from its own pool, in that
    order.

    Args:
        rng: Optional source of randomness exposing a ``choice`` method, for
            example ``random.Random(seed)``. When omitted, the module-level
            ``random`` functions are used, exactly as before this parameter
            existed.
        topics: Optional sequence of topic strings to sample from. When
            omitted, the module-level ``podcast_topics`` list is used.

    Returns:
        str: The generated sentence.
    """
    chooser = random if rng is None else rng
    topic_pool = podcast_topics if topics is None else topics

    subjects = [
        "In today's episode", "Our guest explains", "Let’s talk about",
        "One of the major topics is", "A common question we get is",
        "People often wonder", "Today we will discuss", "Here’s something interesting about"
    ]
    details = [
        "how this impacts our daily lives",
        "why this matters more today than ever",
        "the challenges people face",
        "the latest research findings",
        "common mistakes everyone makes",
        "a real story that illustrates this point"
    ]
    endings = [
        "so stay tuned.", "let's dive deep.", "and this changes everything.",
        "which many people don’t realize.", "and it's fascinating to explore."
    ]

    # Draw order (subject, topic, detail, ending) is kept identical to the
    # original so a seeded global RNG still yields the same sentences.
    return f"{chooser.choice(subjects)} {chooser.choice(topic_pool)}, {chooser.choice(details)} {chooser.choice(endings)}"

# Seed the global RNG first: generate_sentence() draws from the `random`
# module, so without a fixed seed every Restart & Run All would produce a
# different corpus (and different downstream numbers).
random.seed(42)

# generate 10,000 rows
data = [generate_sentence() for _ in range(10_000)]

df = pd.DataFrame({"text": data})
df.head(), df.shape


(                                                text
 0  One of the major topics is education, why this...
 1  Today we will discuss sports, a real story tha...
 2  Today we will discuss health, a real story tha...
 3  Let’s talk about finance, how this impacts our...
 4  Here’s something interesting about relationshi...,
 (10000, 1))

In [None]:
# Persist the generated corpus to disk, leaving out the integer row index.
df.to_csv(path_or_buf="podcast_10k.csv", index=False)


In [None]:
# google.colab only exists inside a Colab runtime; guard the import so the
# notebook still runs top-to-bottom in a plain Jupyter kernel.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    # Hands the CSV to the browser as a download.
    files.download("podcast_10k.csv")
else:
    print("google.colab not available; skipping browser download of podcast_10k.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Row count of the corpus (first entry of the frame's shape).
df.shape[0]


10000

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from gensim.models import Word2Vec


In [None]:
# Quick sanity check before modelling: first five rows plus (rows, columns).
(df.head(n=5), df.shape)


(                                                text
 0  One of the major topics is education, why this...
 1  Today we will discuss sports, a real story tha...
 2  Today we will discuss health, a real story tha...
 3  Let’s talk about finance, how this impacts our...
 4  Here’s something interesting about relationshi...,
 (10000, 1))

In [None]:
# NOTE: these labels are coin flips that do not depend on the text in any way,
# so a classifier trained on them is expected to score about 0.5 accuracy
# regardless of how the text is vectorised.
# A seeded generator keeps the labels (and the downstream scores) identical
# across kernel restarts.
df["label"] = (np.random.default_rng(42).random(len(df)) > 0.5).astype(int)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state pins the
# shuffle so the partition is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    *(df[column] for column in ("text", "label")),
    test_size=0.2,
    random_state=42,
)


In [None]:
# Bag-of-words counts: the vocabulary is learned on the training split only
# and then reused to encode the held-out split.
cv = CountVectorizer()
X_train_cv, X_test_cv = cv.fit_transform(X_train), cv.transform(X_test)

# fit() returns the estimator itself, so construction and training can chain.
clf = LogisticRegression(max_iter=200).fit(X_train_cv, y_train)
cv_pred = clf.predict(X_test_cv)

acc_cv = accuracy_score(y_true=y_test, y_pred=cv_pred)
print("CountVectorizer Accuracy:", acc_cv)


CountVectorizer Accuracy: 0.5015


In [None]:
# TF-IDF weights: vocabulary and IDF statistics come from the training split
# only; the held-out split is encoded with those fitted statistics.
tfidf = TfidfVectorizer()
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

# fit() returns the estimator itself, so construction and training can chain.
clf = LogisticRegression(max_iter=200).fit(X_train_tfidf, y_train)
tfidf_pred = clf.predict(X_test_tfidf)

acc_tfidf = accuracy_score(y_true=y_test, y_pred=tfidf_pred)
print("TF-IDF Accuracy:", acc_tfidf)


TF-IDF Accuracy: 0.5015


In [None]:
def tokenize(text):
    """Lower-case ``text`` and split it on runs of whitespace.

    Punctuation is not stripped, so it stays attached to the adjacent word
    (``"sports,"`` remains a single token).

    Args:
        text: The string to tokenise.

    Returns:
        list[str]: The lower-cased whitespace-separated tokens.
    """
    lowered = text.lower()
    return lowered.split()

# Tokenise both splits the same way: training texts first, then held-out texts.
X_train_tokens, X_test_tokens = (
    [tokenize(document) for document in texts] for texts in (X_train, X_test)
)


In [None]:
# Train embeddings on the training tokens only.
# seed + workers=1 make training repeatable: with several worker threads the
# update order depends on OS scheduling, so results drift between runs.
# (Across interpreter launches gensim additionally requires PYTHONHASHSEED
# to be fixed for bit-identical vectors.)
w2v_model = Word2Vec(
    sentences=X_train_tokens,
    vector_size=100,
    window=5,
    min_count=1,
    seed=42,
    workers=1,
)


In [None]:
def sentence_vector(tokens, model):
    """Average the embeddings of the tokens that ``model`` knows about.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        numpy.ndarray: Element-wise mean of the looked-up vectors, or a zero
        vector of length ``model.vector_size`` when no token is found.
    """
    found = []
    for token in tokens:
        # Tokens absent from the embedding table are skipped.
        if token in model.wv:
            found.append(model.wv[token])

    if not found:
        return np.zeros(model.vector_size)
    return np.asarray(found).mean(axis=0)


In [None]:
# One averaged embedding per sentence, stacked into a 2-D feature matrix;
# the training split is processed first, then the held-out split.
X_train_w2v, X_test_w2v = (
    np.array([sentence_vector(sentence, w2v_model) for sentence in token_lists])
    for token_lists in (X_train_tokens, X_test_tokens)
)


In [None]:
# Same classifier as for the sparse features, here on the dense averaged
# embeddings with a larger iteration budget.
# fit() returns the estimator itself, so construction and training can chain.
clf = LogisticRegression(max_iter=500).fit(X_train_w2v, y_train)
w2v_pred = clf.predict(X_test_w2v)

acc_w2v = accuracy_score(y_true=y_test, y_pred=w2v_pred)
print("Word2Vec Accuracy:", acc_w2v)


Word2Vec Accuracy: 0.4895


In [None]:
# Side-by-side summary of the three test accuracies, one per line.
print(
    "\n=== FINAL PERFORMANCE COMPARISON ===",
    f"CountVectorizer : {acc_cv:.4f}",
    f"TF-IDF          : {acc_tfidf:.4f}",
    f"Word2Vec avg    : {acc_w2v:.4f}",
    sep="\n",
)



=== FINAL PERFORMANCE COMPARISON ===
CountVectorizer : 0.5015
TF-IDF          : 0.5015
Word2Vec avg    : 0.4895
