# In [None]:
import numpy as np
import pandas as pd
import re
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk

# Download tokenizer data.
# Older NLTK releases read the Punkt sentence models from the 'punkt' package,
# while recent releases (3.9+) load them from 'punkt_tab' instead. Fetching
# both keeps word_tokenize working regardless of the installed NLTK version;
# already-present packages are simply reported as up to date.
nltk.download('punkt')
nltk.download('punkt_tab')

# Load dataset
df = pd.read_csv("your_dataset.csv")  # Replace with actual dataset file

# Text Preprocessing
def preprocess(text):
    """Normalize a raw answer into lowercase, punctuation-free text.

    Args:
        text: The value to clean. Normally a string; missing values
            (``None``, NaN, ``pd.NA``) and other scalars such as numbers
            are accepted because CSV columns frequently contain them.

    Returns:
        The lowercased text with every run of non-word characters collapsed
        to a single space and surrounding whitespace removed. Missing values
        yield an empty string.
    """
    if not isinstance(text, str):
        # Empty CSV cells arrive as float NaN; treat them as "no text"
        # instead of crashing on the missing .lower() method.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)
    text = text.lower()  # Lowercase
    text = re.sub(r'\W+', ' ', text)  # Remove special characters
    return text.strip()

# Clean every answer once; both similarity approaches read this column
df['cleaned_text'] = df['answer_column'].map(preprocess)

# --------- Approach 1: TF-IDF + Cosine Similarity ---------
# Learn the vocabulary and IDF weights from the cleaned answers and keep the
# resulting document-term matrix alongside the fitted vectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df['cleaned_text'])

def compute_tfidf_similarity(text1, text2):
    """Return the cosine similarity of two texts in TF-IDF space.

    Both texts are projected with the module-level ``tfidf_vectorizer``,
    which must already be fitted; words outside its vocabulary are ignored.
    """
    # Vectorize the pair together; each row is weighted and normalized
    # independently, so this matches transforming the texts one by one.
    pair_matrix = tfidf_vectorizer.transform([text1, text2])
    # Slice (rather than index) so each operand stays a 2-D single-row matrix
    scores = cosine_similarity(pair_matrix[0:1], pair_matrix[1:2])
    return scores[0][0]

# --------- Approach 2: Word2Vec Embeddings + Cosine Similarity ---------
# Split each cleaned answer into a list of word tokens (one list per row)
tokenized_texts = list(map(word_tokenize, df['cleaned_text']))

# Train Word2Vec on the tokenized answers; min_count=1 keeps even words that
# occur only once in the corpus
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

# Get sentence embedding (average of word vectors)
def get_sentence_embedding(text):
    """Return the mean Word2Vec vector of the words in ``text``.

    The text is first passed through ``preprocess`` so that it is tokenized
    the same way as the corpus the model was trained on (lowercase, no
    punctuation). Without this, raw input such as "The" or "mat." would never
    match the lowercase vocabulary. ``preprocess`` is idempotent, so passing
    already-cleaned text gives the same result as before.

    Args:
        text: The sentence to embed.

    Returns:
        A 1-D NumPy array whose length equals the model's vector size: the
        average of the vectors of all in-vocabulary words, or an all-zero
        vector when none of the words are known to the model.
    """
    keyed_vectors = word2vec_model.wv
    tokens = word_tokenize(preprocess(text))
    word_vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not word_vectors:
        # Size the fallback from the model itself so it stays correct if the
        # training vector_size is ever changed.
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(word_vectors, axis=0)

def compute_word2vec_similarity(text1, text2):
    """Return the cosine similarity of two texts' averaged Word2Vec embeddings.

    Each text is embedded with ``get_sentence_embedding`` and the two
    resulting vectors are compared as single-row matrices.
    """
    first = get_sentence_embedding(text1)
    second = get_sentence_embedding(text2)
    # cosine_similarity expects 2-D inputs, so present each vector as one row
    scores = cosine_similarity(first.reshape(1, -1), second.reshape(1, -1))
    return scores[0][0]

# --------- Example Usage ---------
# Two sentences with similar meaning but little word overlap
text1 = "The cat is sleeping on the mat."
text2 = "A feline is resting on the carpet."

# Score the pair with each approach and report the result as it is computed
tfidf_score = compute_tfidf_similarity(text1, text2)
print("TF-IDF Similarity:", tfidf_score)

word2vec_score = compute_word2vec_similarity(text1, text2)
print("Word2Vec Similarity:", word2vec_score)
