In [1]:

import nltk
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
# Download the 'punkt_tab' resource explicitly
nltk.download('punkt_tab') # This line ensures the 'punkt_tab' resource is downloaded

# Load dataset
file_path = r'C:\nlp\DataNeuron_Text_Similarity.csv'  # Update the correct path if needed
try:
    df = pd.read_csv(file_path)
    assert {'text1', 'text2'}.issubset(df.columns), "Dataset must contain 'text1' and 'text2' columns."
except Exception as e:
    print(f"Error loading dataset: {e}")
    exit()

print("Dataset Loaded. First few rows:")
print(df.head())

# Text Cleaning Function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if pd.isna(text): return ""  # Handle missing values
    text = text.lower()
    text = re.sub(f"[{string.punctuation}]", "", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

# Apply text cleaning
df['clean_text1'] = df['text1'].apply(clean_text)
df['clean_text2'] = df['text2'].apply(clean_text)

# TF-IDF Vectorization & Cosine Similarity
vectorizer = TfidfVectorizer()
tfidf_matrix1 = vectorizer.fit_transform(df['clean_text1'])
tfidf_matrix2 = vectorizer.transform(df['clean_text2'])

tfidf_cosine_sim = np.array([
    cosine_similarity(tfidf_matrix1[i], tfidf_matrix2[i])[0][0] 
    for i in range(tfidf_matrix1.shape[0])
])

# SBERT Model for Sentence Embeddings
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
embeddings1 = sbert_model.encode(df['clean_text1'].tolist(), convert_to_numpy=True)
embeddings2 = sbert_model.encode(df['clean_text2'].tolist(), convert_to_numpy=True)

sbert_cosine_sim = np.array([
    cosine_similarity([embeddings1[i]], [embeddings2[i]])[0][0] 
    for i in range(len(embeddings1))
])

# Average Both Scores for Final Similarity
df['similarity_score'] = (tfidf_cosine_sim + sbert_cosine_sim) / 2

# Save Model & Vectorizer
with open('similarity_model.pkl', 'wb') as f:
    pickle.dump(sbert_model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print("Model and vectorizer saved successfully.")
print("First few results:")
print(df[['text1', 'text2', 'similarity_score']].head())

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Dataset Loaded. First few rows:
                                               text1  \
0  broadband challenges tv viewing the number of ...   
1  rap boss arrested over drug find rap mogul mar...   
2  player burn-out worries robinson england coach...   
3  hearts of oak 3-2 cotonsport hearts of oak set...   
4  sir paul rocks super bowl crowds sir paul mcca...   

                                               text2  
0  gardener wins double in glasgow britain s jaso...  
1  amnesty chief laments war failure the lack of ...  
2  hanks greeted at wintry premiere hollywood sta...  
3  redford s vision of sundance despite sporting ...  
4  mauresmo opens with victory in la amelie maure...  


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Model and vectorizer saved successfully.
First few results:
                                               text1  \
0  broadband challenges tv viewing the number of ...   
1  rap boss arrested over drug find rap mogul mar...   
2  player burn-out worries robinson england coach...   
3  hearts of oak 3-2 cotonsport hearts of oak set...   
4  sir paul rocks super bowl crowds sir paul mcca...   

                                               text2  similarity_score  
0  gardener wins double in glasgow britain s jaso...          0.127056  
1  amnesty chief laments war failure the lack of ...          0.089235  
2  hanks greeted at wintry premiere hollywood sta...          0.136216  
3  redford s vision of sundance despite sporting ...          0.083083  
4  mauresmo opens with victory in la amelie maure...          0.121336  
