In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Ensure the required NLTK resources are available.
# 'punkt_tab' is fetched alongside 'punkt': newer NLTK releases (3.9 and later)
# load the Punkt sentence tokenizer used by nltk.word_tokenize from the
# 'punkt_tab' package, and raise LookupError if only 'punkt' is installed.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize necessary components
# A set gives O(1) membership checks when filtering tokens.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise raw text into a space-separated string of lemmatized tokens.

    The text is lowercased and tokenized with ``nltk.word_tokenize``. Tokens
    that are not purely alphabetic (punctuation, numbers, mixed tokens) or
    that appear in ``stop_words`` are dropped, and the remaining tokens are
    passed through ``lemmatizer.lemmatize``.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, as pandas uses for
        empty cells) are treated as empty text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces; ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # Missing cells reach this function through Series.apply; without this
    # guard `.lower()` would raise AttributeError on None / NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Tokenize and lowercase
    tokens = nltk.word_tokenize(text.lower())
    # Keep alphabetic, non-stopword tokens only, then lemmatize them
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return ' '.join(tokens)

# Sample corpus: each entry is one regulation clause (title followed by its text).
laws_data = [
    "Prohibition on sale of food articles coated with mineral oil: No person shall sell or offer or expose for sale or have in his premises for the purpose of sale under any description, food articles which have been coated with mineral oil, except where the addition of mineral oil is permitted in accordance with the standards laid down in these Regulations and Food Safety and Standards (Food Products Standards and Food Additives) regulations, 2011.",
    "Restriction on sale of Carbia Callosa and Honey dew.:Carbia Callosa and Honey dew shall be sold only in sealed containers bearing Agmark seal.",
    "Food resembling but not pure honey not be marketed as honey: No person shall use the word ‘honey’ or any word, mark, illustration or device that suggests honey on the label or any package of, or in any advertisement for, any food that resembles honey but is not pure honey.",
    "Product not to contain any substance which may be injurious to health: Tobacco and nicotine shall not be used as ingredients in any food products.",
    "Prohibition of use of carbide gas in ripening of fruits: No person shall sell or offer or expose for sale or have in his premises for the purpose of sale under any description, fruits which have been artificially ripened by use of acetylene gas, commonly known as carbide gas.",
]

# Pair every clause with a 1-based identifier, in corpus order.
laws_df = pd.DataFrame(
    {
        'id': list(range(1, len(laws_data) + 1)),
        'text': laws_data,
    }
)

# Preprocess the text
laws_df['processed_text'] = laws_df['text'].apply(preprocess_text)

# Vectorize the text using TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(laws_df['processed_text'])


def find_relevant_laws(query, laws, fitted_vectorizer, law_matrix):
    """Return the rows of ``laws`` that are similar to ``query``, best match first.

    Parameters
    ----------
    query : str
        Free-text user query. It is normalised with ``preprocess_text`` so it
        goes through the same pipeline as the law texts.
    laws : pandas.DataFrame
        Frame whose rows correspond positionally to the rows of ``law_matrix``.
    fitted_vectorizer : TfidfVectorizer
        Vectorizer that has already been fitted on the processed law texts.
    law_matrix : sparse matrix
        TF-IDF matrix of the law texts produced by ``fitted_vectorizer``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``laws`` whose cosine similarity to the query is greater
        than zero, ordered by descending similarity. Empty when no law has a
        positive similarity.
    """
    # Convert the query to a TF-IDF vector in the fitted vocabulary
    query_vector = fitted_vectorizer.transform([preprocess_text(query)])

    # Cosine similarity between the query and every law (1-D array, one score per row)
    similarities = cosine_similarity(query_vector, law_matrix).flatten()

    # Stable sort on the negated scores: descending similarity, and laws with
    # equal scores keep their original order (argsort()[::-1] leaves the
    # order of ties unspecified).
    ranked_indices = (-similarities).argsort(kind='stable')

    # Keep only laws with non-zero similarity
    has_match = similarities[ranked_indices] > 0
    return laws.iloc[ranked_indices[has_match]]


# User query
user_query = "honey"

# Retrieve the relevant laws, most similar first
relevant_laws = find_relevant_laws(user_query, laws_df, vectorizer, tfidf_matrix)

# Display the results
print(relevant_laws[['id', 'text']])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


   id                                               text
2   3  Food resembling but not pure honey not be mark...
1   2  Restriction on sale of Carbia Callosa and Hone...


In [2]:
user_query = "fruits"

# Run the query through the same normalisation as the law texts, then project
# it into the TF-IDF space of the already-fitted vectorizer.
processed_query = preprocess_text(user_query)
query_vector = vectorizer.transform([processed_query])

# One cosine-similarity score per law, as a 1-D array.
similarity_matrix = cosine_similarity(query_vector, tfidf_matrix)
cosine_similarities = similarity_matrix.flatten()

# Positions of the laws, from most to least similar.
ascending_indices = cosine_similarities.argsort()
ranked_indices = ascending_indices[::-1]

# Laws reordered by rank.
top_laws = laws_df.iloc[ranked_indices]

# Drop every law whose similarity to the query is not positive.
ranked_scores = cosine_similarities[ranked_indices]
is_relevant = ranked_scores > 0
relevant_laws = top_laws[is_relevant]

# Show the id and text of each remaining law.
result_columns = ['id', 'text']
print(relevant_laws[result_columns])

   id                                               text
4   5  Prohibition of use of carbide gas in ripening ...
