## Pre-Processing

In [1]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Function to preprocess text
def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Convert text to lowercase
    lowercased_text = text.lower()
    
    # Remove punctuation from text
    punctuation_removed_text = re.sub(r'[^\w\s]', '', lowercased_text)
    
    # Remove stop words from tokens
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    # Remove special characters and numbers from text
    special_characters_removed_text = re.sub(r'[^a-zA-Z\s]', '', punctuation_removed_text)
    
    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    # Expand contractions in text
    contractions = {
        "don't": "do not",
        "can't": "cannot",
        # Add more contractions as needed
    }
    expanded_text = text
    for contraction, expansion in contractions.items():
        expanded_text = expanded_text.replace(contraction, expansion)
    
    # Remove extra whitespace from text
    cleaned_text = re.sub(' +', ' ', special_characters_removed_text)
    
    preprocessed_text = {
        "tokens": tokens,
        "lowercased_text": lowercased_text,
        "punctuation_removed_text": punctuation_removed_text,
        "filtered_tokens": filtered_tokens,
        "special_characters_removed_text": special_characters_removed_text,
        "lemmatized_tokens": lemmatized_tokens,
        "expanded_text": expanded_text,
        "cleaned_text": cleaned_text
    }
    
    return cleaned_text


[nltk_data] Downloading package punkt to /home/anshal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/anshal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/anshal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/anshal/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
dataframe = pd.read_csv("fashion_data.csv")
dataframe.head()

Unnamed: 0,link,image_url,text
0,https://www.bewakoof.com/p/mens-beige-camoufla...,https://images.bewakoof.com/t640/men-s-beige-c...,Men's Beige and Blue Camouflage Hooded Jacket-...
1,https://www.bewakoof.com/p/mens-black-hooded-j...,https://images.bewakoof.com/t640/men-s-black-h...,Men's Black Hooded Jacket-Front Bewakoof
2,https://www.bewakoof.com/p/mens-black-hooded-j...,https://images.bewakoof.com/t640/men-s-black-h...,Men's Black Hooded Jacket-Front Bewakoof
3,https://www.bewakoof.com/p/mens-green-hooded-s...,https://images.bewakoof.com/t640/men-s-green-h...,Men's Green Hooded Sweatshirt-Front Bewakoof
4,https://www.bewakoof.com/p/mens-brown-color-bl...,https://images.bewakoof.com/t640/men-s-brown-c...,Men's Brown Color Block Hooded Sweatshirt-Fron...


In [3]:
database_sequences = dataframe["text"].map(preprocess_text)

In [4]:
database_sequences[-13334:10]

0    mens beige and blue camouflage hooded jacketfr...
1              mens black hooded jacketfront bewakoof 
2              mens black hooded jacketfront bewakoof 
3          mens green hooded sweatshirtfront bewakoof 
4    mens brown color block hooded sweatshirtfront ...
5    mens yellow all over jaipuri printed shirtfron...
6    mens grey color block hooded sweatshirtfront b...
7    mens grey color block hooded sweatshirtfront b...
8    mens blue striped hooded sweatshirtfront bewak...
9    mens black graphic printed hooded sweatshirtfr...
Name: text, dtype: object

In [5]:
# Input sentence
input_sentence = "white striped T-shirt"
input_sentence = preprocess_text(input_sentence)

# Vectorize the sentences
vectorizer = TfidfVectorizer()
sentence_vectors = vectorizer.fit_transform(database_sequences.values.tolist())
input_vector = vectorizer.transform([input_sentence])

In [6]:
input_vector.dtype

dtype('float64')

In [7]:
# Calculate cosine similarity
cosine_similarities = cosine_similarity(input_vector, sentence_vectors).flatten()

# Get the indices of the most similar sentences
most_similar_indices = cosine_similarities.argsort()[:-6:-1]

# Get the most similar sentences
most_similar_sentences = dataframe['text'].iloc[most_similar_indices]

print(most_similar_sentences)

7357    Women's White Striped T-shirt-Front Bewakoof 
8127    Women's White Striped T-shirt-Front Bewakoof 
6567    Women's White Striped T-shirt-Front Bewakoof 
5837    Women's White Striped T-shirt-Front Bewakoof 
9797    Women's White Striped T-shirt-Front Bewakoof 
Name: text, dtype: object


In [8]:
most_similar_indices

array([7357, 8127, 6567, 5837, 9797])

In [9]:
test = dataframe.iloc[most_similar_indices]

In [10]:
data = dataframe[["link", "image_url"]]

In [11]:
data

Unnamed: 0,link,image_url
0,https://www.bewakoof.com/p/mens-beige-camoufla...,https://images.bewakoof.com/t640/men-s-beige-c...
1,https://www.bewakoof.com/p/mens-black-hooded-j...,https://images.bewakoof.com/t640/men-s-black-h...
2,https://www.bewakoof.com/p/mens-black-hooded-j...,https://images.bewakoof.com/t640/men-s-black-h...
3,https://www.bewakoof.com/p/mens-green-hooded-s...,https://images.bewakoof.com/t640/men-s-green-h...
4,https://www.bewakoof.com/p/mens-brown-color-bl...,https://images.bewakoof.com/t640/men-s-brown-c...
...,...,...
9995,https://www.bewakoof.com/p/sleeveless-denim-ja...,https://images.bewakoof.com/t640/sleeveless-ic...
9996,https://www.bewakoof.com/p/stay-motivated-stri...,https://images.bewakoof.com/t640/stay-motivate...
9997,https://www.bewakoof.com/p/cream-ww-women-half...,https://images.bewakoof.com/t640/women-s-half-...
9998,https://www.bewakoof.com/p/peach-ww-women-half...,https://images.bewakoof.com/t640/women-s-half-...


In [12]:
dataframe.shape

(10000, 3)

In [13]:
unique_df = dataframe.drop_duplicates(subset='text')

In [23]:
unique_df.shape

(51, 3)

In [27]:
json_val = unique_df.to_json(orient='records', force_ascii=False)

In [28]:
json_val

'[{"link":"https:\\/\\/www.bewakoof.com\\/p\\/mens-beige-camouflage-hooded-jacket-34","image_url":"https:\\/\\/images.bewakoof.com\\/t640\\/men-s-beige-camouflage-hooded-jacket-545776-1664353175-1.jpg","text":"Men\'s Beige and Blue Camouflage Hooded Jacket-Front Bewakoof "},{"link":"https:\\/\\/www.bewakoof.com\\/p\\/mens-black-hooded-jacket-30","image_url":"https:\\/\\/images.bewakoof.com\\/t640\\/men-s-black-hooded-jacket-545773-1664351838-1.jpg","text":"Men\'s Black Hooded Jacket-Front Bewakoof "},{"link":"https:\\/\\/www.bewakoof.com\\/p\\/mens-green-hooded-sweatshirt-men","image_url":"https:\\/\\/images.bewakoof.com\\/t640\\/men-s-green-hooded-sweatshirt-545926-1664439533-1.jpg","text":"Men\'s Green Hooded Sweatshirt-Front Bewakoof "},{"link":"https:\\/\\/www.bewakoof.com\\/p\\/mens-brown-color-block-hooded-sweatshirt","image_url":"https:\\/\\/images.bewakoof.com\\/t640\\/men-s-brown-color-block-hooded-sweatshirt-545940-1664439496-1.jpg","text":"Men\'s Brown Color Block Hooded Swe