In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import difflib
import spacy
import scipy
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from nltk.stem.porter import *
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



In [2]:
# Load the original data
# Product table: keep only the columns used later; missing values become a sentinel string.
product_columns = ["product_id", "description", "brand_category"]
full_data = pd.read_csv("Full data.csv")[product_columns].fillna('UNKNOWN_TOKEN')
# Outfit combination table, with the same sentinel for missing values.
combination = pd.read_csv("outfit_combinations.csv", encoding = 'utf-8').fillna('UNKNOWN_TOKEN')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Discard products whose description was missing (replaced by the sentinel when the data was loaded).
full_data.drop(full_data[(full_data['description'] == 'UNKNOWN_TOKEN')].index, inplace = True)
# Collapse the numbered accessory slots 'accessory1', 'accessory2' and 'accessory3' into 'accessory'.
# A single .loc assignment writes to the frame itself; chained indexing
# (`combination.outfit_item_type[mask] = ...`) may write to a temporary copy instead.
is_numbered_accessory = combination['outfit_item_type'].isin(['accessory1', 'accessory2', 'accessory3'])
combination.loc[is_numbered_accessory, 'outfit_item_type'] = 'accessory'

In [4]:
# Remove exact duplicate rows in place, then display the de-duplicated product table.
full_data.drop_duplicates(inplace = True)
full_data

Unnamed: 0,product_id,description,brand_category
0,01DSRPSZTDW2PGK1YWYXJGKZZ0,Vintage Fitness leather sneakers with logo pri...,TheMensStore/Shoes/Sneakers/LowTop
2,01DPGV8TGRAB993PF7Z3YWG2VR,A Timeless Leather Belt Crafted From Smooth Co...,Accessories
3,01DSR8G3F7DBRTMP8THF97XSQ2,Pretty ruffle sleeves and trim elevate essenti...,"JustKids/Girls214/Girls/SwimwearCoverups,JustK..."
4,01DSR8G5GP519DEDCSKBMWQVK5,Versatile convertible gown with elephant applique,JustKids/Baby024months/InfantGirls/FootiesRompers
5,01DSR8GF05981EG88DHBETKXMR,From the Savage Love Collection. Fingerless kn...,JewelryAccessories/Accessories/Gloves
...,...,...,...
42368,01DT5110V2ME7BY3JPCAJ91QRW,Mélange beige and cream wool Button fastenings...,Clothing / Coats / Long
42369,01DV6M2FXMPW9RSZWSXW1EK75W,"Cream georgette Ties at neck, concealed hook f...",Clothing / Tops / Blouses
42370,01DV71D0XMBM4VVJK54F2HD3ZG,Sand cotton-corduroy Concealed hook and zip fa...,Clothing / Skirts / Mini
42371,01DV72R28K2A1AN5G167S6QRWF,Although mom jeans and boyfriend jeans are all...,women:CLOTHING:JEANS


# Renumber the rows 0..n-1 after the row drops above.
# drop=True discards the old index instead of inserting it as an extra 'index' column,
# which also keeps this statement safe to run more than once.
full_data.reset_index(drop = True, inplace = True)

In [5]:
def regex_cleaning(description):
    """
    Normalise a raw product description into a cleaned, lemmatized string

    Steps:
    1. Replace every character other than word characters, '+', '%' and '.' with a space
    2. Replace each "<number>% <word>" phrase with HIGH_MATERIAL (>= 80),
       MEDIAN_MATERIAL (>= 40) or LOW_MATERIAL (otherwise)
    3. Replace every remaining number with NUMERICAL_VALUE
    4. Keep tokens of at least two characters, lower-case them and drop English stopwords
    5. Lemmatize each token with WordNet, using its POS tag when one maps to WordNet

    Parameters:
    arg1(string): The description of the product

    Return the cleaned description as one space-separated string
    """
    ## Map the first letter of an NLTK POS tag to the matching WordNet POS
    wordnet_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    ## Function to lemmatize
    def lemmatize_sentence(sentence):
        lemmatizer = WordNetLemmatizer()
        res_words = []
        for word, nltk_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
            tag = wordnet_pos.get(nltk_tag[:1])
            if tag is None:
                # No WordNet equivalent for this tag: keep the token unchanged
                res_words.append(word)
            else:
                res_words.append(lemmatizer.lemmatize(word, tag))
        return " ".join(res_words)

    ## Function to turn one "<number>% <word>" match into its level token
    def material_level(match):
        share = int(match.group(1))
        if share >= 80:
            level = "HIGH"
        elif share >= 40:
            level = "MEDIAN"
        else:
            level = "LOW"
        return "{}_MATERIAL".format(level)

    ## A set gives constant-time stopword lookups
    stopword_des = set(stopwords.words('english'))

    # Regex cleaning for description
    des = re.sub(r'[^\w+0-9%.]', ' ', description)
    ## Replace all numbers with % with text description (HIGH/MEDIAN/LOW).
    ## The level is computed from the very match being replaced, so a percentage can never
    ## be paired with the wrong phrase, and a literal "MATERIAL" in the text is left alone.
    des = re.sub(r"(\d+)%\s(\w+)", material_level, des)
    ## Replace the remaining numbers (decimals first, then integers)
    des = re.sub(r"\d+\.\d+|\d+", "NUMERICAL_VALUE", des)
    ## Remove stopwords, unnecessary punctuations and lower cases
    kept_words = []
    for word in re.findall(r'\b[a-zA-Z0-9_]{2,}\b', des):
        lowered = word.lower()
        if lowered not in stopword_des:
            kept_words.append(lowered)
    updated_des = " ".join(kept_words)
    ## Lemmatize the description
    updated_des = lemmatize_sentence(updated_des)

    return updated_des

In [6]:
# Clean every product description, keeping the row order of full_data.
description_clean = [regex_cleaning(raw_text) for raw_text in full_data['description']]
    

# Create Sentence Vector of each Description in Full Data

In [7]:
# Tokenize each cleaned description; every token list is one training sentence.
docs = list(map(word_tokenize, description_clean))
# Train 500-dimensional word vectors; min_count = 1 keeps every word that occurs at least once.
model = Word2Vec(docs, size = 500, window = 5, min_count = 1)

In [8]:
# Represent each description by the average of its word vectors, stored with shape (1, 500).
product_vector = []
for cleaned_text in description_clean:
    tokens = cleaned_text.split(" ")
    vector_sum = np.zeros((1,500))
    for token in tokens:
        # Words missing from the vocabulary contribute a random normal vector instead.
        if token not in model.wv.vocab:
            vector_sum += np.random.normal(size=500)
        else:
            vector_sum += model.wv.get_vector(token)
    product_vector.append(vector_sum/len(tokens))

# Fuzzywuzzy

In [9]:
# Regex for 5 categories, cleaning the full data
top = r"\b(top|tops|shirt|shirts|coat|coats|jacket|jackets|blouse|blouses|blazer|sweaters|sweater|knitwear|bodysuits|polos)\b"
bottom = r"\b(bottom|trousers|pant|pants|jeans|shorts|skirts|leggings|legging|camisoles and chemises)\b"
one_piece = r"\b(one-piece|jumpsuits|rompers|dungarees|dresses|pajamas_intimates|pajamas)\b"
# The pattern is split across lines with adjacent raw-string literals. A backslash-newline inside
# a raw string is kept as literal text, so the earlier continuation produced the single alternative
# "suits\<newline><spaces>galoshes", which matched neither word. "galoshes" is restored here;
# "suits" (never matched before, and not footwear) stays out of the shoe pattern.
shoe = (r"\b(shoe|shoes|sneakers|cleats|high heels|boot|boots|mukluks|flip-flops|"
        r"galoshes|clogs|loafers|pumps|moccasins|sandals|skates|waders|zoris|mule)\b")
accessory = r"\b(accessory|accessories|sunglasses|bag|bags|handbag|handbags|scarf|scarfs|scarves|belt|belts|necklaces|jewelry|brooch|hat|hats|cap|caps)\b"
useless_regax = "homehitech|swimwear|coverups|saksbeautyplace|toys|justkids|themensstore"

full = full_data.copy()

full["brand_category"] = full["brand_category"].apply(lambda x: x.lower())
full["description"] = full["description"].apply(lambda x: str(x).lower())

# Flag rows whose category or description mentions an out-of-scope department, then drop them.
useless_hits = full["brand_category"].str.findall(useless_regax) + full["description"].str.findall(useless_regax)
full["useless_regax"] = useless_hits.apply(lambda x: "yes" if x != [] else [])

# .copy() makes the filtered frame independent, so the column assignments below
# do not trigger SettingWithCopyWarning.
full = full[full["useless_regax"] != "yes"].copy()

# One "<name>_result" column per category: [label] when the brand category matches, [] otherwise.
category_patterns = [
    ("top", top),
    ("bottom", bottom),
    ("one piece", one_piece),
    ("shoe", shoe),
    ("accessory", accessory),
]
for label, pattern in category_patterns:
    column = "{}_result".format(label.replace(" ", "_"))
    hits = full["brand_category"].str.findall(pattern)
    full[column] = hits.apply(lambda x, label=label: [label] if x != [] else [])

# Concatenate the per-category lists into the list of all categories matched by each row.
labels = full["top_result"] + full["bottom_result"] + full["one_piece_result"] + full["shoe_result"] + full["accessory_result"]
full["result"] = labels.apply(lambda x: "{}".format(x) if x != [] else 0)

# Keep only rows that match exactly one category, so every label in `final` is a clean
# category name. Slicing the stringified list (x[2:-2]) produced malformed labels such as
# "top', 'bottom" for any multi-category row that was not explicitly excluded.
keep = (labels.apply(len) == 1) & (full["description"] != "nan")
final = full.loc[keep, ["product_id", "description", "brand_category"]].copy()
final["result"] = labels[keep].apply(lambda x: x[0])
final = final.reset_index(drop = True)

# Create a function to output the desired outfit combination

We handle the input in two different scenarios.

If the input is a product ID
1. Compare the input ID with every product ID in the combination data using Fuzzywuzzy.
2. If the best Fuzzywuzzy score is at least 80, take the best-matching product ID and one of its outfit IDs as the reference.
    2.1. Output the outfit combination corresponding to that outfit ID.
3. If the best Fuzzywuzzy score is below 80
    3.1. Turn to the full data and find the product of the same type as the input with the highest Fuzzywuzzy score.
    3.2. Take the description of that product and pass it, together with the input type, to the second scenario.
    
If the input is a description
1. Compare the input description with every product description using cosine similarity.
2. Take the product with the highest score (< 0.99999) and find a corresponding outfit ID (one of them if there are several).
3. Output the outfit combination.


In [10]:
def cos_sim_recommend(full_data, product_vector):
    """
    Prompt for a product and print a recommended apparel combination.

    The typed text is split into words. The first word is the item type; the remaining
    words are either a single product ID or a free-text product description.

    Product ID (exactly one word after the type):
        The ID is fuzzy-matched against the product IDs in `combination`. If the best
        score is at least 80, the outfit of the best match is printed and the function
        returns. Otherwise the product of the same type in `final` with the closest ID
        supplies a description, which is then handled like a typed description.

    Description:
        The cleaned description is turned into an averaged word vector and compared by
        cosine similarity with every entry of `product_vector`. Products are visited from
        most to least similar (scores of 0.99999 or more are skipped). The first product
        that appears in `combination` with the requested item type, in an outfit of at
        least 3 rows, determines the outfit. One item per item type is printed.

    Parameters:
    arg1(DataFrame): full_data, whose index and 'product_id' column label the rows of product_vector
    arg2(list): product_vector, one averaged word vector of 500 values per row of full_data

    Uses the notebook-level names `combination`, `final`, `model` and `regex_cleaning`.
    Returns None; the recommendation is printed.
    """
    words = re.findall(r'\b\w+\b', input('please input your product: '))
    if len(words) < 2:
        print('Please enter an item type followed by a product ID or a description.')
        return
    item_type = words[0]
    item_des = " ".join(words[1:])

    if len(words) == 2:
        # A single word after the type is treated as a product ID.
        # The scores go into a sorted copy, so the shared `combination` frame keeps its
        # columns and row order from one call to the next.
        scored = combination.assign(
            score = combination['product_id'].apply(lambda x: fuzz.token_sort_ratio(x, item_des))
        )
        scored = scored.sort_values(by = "score", ascending = False).reset_index(drop = True)
        if not scored.empty and scored.loc[0, "score"] >= 80:
            outfit_id = scored.loc[0, "outfit_id"]
            recommendation = scored.loc[scored["outfit_id"] == outfit_id, ["outfit_id", "product_full_name", "outfit_item_type"]]
            # One line per item type; iterating the unique types avoids repeating a line
            # when an outfit holds several items of the same type.
            for outfittype in recommendation["outfit_item_type"].unique():
                of_type = recommendation[recommendation["outfit_item_type"] == outfittype]
                print("{} :{} ({})".format(outfittype, of_type["product_full_name"].values, of_type["outfit_id"].values))
            # The matched outfit has been printed; do not fall through to the
            # description search, which would treat the ID as a description.
            return
        # No close ID in the outfit data: borrow the description of the closest ID of the same type.
        same_type = final[final["result"] == item_type].copy()
        if same_type.empty:
            print("No product of type '{}' is available.".format(item_type))
            return
        same_type["score"] = same_type["product_id"].apply(lambda x: fuzz.token_sort_ratio(x, item_des))
        item_des = same_type.loc[same_type["score"].idxmax(), "description"]

    # when there is an input, transfer them into word2vec based on the existing word2vec rules (average)
    tokens = regex_cleaning(item_des).split(" ")
    query_vector = np.zeros(500)
    for token in tokens:
        if token in model.wv.vocab:
            query_vector += model.wv.get_vector(token)
        else:
            # Out-of-vocabulary words contribute a random normal vector.
            query_vector += np.random.normal(size = 500)
    query_vector /= len(tokens)

    # calculate cos similarity score
    # Both operands are flattened to 1-D explicitly rather than relying on the distance
    # function to squeeze nested inputs — NOTE(review): newer SciPy releases appear to
    # reject non-1-D input here; confirm against the installed version.
    scores = [
        1 - scipy.spatial.distance.cosine(query_vector, np.ravel(product_vector[row]))
        for row in range(len(full_data))
    ]
    cos_sim = pd.DataFrame(
        {'score': scores, 'product_id': full_data['product_id'].values},
        index = full_data.index,
    )

    # filter out (near-)identical matches, then sort based on the score
    cos_sim = cos_sim[cos_sim.score < 0.99999].sort_values(by = ['score'], ascending = False)

    # only select the outfit_id of the most similar product whose outfit has at least
    # 3 rows and whose item type is the same as the input.
    # The first row of every product and the size of every outfit are looked up once,
    # instead of rescanning `combination` for each candidate product.
    first_use = combination.drop_duplicates(subset = 'product_id').set_index('product_id')
    outfit_sizes = combination['outfit_id'].value_counts()
    fit_id = None
    for product_id in cos_sim['product_id']:
        if product_id not in first_use.index:
            continue
        candidate = first_use.loc[product_id]
        if outfit_sizes.at[candidate['outfit_id']] >= 3 and candidate['outfit_item_type'] == item_type:
            fit_id = candidate['outfit_id']
            break
    if fit_id is None:
        print("No outfit combination found for type '{}'.".format(item_type))
        return

    # keep the first item of each item type (e.g. when an outfit has several accessories)
    outfit = combination[combination.outfit_id == fit_id]
    one_per_type = outfit.drop_duplicates(subset = 'outfit_item_type')
    for _, piece in one_per_type.iterrows():
        print(piece['outfit_item_type'], ':', piece['product_full_name'], '(', piece['outfit_id'],')')


Please run every cell of the Jupyter Notebook, then call the function 'cos_sim_recommend' with the two prepared inputs, full_data and product_vector.

The Notebook processes two input files, 'Full data.csv' and 'outfit_combinations.csv'.

The function outputs a recommended apparel combination based on the input styling.

In [None]:
# Prompts for a product (item type followed by a product ID or a description) and prints the recommendation.
cos_sim_recommend(full_data, product_vector)