In [250]:
import re
from collections import Counter
from typing import Dict

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Regex Cleaning & Word Vectorization

In [402]:
def regex_cleaning(description, details, brand, product_full_name, idf_score):
    """
    Clean the four text fields of a product and return its TF-IDF vector.

    Parameters:
    description (str): The description of the product
    details (str): The details of the product
    brand (str): The brand of the product
    product_full_name (str): The full name of the product
    idf_score (str): Location of the CSV file with the IDF scores calculated
        from the training data (passed to pd.read_csv). The first column is
        skipped; each remaining column is a term and row 0 holds its IDF.

    Returns:
    numpy.ndarray: One TF-IDF value per IDF column, in column order. Terms
        that do not occur in the cleaned text get 0.
    """
    lemmatizer = WordNetLemmatizer()
    # First letter of an nltk POS tag -> WordNet POS constant
    wordnet_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    ## Function to remove stopwords
    def remove_stopwords(title: str, stopword_list):
        tokens = nltk.word_tokenize(title)
        return " ".join(token for token in tokens if token not in stopword_list)

    ## Function to remove special characters
    def removeSpecialChars(z):
        # The backslash is escaped explicitly; the removed character set is unchanged
        return z.translate({ord(c): "" for c in "!@#$%^&*()[]{};:,./'<>?\\|`~-=_+"})

    ## Function to lemmatize
    def lemmatize_sentence(sentence):
        res_words = []
        for word, nltk_tag in nltk.pos_tag(nltk.word_tokenize(sentence)):
            tag = wordnet_pos.get(nltk_tag[:1])
            # Words whose tag has no WordNet equivalent are kept unchanged
            res_words.append(word if tag is None else lemmatizer.lemmatize(word, tag))
        return " ".join(res_words)

    ## Function to turn "<number>% <word>" into HIGH/MEDIAN/LOW_MATERIAL
    def percent_label(match):
        value = int(match.group(1))
        if value >= 80:
            level = "HIGH"
        elif value >= 40:
            level = "MEDIAN"
        else:
            level = "LOW"
        return "{}_MATERIAL".format(level)

    ## Define different stopwords lists for the different inputs
    english_stopwords = stopwords.words('english')
    stopword_des = set(english_stopwords + ['.', ''])
    stopword_de = set(english_stopwords + ["", "h", "l", "mm", "w", "x", "xl", "xs", "cm", "uv"])
    stopword_br = set(english_stopwords + ['by', 'and', 'the', 'of', 'for', 'to', 'inc', '', 'a', 'an'])
    stopword_pro = set(english_stopwords + ['&', '|', ',', '.', "'", 'in', 'le', 'mm', 'and', 'with',
                                            'the', 'k', 'x', 'les', 'de', 'of', "'s"])

    # Regex cleaning for description
    des = re.sub(r'[^\w+0-9%.]', ' ', description, flags=re.IGNORECASE)
    ## Replace every "<number>% <word>" with a text level (HIGH/MEDIAN/LOW).
    ## The label is computed from the very match being replaced, so labels
    ## cannot get out of step with the percentages and a literal "MATERIAL"
    ## in the input can no longer cause an IndexError.
    des = re.sub(r"(\d+)(?:%)\s(\w+)", percent_label, des, flags=re.IGNORECASE)
    ## Replace the remaining numbers with a placeholder token
    des = re.sub(r"((\d+\.\d+)|(\d+))", "NUMERICAL_VALUE", des, flags=re.IGNORECASE)
    ## Remove stopwords, unnecessary punctuations and lower cases
    des_words = [
        word.lower()
        for word in re.findall(r'\b[a-zA-Z0-9_]{2,}\b', des)
        if word.lower() not in stopword_des
    ]
    ## Lemmatize the description
    updated_des = lemmatize_sentence(" ".join(des_words))

    # Regex cleaning for details
    ## Remove all the numbers punctuations and stopwords
    details = details.lower()
    details = re.sub(r"[^a-z\s]+", "", details, flags=re.IGNORECASE)
    details = re.sub(r"\r\n", "", details, flags=re.IGNORECASE)
    updated_details = remove_stopwords(details, stopword_de)
    ## Lemmatize the details
    updated_details = lemmatize_sentence(updated_details)

    # Regex cleaning for brand
    ## Lower cases, drop bracketed text and remove special characters
    brand = brand.lower()
    brand = re.sub(r"[\(\[].*?[\)\]]", "", brand)
    brand = removeSpecialChars(brand)
    ## Remove stopwords
    updated_brand = remove_stopwords(brand, stopword_br)
    ## Lemmatize the brand
    updated_brand = lemmatize_sentence(updated_brand)

    # Regex cleaning for product full name
    ## Lower cases
    product_full_name = product_full_name.lower()
    ## Remove hyphen
    product_full_name = re.sub("-", "", product_full_name, flags=re.IGNORECASE)
    ## Remove numbers
    product_full_name = re.sub(r'[0-9]+', "", product_full_name, flags=re.IGNORECASE)
    ## Remove stopwords
    updated_product_full_name = remove_stopwords(product_full_name, stopword_pro)
    ## Lemmatize the product full name
    updated_product_full_name = lemmatize_sentence(updated_product_full_name)

    # Join the four features together and split them into tokens
    result_str = " ".join([updated_des, updated_details, updated_brand, updated_product_full_name])
    tokens = result_str.split(" ")

    # Term frequency: occurrences of a token divided by the number of tokens.
    # Counting once per token (instead of indexing a positional list with a
    # counter that only advanced on known words) keeps every term paired
    # with its own frequency.
    term_counts = Counter(tokens)
    n_tokens = len(tokens)

    ## Calculate TFIDF score: one entry per IDF column, 0 for absent terms
    idf = pd.read_csv(idf_score).iloc[:, 1:]
    tf_idf = [
        (term_counts[col] / n_tokens) * idf.loc[0, col] if col in term_counts else 0
        for col in idf.columns
    ]

    return np.array(tf_idf, dtype=float)

In [409]:
def tagging_final(product, data1, product_lem_vector, multi_threshold=None, single_threshold=None):
    """
    Tag one product by cosine similarity to the already tagged products.

    Parameters:
    product (array): The TF-IDF vector of a single product
    data1 (dataframe): All the tagged products we have; the columns
        'attribute_name' and 'new_attribute_value' are read
    product_lem_vector (list of arrays): The sentence vectors of the tagged
        products, looked up with the index labels of data1
    multi_threshold (float, optional): Minimum similarity for an 'occasion'
        or 'style' tag to be kept. Defaults to the notebook global t1.
    single_threshold (float, optional): Minimum best similarity required to
        assign a 'subcategory_bottom' / 'subcategory_top' tag. Defaults to
        the notebook global t2.

    Returns:
    dataframe: A single row with the columns 'occasion' and 'style' (lists
        of tags, possibly empty) and 'subcategory_bottom' and
        'subcategory_top' (the tag of the most similar product, or 'N/A')
    """
    # Import the function itself: a bare `import scipy` does not reliably
    # expose the scipy.spatial.distance submodule.
    from scipy.spatial.distance import cosine

    # Existing callers set the thresholds as globals; keep that working.
    if multi_threshold is None:
        multi_threshold = t1
    if single_threshold is None:
        single_threshold = t2

    row = {}
    for attr in ['occasion', 'style', 'subcategory_bottom', 'subcategory_top']:
        candidates = data1[data1['attribute_name'] == attr]
        scores = pd.Series(
            [1 - cosine(product, product_lem_vector[index]) for index in candidates.index],
            index=candidates.index,
            dtype=float,
        )
        # Drop (near-)exact matches; NaN scores fail the comparison and are
        # dropped as well.
        scores = scores[scores < 0.999999999]

        if attr in ('occasion', 'style'):
            # Multi-valued attributes: every distinct tag above the threshold,
            # in first-seen order so the result is reproducible between runs.
            matched = scores[scores >= multi_threshold].index
            row[attr] = list(dict.fromkeys(data1.loc[matched, 'new_attribute_value']))
        elif scores.empty or scores.max() < single_threshold:
            # No candidate left, or none similar enough
            row[attr] = 'N/A'
        else:
            # Single-valued attributes: tag of the most similar product
            row[attr] = data1.loc[scores.idxmax(), 'new_attribute_value']

    return pd.DataFrame([row])

In [410]:
# Sentence vectors of the tagged products: read the CSV, drop the
# 'Unnamed: 0' column and keep one numpy array per row
product_lem_vector = list(
    np.array(
        pd.read_csv('combine_lem_tfidf_score.csv').drop(columns=['Unnamed: 0'])
    )
)
# Tagged products (data1)
data1 = pd.read_csv('data1.csv')

In [412]:
# Load the full data, keep the four text fields and replace missing
# values with the placeholder "unknown"
full_data = (
    pd.read_csv('Full data.csv')
    .loc[:, ["name", "brand", "description", "details"]]
    .fillna("unknown")
)
full_data.head()

Unnamed: 0,name,brand,description,details
0,Original Fitness Sneakers,FILA,Vintage Fitness leather sneakers with logo pri...,Leather/synthetic upper\nLace-up closure\nText...
1,HAT,CHANEL,unknown,WOOL TWEED & FELT
2,Petit Oval Buckle Belt,Frame,A Timeless Leather Belt Crafted From Smooth Co...,unknown
3,Little Gir's & Girl's Ariana One-Piece UPF 50+...,Lilly Pulitzer Kids,Pretty ruffle sleeves and trim elevate essenti...,Scoopneck\nAdjustable straps\nFlutter sleeves\...
4,Baby Girl's Endearing Elephants Pima Cotton Co...,Kissy Kissy,Versatile convertible gown with elephant applique,V-neckline\nLong sleeves\nFront snap closure\n...


In [418]:
# Predict the tags of the first N_SAMPLES records of the full data
N_SAMPLES = 500
# Similarity thresholds, read as globals by tagging_final
t1 = 0.6  # minimum similarity for an 'occasion' / 'style' tag to be kept
t2 = 0.6  # minimum best similarity for a subcategory tag, otherwise 'N/A'
# IDF score file read by regex_cleaning; set here so this cell does not
# depend on a cell further down having been run first
idf_score = "idf.csv"

# Collect the one-row results and concatenate once instead of growing a
# DataFrame inside the loop
sample_tags = []
for _, prod in full_data.iloc[:N_SAMPLES].iterrows():
    product = regex_cleaning(
        prod['description'], prod['details'], prod['brand'], prod['name'], idf_score
    )
    sample_tags.append(tagging_final(product, data1, product_lem_vector))

prediction_results = (
    pd.concat(sample_tags, ignore_index=True) if sample_tags else pd.DataFrame()
)

In [427]:
# Dimensions of full_data as (rows, columns)
full_data.shape

(42373, 8)

In [434]:
# Keep only the records that were predicted and attach their tag columns.
# The columns are aligned on the index: both frames are numbered from 0.
full_data = full_data[0:len(prediction_results)].copy()
for tag_column in ["occasion", "style", "subcategory_bottom", "subcategory_top"]:
    full_data[tag_column] = prediction_results[tag_column]

full_data.to_csv("sample_full_data_result.csv")

In [435]:
# Preview the first rows of full_data
full_data.head()

Unnamed: 0,name,brand,description,details,occasion,style,subcategory_bottom,subcategory_top
0,Original Fitness Sneakers,FILA,Vintage Fitness leather sneakers with logo pri...,Leather/synthetic upper\nLace-up closure\nText...,[],[],,
1,HAT,CHANEL,unknown,WOOL TWEED & FELT,"[Weekend, daytonight, weekend, DaytoNight]","[Classic, classic, casual, Casual]",,
2,Petit Oval Buckle Belt,Frame,A Timeless Leather Belt Crafted From Smooth Co...,unknown,"[nightout, daytonight, work, DaytoNight, Weeke...","[glam, Casual, classic, Classic, casual, busin...",,
3,Little Gir's & Girl's Ariana One-Piece UPF 50+...,Lilly Pulitzer Kids,Pretty ruffle sleeves and trim elevate essenti...,Scoopneck\nAdjustable straps\nFlutter sleeves\...,[],[],,
4,Baby Girl's Endearing Elephants Pima Cotton Co...,Kissy Kissy,Versatile convertible gown with elephant applique,V-neckline\nLong sleeves\nFront snap closure\n...,[],[],,


In [448]:
# Tag a single product typed in by the user.
# NOTE: tagging_final reads the thresholds t1 / t2, which are set in the
# sample prediction cell above; run that cell first.

# Input essential features (surrounding whitespace such as a pasted tab is dropped)
description = input("Please input description: ").strip()
details = input("Please input details: ").strip()
brand = input("Please input brand: ").strip()
product_full_name = input("Please input product full name: ").strip()
idf_score = "idf.csv"

result = regex_cleaning(description, details, brand, product_full_name, idf_score)

# Tagged products and their sentence vectors (one numpy array per row)
data1 = pd.read_csv("data1.csv")
product_lem_vector = list(
    np.array(
        pd.read_csv('combine_lem_tfidf_score.csv').drop(columns=['Unnamed: 0'])
    )
)

tagging_final(result, data1, product_lem_vector)

Please input description: unknown	
Please input details: WOOL TWEED & FELT
Please input brand: CHANEL	
Please product full name: HAT


Unnamed: 0,occasion,style,subcategory_bottom,subcategory_top
0,"[Weekend, daytonight, weekend, DaytoNight]","[Classic, classic, casual, Casual]",,
