In [220]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

In [183]:
products = pd.read_excel('Behold+product+data+04262021.xlsx', encoding = 'latin1') 
additional_tags = pd.read_csv('usc_additional_tags USC.csv', encoding = 'latin1')
brands = pd.read_csv('behold_brands USC.csv', encoding = 'latin1')

In [211]:
tags = additional_tags.groupby(['product_id']).agg(' '.join)

In [203]:
df = pd.merge(products, tags, on = 'product_id', how = 'left')
df = df.fillna('Unknown').astype(str)

In [205]:
top_30 = df['brand'].value_counts().nlargest(30).keys()
df['brand_top30'] = df['brand'].apply(lambda x: x if x in top_30 else 'Other')

In [206]:
df['product_active'] = df['product_active'].apply(lambda x: 1 if True else 0)

In [207]:
df['text'] = df[['brand_category', 'name', 'details', 'description', 'attribute_value']].agg(' '.join, axis = 1)

In [208]:
df['text'] = df['text'].str.lower()

In [209]:
df.head()

Unnamed: 0,product_id,brand,brand_category,name,details,created_at,brand_canonical_url,description,brand_description,brand_name,product_active,product_color_id,attribute_name,attribute_value,brand_top30,text
0,01EX0PN4J9WRNZH5F93YEX6QAF,Two,Unknown,Khadi Stripe Shirt-our signature shirt,Unknown,2021-01-27 01:17:19.305 UTC,https://two-nyc.myshopify.com/products/white-k...,Our signature khadi shirt\navailable in black ...,Our signature khadi shirt\n\navailable in blac...,Khadi Stripe Shirt-our signature shirt,1,Unknown,Unknown,Unknown,Other,unknown khadi stripe shirt-our signature shirt...
1,01F0C4SKZV6YXS3265JMC39NXW,Collina Strada,Unknown,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,Unknown,2021-03-09 18:43:10.457 UTC,https://collina-strada-2.myshopify.com/product...,Mid-length dress with ruffles and adjustable s...,Mid-length dress with ruffles and adjustable s...,RUFFLE MARKET DRESS LOOPY PINK SISTINE TOMATO,1,Unknown,Unknown,Unknown,Collina Strada,unknown ruffle market dress loopy pink sistine...
2,01EY4Y1BW8VZW51BWG5VZY82XW,Cariuma,Unknown,IBI Slip On Raw Red Knit Sneaker Women,Unknown,2021-02-10 02:58:59.591 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,IBI Slip On Raw Red Knit Sneaker Women,1,Unknown,Unknown,Unknown,Other,unknown ibi slip on raw red knit sneaker women...
3,01EY50E27A0P5V6KCW01XPDB43,Cariuma,Unknown,IBI Slip On Black Knit Sneaker Women,Unknown,2021-02-10 03:40:52.842 UTC,https://cariuma.myshopify.com/products/ibi-sli...,IBI Slip On Black Knit Sneaker Women,IBI Slip On Black Knit Sneaker Women,IBI Slip On Black Knit Sneaker Women,1,Unknown,Unknown,Unknown,Other,unknown ibi slip on black knit sneaker women u...
4,01EY6DWHC2W5HPNEGXKEJ4A1CX,Cariuma,Unknown,CATIBA PRO Skate Black Suede and Canvas Contra...,Unknown,2021-02-10 16:55:13.024 UTC,https://cariuma.myshopify.com/products/catiba-...,Unknown,Unknown,CATIBA PRO Skate Black Suede and Canvas Contra...,1,Unknown,Unknown,Unknown,Other,unknown catiba pro skate black suede and canva...


In [212]:
lemmatizer = WordNetLemmatizer()
# function to convert nltk tag to wordnet tag
def nltk_tag_to_wordnet_tag(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lemmatize_sentence(sentence):
    #tokenize the sentence and find the POS tag for each token
    nltk_tagged = nltk.pos_tag(nltk.word_tokenize(sentence))  
    #tuple of (token, wordnet_tag)
    wordnet_tagged = map(lambda x: (x[0], nltk_tag_to_wordnet_tag(x[1])), nltk_tagged)
    lemmatized_sentence = []
    for word, tag in wordnet_tagged:
        if tag is None:
            #if there is no available tag, append the token as is
            lemmatized_sentence.append(word)
        else:        
            #else use the tag to lemmatize the token
            lemmatized_sentence.append(lemmatizer.lemmatize(word, tag))
    return " ".join(lemmatized_sentence)

In [213]:
def text_cleaning(x):
    words = x.split()
    new_words = []
    for word in words:
        if word in stopwords.words('english') + ['unknown']:
            continue
        new_words.append(word)
    cleaned_text = " ".join(new_words)
    lemmatized_text = lemmatize_sentence(cleaned_text)
    return lemmatized_text

In [214]:
df['cleaned_text'] = df['text'].apply(text_cleaning)

In [216]:
df.to_excel('cleaned.xlsx', index = False)

In [168]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold,StratifiedKFold, cross_val_score, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

In [217]:
idf_vectorizer = TfidfVectorizer(ngram_range=(1,2),
                                 max_features=1000,
                                 min_df=5)
X = idf_vectorizer.fit_transform(df['text'].astype(str))
y = df['brand_top30']

In [222]:
kfolds_classification = StratifiedKFold(n_splits = 5, random_state = 0, shuffle = True) 
xgb_classification = xgb.XGBClassifier()
xgb_accuracy_cv = cross_val_score(xgb_classification, X, y, cv = kfolds_classification)
print("XGBoost Classification: \n")
print("Classification error of 10-folds: ",1-xgb_accuracy_cv)
print("Mean classification error:",1-np.mean(xgb_accuracy_cv))



XGBoost Classification: 

Classification error of 10-folds:  [0.04914025 0.04889577 0.04840681 0.05068862 0.04873278]
Mean classification error: 0.04917284654877352
