# Multilabel nap classification

In [2]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import json
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

#from xgboost import XGBClassifier
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

## Reading the data

### Load some unlabelled processed data

In [205]:
with open('../data/processed/clean_nap_products.json') as data_file:    
    nap = json.load(data_file)

In [206]:
nap[0]

{'brand_name': 'burberry',
 'brand_original_name': 'Burberry',
 'id': '5b013e8c4500d65c4d2fa40d',
 'nap_id': '987293',
 'product_category': 'Clothing / Skirts / Midi',
 'product_description': "Burberry's September '17 collection debuted in the same east London venue as Christopher Bailey's 'Here We Are' exhibit - a collection of curated photographs detailing life in 20th Century England. In the same midi shape popular in the '30s and '40s, this polka-dot skirt is pintucked and pleated along the slit sides to accentuate the fluid silk fabric. Balance its ladylike feel with chunky boots or sneakers.",
 'product_name': 'Pintucked polka-dot silk midi skirt'}

In [207]:
len(nap)

9612

In [208]:
unlabelled_data = [p['product_description'] for p in nap]

unlabelled_data = [p for p in unlabelled_data if type(p) is str]
unlabelled_data = [p.lower() for p in unlabelled_data]

In [209]:
unlabelled_data[0]

"burberry's september '17 collection debuted in the same east london venue as christopher bailey's 'here we are' exhibit - a collection of curated photographs detailing life in 20th century england. in the same midi shape popular in the '30s and '40s, this polka-dot skirt is pintucked and pleated along the slit sides to accentuate the fluid silk fabric. balance its ladylike feel with chunky boots or sneakers."

###### in total  we have 9612 unlabelled data

### Load some labelled data

In [211]:
df = pd.read_hdf('../data/df_nap_labelled_updated.h5', 'table')

In [212]:
df.drop_duplicates('product_id',inplace=True)

In [213]:
df.shape

(1468, 6)

In [214]:
df.head()

Unnamed: 0,product_id,designer_original_name,product_name,comb_desc,From,label
0,5b013e8c4500d65c4d2fa41b,Valentino,Printed wool and silk-blend shorts,print wool short cut slightly loose fit nonstr...,nap,[romantic]
1,5b013e8d4500d65c4d2fa425,Valentino,Striped camouflage-print cotton-gabardine jacket,striped jacket intend loose fit nonstretchy fa...,nap,[street]
2,5b013e8e4500d65c4d2fa43d,Valentino,Leather trench coat,leather trench coat design oversized fit cut w...,nap,"[classic, romantic]"
3,5b013e8e4500d65c4d2fa445,Burberry,Dawson cashmere turtleneck sweater,cashmere sweater design loose fit chunky knit ...,nap,"[classic, minimalistic]"
4,5b013e8e4500d65c4d2fa446,Alexander McQueen,Embellished pleated leather midi skirt,leather skirt design wear waist supple nonstre...,nap,[gothic]


In [234]:
'metallic striped wool and cashmere-blend sweater' in df['comb_desc']

False

###### in total we have 1468 data

## Multiclass classifcation (predict classes independently)

### create classes

In [215]:
# prepare a k hot df containing the labels
# this is convenient to train k independent binary classifiers
labels = df['label']

mlb = MultiLabelBinarizer()

labels_khot = mlb.fit_transform(labels)

y = pd.DataFrame(labels_khot)

y.columns = mlb.classes_

In [216]:
y.head()

Unnamed: 0,artsy,athleisure,bohemian,business,casual,chic,classic,edgy,exotic,glam,...,punk,rocker,romantic,sartorial,sexy,sophisticated,street,tailored,vintage,western
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### create train test split

In [114]:
indexes = list(range(1468))

In [121]:
train, test = train_test_split(indexes, train_size=0.8, random_state=2018)



In [146]:
X = df['product_name'].str.lower()

In [222]:
X = df['comb_desc']

In [223]:
X_train = X.iloc[train]
X_test = X.iloc[test]
y_train = y.iloc[train]
y_test = y.iloc[test]

### Pipeline

In [202]:
### Construct some pipelines
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words='english')),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

pipe_lr_tfidf = Pipeline([('tf', TfidfVectorizer()), ('clf', OneVsRestClassifier(LogisticRegression(random_state=42)))])

pipe_lr = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                    ('clf', OneVsRestClassifier(LogisticRegression(random_state=42)))])

pipe_rf = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                    ('clf', RandomForestClassifier(random_state=42))])

pipe_svm = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('clf', OneVsRestClassifier(LinearSVC(random_state=42)))])

pipe_ada = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('ada', OneVsRestClassifier(AdaBoostClassifier()))])

pipe_xgb = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),
                     ('xgb', OneVsRestClassifier(XGBClassifier()))])

### Set grid search params for pipelines

In [247]:
param_range = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
param_range_fl = [1.0, 0.5, 0.1]

grid_params_lr = [{'clf__estimator__penalty': ['l1', 'l2'],
                    'clf__estimator__C': range(1,50,10)}] 

grid_params_rf = [{'clf__criterion': ['gini', 'entropy'],
                    'clf__min_samples_leaf': range(1,20,1),
                    'clf__max_depth': param_range,
                    'clf__min_samples_split': param_range[1:]}]

grid_params_svm = [{'clf__estimator__C': param_range}]

grid_params_ada = [{'ada__estimator__n_estimators': [50,100,200]}]

grid_params_xgb = [{'xgb__estimator__n_estimators': [50,100,200, 300, ],}]

### Create grid searches

In [248]:
# Construct grid searches
jobs = -1
gs_lr = GridSearchCV(estimator=pipe_lr,
        param_grid=grid_params_lr,
        scoring='f1_micro',
        cv=5) 

gs_rf = GridSearchCV(estimator=pipe_rf,
        param_grid=grid_params_rf,
        scoring='f1_micro',
        cv=5, 
        n_jobs=jobs)

gs_svm = GridSearchCV(estimator=pipe_svm,
        param_grid=grid_params_svm,
        scoring='f1_micro',
        cv=5)

gs_ada = GridSearchCV(
        estimator=pipe_ada,
        param_grid=grid_params_ada,
        scoring='f1_micro',
        cv=5, 
        n_jobs=jobs)

gs_xgb = GridSearchCV(estimator=pipe_xgb,
        param_grid=grid_params_xgb,
        scoring='f1_micro',
        cv=5, 
        n_jobs=jobs)

In [225]:
searches = [gs_lr_tf_idf, gs_lr, gs_rf, gs_svm, gs_ada, gs_xgb]

In [264]:
gs_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
 ...te=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1))]),
       fit_params=None, iid=True, n_jobs=1,
       param_grid=[{'clf__estimator__penalty': ['l1', 'l2'], 'clf__estimator__C': range(1, 50, 10)}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='f1_micro', verbose=0)

In [None]:
prediction = gs_rf.predict(X_test)

In [None]:
print(classification_report(y_test, prediction))

In [None]:
gs_xgb = GridSearchCV(estimator=pipe_xgb,
        param_grid=grid_params_xgb,
        scoring='recall_micro',
        cv=10, 
        n_jobs=jobs)

### psuedo

In [291]:
def psuedo_fitter(model):
    model.fit(X_train, y_train)
    psuedo_predictions = model.predict(unlabelled_data)
    copy_train = X_train.copy()
    psuedo_train = copy_train.append(pd.Series(unlabelled_data))
    psuedo_preds = pd.DataFrame(psuedo_predictions)
    psuedo_preds.columns = mlb.classes_
    psuedo_y = pd.concat([y_train, psuedo_preds])
    model.fit(psuedo_train, psuedo_y)
    preds = model.predict(X_test)
    print(classification_report(y_test,preds))

In [292]:
psuedo_fitter(gs_lr)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


             precision    recall  f1-score   support

          0       0.22      0.11      0.14        46
          1       0.57      0.32      0.41        37
          2       0.00      0.00      0.00         6
          3       0.39      0.33      0.36        33
          4       0.44      0.45      0.44       100
          5       0.20      0.17      0.19        29
          6       0.38      0.38      0.38        26
          7       0.20      0.12      0.15        17
          8       0.46      0.33      0.39        18
          9       0.32      0.43      0.36        14
         10       0.00      0.00      0.00         3
         11       0.00      0.00      0.00         3
         12       0.17      0.21      0.18        34
         13       0.25      0.12      0.17         8
         14       0.50      0.27      0.35        11
         15       0.50      0.25      0.33         8
         16       0.51      0.50      0.50        60
         17       0.12      0.08      0.10   

  'precision', 'predicted', average, warn_for)


In [293]:
psuedo_fitter(gs_svm)

             precision    recall  f1-score   support

          0       0.33      0.17      0.23        46
          1       0.54      0.38      0.44        37
          2       0.50      0.17      0.25         6
          3       0.25      0.18      0.21        33
          4       0.44      0.43      0.43       100
          5       0.16      0.14      0.15        29
          6       0.38      0.42      0.40        26
          7       0.25      0.12      0.16        17
          8       0.64      0.39      0.48        18
          9       0.38      0.43      0.40        14
         10       0.00      0.00      0.00         3
         11       0.00      0.00      0.00         3
         12       0.15      0.18      0.16        34
         13       0.00      0.00      0.00         8
         14       1.00      0.18      0.31        11
         15       0.50      0.12      0.20         8
         16       0.54      0.47      0.50        60
         17       0.20      0.08      0.11   

  'precision', 'predicted', average, warn_for)


In [295]:
psuedo_fitter(gs_ada)

             precision    recall  f1-score   support

          0       0.28      0.11      0.16        46
          1       0.60      0.32      0.42        37
          2       0.00      0.00      0.00         6
          3       0.38      0.24      0.30        33
          4       0.51      0.36      0.42       100
          5       0.23      0.24      0.24        29
          6       0.30      0.35      0.32        26
          7       0.00      0.00      0.00        17
          8       0.41      0.39      0.40        18
          9       0.26      0.36      0.30        14
         10       0.00      0.00      0.00         3
         11       0.00      0.00      0.00         3
         12       0.21      0.24      0.22        34
         13       0.25      0.12      0.17         8
         14       0.40      0.18      0.25        11
         15       0.40      0.25      0.31         8
         16       0.47      0.47      0.47        60
         17       0.12      0.08      0.10   

  'precision', 'predicted', average, warn_for)


In [296]:
psuedo_fitter(gs_xgb)

             precision    recall  f1-score   support

          0       0.33      0.04      0.08        46
          1       0.73      0.22      0.33        37
          2       0.00      0.00      0.00         6
          3       0.50      0.21      0.30        33
          4       0.57      0.39      0.46       100
          5       0.00      0.00      0.00        29
          6       0.78      0.27      0.40        26
          7       0.00      0.00      0.00        17
          8       0.50      0.28      0.36        18
          9       0.57      0.29      0.38        14
         10       0.00      0.00      0.00         3
         11       0.00      0.00      0.00         3
         12       0.31      0.15      0.20        34
         13       0.00      0.00      0.00         8
         14       0.33      0.09      0.14        11
         15       1.00      0.25      0.40         8
         16       0.71      0.45      0.55        60
         17       0.00      0.00      0.00   

  'precision', 'predicted', average, warn_for)


In [286]:
psuedo_preds = pd.DataFrame(psuedo_predictions)
psuedo_preds.columns = mlb.classes_

In [290]:
pd.concat([y_train, psuedo_preds])

Unnamed: 0,artsy,athleisure,bohemian,business,casual,chic,classic,edgy,exotic,glam,...,punk,rocker,romantic,sartorial,sexy,sophisticated,street,tailored,vintage,western
1365,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1304,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
319,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1338,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
444,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
50,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
633,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1284,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
315,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
296,0,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [287]:
pd.concat([y_train, pd.DataFrame(psuedo_predictions)], axis=1)

Unnamed: 0,artsy,athleisure,bohemian,business,casual,chic,classic,edgy,exotic,glam,...,14,15,16,17,18,19,20,21,22,23
0,,,,,,,,,,,...,0,0,0,0,1,0,0,0,1,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
6,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
7,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
8,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0


In [272]:
X_train.append(pd.Series(unlabelled_data))

1625    silk crepe chine shirt cut loose fit lightweig...
1563    strip poplin cut slightly loose fit design adj...
319     crepe chine dress intend oversized fit cut wea...
1597    distressed denim short cut slim fit nonstretch...
451     asymmetric velvet dress cut slim fit slightly ...
50      metallic striped dress intend slightly loose f...
715     striped cotton tunic intend oversized fit cut ...
1543    strip design slightly loose fit stretchy fabri...
315     crepe chine dress cut slim fit nonstretchy fab...
296     asymmetric ruffled skirt asymmetric hem design...
416     skirt design wear natural waist slightly stret...
79      check skinny pant intend skinny fit length str...
1677    hooded dress design loose fit intend wear mult...
732     anglaise cotton cut fit waist loosely cut wais...
1388    rib hooded intend relaxed fit slightly stretch...
1633    patchwork print hooded dress design relaxed fi...
797     anglaise denim jacket cut loose fit nonstretch...
310     leathe

In [None]:
gs_lr.fit(X_train, y_train)

In [267]:
psuedo_predictions = gs_lr.predict(unlabelled_data)

In [268]:
psuedo_predictions

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [133]:
for category in mlb.classes_:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    NB_pipeline.fit(X_train, y_train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(y_test[category], prediction)))
    print('Test recall is {}'.format(recall_score(y_test[category], prediction)))
    print('Test precision is {}'.format(precision_score(y_test[category], prediction)))


... Processing artsy
Test accuracy is 0.8435374149659864
Test recall is 0.0
Test precision is 0.0
... Processing athleisure
Test accuracy is 0.8741496598639455
Test recall is 0.0
Test precision is 0.0
... Processing bohemian


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9795918367346939
Test recall is 0.0
Test precision is 0.0
... Processing business
Test accuracy is 0.8877551020408163
Test recall is 0.0
Test precision is 0.0
... Processing casual
Test accuracy is 0.6938775510204082
Test recall is 0.15
Test precision is 0.75
... Processing chic


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9013605442176871
Test recall is 0.0
Test precision is 0.0
... Processing classic
Test accuracy is 0.9115646258503401
Test recall is 0.0
Test precision is 0.0
... Processing edgy
Test accuracy is 0.9421768707482994
Test recall is 0.0
Test precision is 0.0
... Processing exotic


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9387755102040817
Test recall is 0.0
Test precision is 0.0
... Processing glam
Test accuracy is 0.9523809523809523
Test recall is 0.0
Test precision is 0.0
... Processing gothic
Test accuracy is 0.9897959183673469
Test recall is 0.0
Test precision is 0.0
... Processing hipster


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9897959183673469
Test recall is 0.0
Test precision is 0.0
... Processing minimalistic
Test accuracy is 0.8843537414965986
Test recall is 0.0
Test precision is 0.0
... Processing preppy
Test accuracy is 0.9727891156462585
Test recall is 0.0
Test precision is 0.0
... Processing punk


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.9625850340136054
Test recall is 0.0
Test precision is 0.0
... Processing rocker
Test accuracy is 0.9727891156462585
Test recall is 0.0
Test precision is 0.0
... Processing romantic
Test accuracy is 0.7993197278911565


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test recall is 0.016666666666666666
Test precision is 1.0
... Processing sartorial
Test accuracy is 0.95578231292517
Test recall is 0.0
Test precision is 0.0
... Processing sexy
Test accuracy is 0.9217687074829932
Test recall is 0.0
Test precision is 0.0
... Processing sophisticated


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Test accuracy is 0.8741496598639455
Test recall is 0.0
Test precision is 0.0
... Processing street
Test accuracy is 0.9217687074829932
Test recall is 0.0
Test precision is 0.0
... Processing tailored
Test accuracy is 0.9081632653061225
Test recall is 0.0
Test precision is 0.0
... Processing vintage
Test accuracy is 0.9013605442176871
Test recall is 0.0
Test precision is 0.0
... Processing western
Test accuracy is 0.9591836734693877
Test recall is 0.0
Test precision is 0.0


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [None]:
classifier = Pipeline([
    ('features', FeatureUnion([
        ('text', Pipeline([
            ('vectorizer', CountVectorizer(min_df=1,max_df=2)),
            ('tfidf', TfidfTransformer()),
        ])),
        ('length', Pipeline([
            ('count', FunctionTransformer(get_text_length, validate=False)),
        ]))
    ])),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(X_train, y_train)
predicted = classifier.predict(X_test)
predicted

### xgb

In [None]:
pipe_xgb = Pipeline([('xgb', OneVsRestClassifier(XGBClassifier()))])


### search for nap data

In [84]:
naps_data = pd.read_csv('../data/nap_labelled_updated.csv')

In [87]:
nap_h5 = pd.read_table('../data/nap_labelled.h5','table')

  """Entry point for launching an IPython kernel.
