In [1]:
import file_path as fp
import pandas as pd
import numpy as np
import random
import tensorflow as tf
import math
import collections
import cv2

In [2]:
# Load the training and validation metadata tables.
input_data = pd.read_csv(fp.csv_folder + "fashion_data_info_train_competition.csv")
validation_data = pd.read_csv(fp.csv_folder + "fashion_data_info_val_competition.csv")

# Per-column summary: distinct values and rows that are not null in that column.
for col in input_data.columns.values:
  distinct_count = len(input_data[col].unique())
  filled_count = len(input_data.dropna(subset=[col]))
  print(col, ": unique values =", distinct_count, ", non-empty rows =", filled_count)

# Every column from position 3 onward is treated as a class-id column: missing values
# become -1 and the float ids are cast to int. Only the training frame is converted.
label_columns = input_data.columns.values[3:]
input_data[label_columns] = input_data[label_columns].fillna(-1.0).astype(int)

itemid : unique values = 275142 , non-empty rows = 275142
title : unique values = 241693 , non-empty rows = 275142
image_path : unique values = 275142 , non-empty rows = 275142
Pattern : unique values = 21 , non-empty rows = 164078
Collar Type : unique values = 17 , non-empty rows = 113638
Fashion Trend : unique values = 12 , non-empty rows = 147084
Clothing Material : unique values = 20 , non-empty rows = 175499
Sleeves : unique values = 5 , non-empty rows = 177903


In [5]:
# Image feature extractor: ResNet50 without its classification head, ImageNet
# weights, global average pooling (one feature vector per image).
resnet_features = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', pooling="avg")

def get_resnet_features(image_path):
  """Return the pooled ResNet50 feature vector for one image, or None on failure.

  Args:
    image_path: image location relative to `fp.image_base_folder`.

  Returns:
    The first (only) row of `resnet_features.predict` for the image, or None
    when the file cannot be read or any processing step raises.
  """
  try:
    raw = cv2.imread(fp.image_base_folder+image_path)
    if raw is None:
      # cv2.imread reports a missing/unreadable file by returning None, not by raising.
      return None
    im = cv2.resize(raw, (224, 224)).astype(np.float32)
    # standardization: remove the per-channel mean of the ILSVRC2012 dataset
    # (channel order 0, 1, 2 as loaded by OpenCV)
    im[:,:,0] -= 103.939
    im[:,:,1] -= 116.779
    im[:,:,2] -= 123.68
    # Insert a new dimension for the batch_size
    im = np.expand_dims(im, axis=0)
    return resnet_features.predict(im)[0]
  except Exception:
    # Best-effort per image: a bad row yields None instead of aborting the whole pass.
    # `Exception` (not a bare except) so Ctrl-C / SystemExit still stop the long loop.
    return None

# Compute one feature vector per row, validation frame first and training frame second.
# Rows whose image could not be processed receive None.
for frame in (validation_data, input_data):
  frame["image_vector"] = frame.apply(lambda row: get_resnet_features(row.image_path), axis=1)

In [26]:
# Retry pass for rows whose first extraction returned None.
# NOTE: this cell rebuilds the model and redefines get_resnet_features with a
# different (two-argument) signature, shadowing the definition from the earlier cell.
resnet_features = tf.keras.applications.ResNet50(include_top=False, weights='imagenet', pooling="avg")

def get_resnet_features(image_path, vec):
  """Return `vec` when already present, otherwise retry extraction with ".jpg" appended.

  Args:
    image_path: image location relative to `fp.image_base_folder`, without the
      ".jpg" suffix that this retry adds.
    vec: feature vector from a previous pass, or None if that pass failed.

  Returns:
    `vec` unchanged when it is not None; otherwise the first (only) row of
    `resnet_features.predict` for `image_path + ".jpg"`, or None when the file
    still cannot be read or any processing step raises.
  """
  if vec is not None:
    return vec

  try:
    raw = cv2.imread(fp.image_base_folder + image_path + ".jpg")
    if raw is None:
      # cv2.imread reports a missing/unreadable file by returning None, not by raising.
      return None
    im = cv2.resize(raw, (224, 224)).astype(np.float32)
    # standardization: remove the per-channel mean of the ILSVRC2012 dataset
    # (channel order 0, 1, 2 as loaded by OpenCV)
    im[:,:,0] -= 103.939
    im[:,:,1] -= 116.779
    im[:,:,2] -= 123.68
    # Insert a new dimension for the batch_size
    im = np.expand_dims(im, axis=0)
    return resnet_features.predict(im)[0]
  except Exception:
    # Best-effort per image; `Exception` (not a bare except) keeps Ctrl-C working.
    return None
  
# Second pass over the training rows: existing vectors are passed through untouched,
# rows still holding None are retried by the two-argument get_resnet_features.
input_data["image_vector"] = input_data.apply(lambda x: get_resnet_features(x.image_path, x.image_vector), axis=1)

In [27]:
# Persist the training frame so the slow per-image extraction does not have to be rerun.
input_data.to_pickle(fp.csv_folder+"fashion_training_with_resnet50_vector_and_word2vec.pickle")

In [37]:
# Persist both frames. The training pickle is the same path written by the previous cell
# and is overwritten here.
# NOTE(review): execution count 37 is later than the title-vector cell (In [36]) although
# this cell appears before it in the notebook — confirm the intended run order.
validation_data.to_pickle(fp.csv_folder+"fashion_validation_with_resnet50_vector_and_word2vec.pickle")
input_data.to_pickle(fp.csv_folder+"fashion_training_with_resnet50_vector_and_word2vec.pickle")

In [2]:
# Resume from the cached frames instead of recomputing the features.
# SECURITY: unpickling executes arbitrary code; only load pickle files this notebook wrote itself.
input_data = pd.read_pickle(fp.csv_folder+"fashion_training_with_resnet50_vector_and_word2vec.pickle")
validation_data = pd.read_pickle(fp.csv_folder+"fashion_validation_with_resnet50_vector_and_word2vec.pickle")

In [29]:
def is_phone_number(string):
  """Heuristic phone-number test: True when the token holds more than three digits.

  Any token of three characters or fewer can contain at most three digits, so it
  is never classified as a phone number.
  """
  digit_total = sum(1 for char in string if char.isdigit())
  return digit_total > 3

In [30]:
# Flatten every title (training + validation) into one token stream, remembering which
# title each kept token came from; generate_batch uses that id to detect title boundaries.
vocab_sentence = []
sentence_id = 0

for line in np.concatenate([input_data.title.values, validation_data.title.values]):
  for token in line.split():
    # single-character tokens are dropped unless they are a digit
    lone_non_digit = len(token) == 1 and not token.isdigit()
    # phone-number-like tokens and the two messaging keywords are dropped as well
    if lone_non_digit or is_phone_number(token) or token in ("whatsapp", "wa"):
      continue
    vocab_sentence.append((token, sentence_id))
  # the id advances once per title, even when all of its tokens were filtered out
  sentence_id += 1

# Split the (token, sentence_id) pairs into two parallel tuples.
vocabulary, sentence_id_map = list(zip(*vocab_sentence))

print('Data size', len(vocabulary))

Data size 3826002


In [31]:
import collections

# Number of ids in the word dictionary, including the 'UNK' placeholder at id 0.
vocabulary_size = 400

def build_dataset(words, n_words):
  """Map a token sequence to integer ids using the n_words - 1 most frequent tokens.

  Args:
    words: sequence of tokens.
    n_words: dictionary size, counting the 'UNK' entry that takes id 0.

  Returns:
    data: list with one id per token; tokens outside the dictionary map to 0.
    count: [['UNK', <number of ids equal to 0>], (word, frequency), ...] in
      descending frequency order.
    dictionary: word -> id.
    reversed_dictionary: id -> word.
  """
  count = [['UNK', -1]]
  count += collections.Counter(words).most_common(n_words - 1)

  # ids are handed out in `count` order, so more frequent words get smaller ids
  dictionary = {}
  for word, _ in count:
    dictionary[word] = len(dictionary)

  data = [dictionary.get(word, 0) for word in words]
  # id 0 is dictionary['UNK']; fill in the real count now that it is known
  count[0][1] = data.count(0)

  reversed_dictionary = {index: word for word, index in dictionary.items()}
  return data, count, dictionary, reversed_dictionary
# Encode the whole token stream; `data` stays index-aligned with `vocabulary`
# (and therefore with `sentence_id_map`).
data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)
# del vocabulary  # Hint to reduce memory.
# Sanity prints: count[0] is the 'UNK' entry, so the most common real words start at index 1.
print('Most common words', count[1:6])
print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])
print("length of the dictionary: ", len(reverse_dictionary), "should be equal to", vocabulary_size)
print("least common words", count[-5:])

Most common words [('lengan', 180189), ('wanita', 141374), ('dress', 122219), ('neck', 100857), ('untuk', 97713)]
Sample data [88, 46, 3, 3, 46, 37, 78, 114, 75, 169] ['retro', 'floral', 'dress', 'dress', 'floral', 'sifon', 'korean', 'white', 'chiffon', 'collar']
length of the dictionary:  400 should be equal to 400
least common words [('belahan', 834), ('bralette', 824), ('tembus', 822), ('two', 814), ('belakang', 810)]


In [32]:
# Set of in-dictionary words (every entry of `count` except the leading 'UNK' placeholder),
# used for fast membership tests when averaging title vectors.
# NOTE: no TF-IDF statistic is computed here; get_avg_title_vector weights every word by 1.0.
known_word_set = {word for word, _ in count[1:]}

In [33]:
# Cursor into `data`; it persists between calls so consecutive batches continue
# where the previous one stopped.
data_index = 0

# Step 3: Function to generate a training batch for the skip-gram model.
def generate_batch(batch_size, num_skips, skip_window):
  """Build one skip-gram batch of (centre id, context id) pairs.

  Reads the module-level `data` and `sentence_id_map` and advances the
  module-level `data_index` cursor.

  Args:
    batch_size: number of pairs to emit; must be a multiple of num_skips.
    num_skips: context words sampled per centre word (at most 2 * skip_window).
    skip_window: number of positions considered on each side of the centre.

  Returns:
    (batch, labels): int32 arrays of shape (batch_size,) and (batch_size, 1).
    A pair whose centre and context come from different titles is written as (0, 0).
  """
  global data_index
  assert batch_size % num_skips == 0
  assert num_skips <= 2 * skip_window
  span = 2 * skip_window + 1  # [ skip_window target skip_window ]
  batch = np.ndarray(shape=(batch_size), dtype=np.int32)
  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)
  # window offsets that are context positions, i.e. everything except the centre
  context_offsets = [offset for offset in range(span) if offset != skip_window]
  window = collections.deque(maxlen=span)  # sliding view over `data`
  if data_index + span > len(data):
    data_index = 0
  window.extend(data[data_index:data_index + span])
  data_index += span
  for center_no in range(batch_size // num_skips):
    # window[k] mirrors data[window_start + k]
    window_start = data_index - span
    center_sentence = sentence_id_map[window_start + skip_window]
    row = center_no * num_skips
    for chosen_offset in random.sample(context_offsets, num_skips):
      if sentence_id_map[window_start + chosen_offset] == center_sentence:
        batch[row] = window[skip_window]
        labels[row, 0] = window[chosen_offset]
      else:
        # context belongs to another title: emit an UNK -> UNK filler pair
        batch[row] = 0
        labels[row, 0] = 0
      row += 1
    if data_index == len(data):
      # reached the end of the stream: restart the window at the beginning
      window.extend(data[0:span])
      data_index = span
    else:
      window.append(data[data_index])
      data_index += 1
  # Backtrack a little bit to avoid skipping words in the end of a batch
  data_index = (data_index + len(data) - span) % len(data)
  return batch, labels

# Smoke test: draw one small batch and print each (centre -> context) pair next to the
# first two titles. This call advances the global data_index cursor; the training cell
# resets it to 0 before use.
batch, labels = generate_batch(batch_size=20, num_skips=2, skip_window=2)
print(input_data.title.head(2).values)
for i in range(20):
  # "0 UNK -> 0 UNK" rows are the filler pairs emitted when the context is in another title
  print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]])

['retro floral dress' 'dress floral sifon']
3 dress -> 46 floral
0 UNK -> 0 UNK
3 dress -> 37 sifon
3 dress -> 46 floral
0 UNK -> 0 UNK
46 floral -> 3 dress
37 sifon -> 46 floral
0 UNK -> 0 UNK
0 UNK -> 0 UNK
78 korean -> 114 white
0 UNK -> 0 UNK
114 white -> 169 collar
75 chiffon -> 114 white
75 chiffon -> 78 korean
0 UNK -> 0 UNK
169 collar -> 75 chiffon
3 dress -> 169 collar
0 UNK -> 0 UNK
18 women -> 0 UNK
0 UNK -> 0 UNK


In [34]:
# Step 4: Build and train a skip-gram model.
# This cell only defines hyperparameters and builds the graph; the training loop
# is the next cell. It uses the graph-mode API (tf.placeholder, tf.Graph).

batch_size = 128
embedding_size = 64  # Dimension of the embedding vector.
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit the
# validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
# NOTE(review): drawn from the unseeded numpy global RNG, so the displayed words differ per run.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

graph = tf.Graph()

with graph.as_default():

  # Input data.
  with tf.name_scope('inputs'):
    # Shapes are fixed to batch_size, so every fed batch must hold exactly batch_size pairs.
    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up embeddings for inputs.
  with tf.name_scope('embeddings'):
    # One embedding_size-dimensional row per dictionary id, initialised uniformly in [-1, 1).
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

# Construct the variables for the NCE loss
  with tf.name_scope('weights'):
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size], stddev=1.0 / math.sqrt(embedding_size))
    )
  with tf.name_scope('biases'):
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

  # Compute the average NCE loss for the batch.
  # tf.nce_loss automatically draws a new sample of the negative labels each
  # time we evaluate the loss.
  # Explanation of the meaning of NCE loss:
  #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
  with tf.name_scope('loss'):
    loss = tf.reduce_mean(
        tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))

  # Add the loss value as a scalar to summary.
  tf.summary.scalar('loss', loss)

  # Construct the SGD optimizer
  with tf.name_scope('optimizer'):
    # Plain gradient descent with a fixed learning rate of 0.01.
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)

  # Compute the cosine similarity between minibatch examples and all embeddings.
  # Rows are L2-normalised first, so the matmul below yields cosine similarities.
  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
  normalized_embeddings = embeddings / norm
  valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                            valid_dataset)
  similarity = tf.matmul(
      valid_embeddings, normalized_embeddings, transpose_b=True)

  # Merge all summaries.
  merged = tf.summary.merge_all()

  # Add variable initializer.
  init = tf.global_variables_initializer()

  # Create a saver.
#   saver = tf.train.Saver()

In [35]:
# Step 5: Begin training.
# Reset the batch cursor so training starts at the beginning of `data`
# (the smoke-test call to generate_batch moved it).
data_index = 0

num_epochs = 10
# Each step advances batch_size // num_skips centre words, so
# len(data) * num_skips // batch_size steps cover the token stream about once.
num_steps = (len(data) * num_skips // batch_size + 1) * num_epochs
print("Steps to run:", num_steps)

with tf.Session(graph=graph) as session:
#   Open a writer to write summaries.
#   writer = tf.summary.FileWriter(fp.log_dir, session.graph)

  # We must initialize all variables before we use them.
  init.run()
  print('Initialized')

  average_loss = 0
  for step in range(num_steps):
    batch_inputs, batch_labels = generate_batch(batch_size, num_skips,
                                                skip_window)
    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}

    # Define metadata variable.
    # NOTE(review): the summary writer is commented out, so `run_metadata` and `summary`
    # are produced every step but never consumed.
    run_metadata = tf.RunMetadata()

    # We perform one update step by evaluating the optimizer op (including it
    # in the list of returned values for session.run()
    # Also, evaluate the merged op to get all summaries from the returned "summary" variable.
    # Feed metadata variable to session for visualizing the graph in TensorBoard.
    _, summary, loss_val = session.run(
        [optimizer, merged, loss],
        feed_dict=feed_dict,
        run_metadata=run_metadata)
    average_loss += loss_val

    # Add returned summaries to writer in each step.
#     writer.add_summary(summary, step)
    # Add metadata to visualize the graph for the last run.
#     if step == (num_steps - 1):
#       writer.add_run_metadata(run_metadata, 'step%d' % step)

    if step % 2000 == 0:
      # At step 0 the accumulator holds a single batch loss, so it is printed undivided.
      if step > 0:
        average_loss /= 2000
      # The average loss is an estimate of the loss over the last 2000 batches.
      print('Average loss at step', step, ':', average_loss)
      average_loss = 0

    # Note that this is expensive (~20% slowdown if computed every 500 steps)
#     if step % 10000 == 0:
#       sim = similarity.eval()
#       for i in range(valid_size):
#         valid_word = reverse_dictionary[valid_examples[i]]
#         top_k = 8  # number of nearest neighbors
#         nearest = (-sim[i, :]).argsort()[1:top_k + 1]
#         log_str = 'Nearest to %s: %s' % (valid_word, ", ".join([reverse_dictionary[k] for k in nearest]))
#         print(log_str)
  
  # After training: print the nearest neighbours of each validation word once.
  sim = similarity.eval()
  for i in range(valid_size):
    valid_word = reverse_dictionary[valid_examples[i]]
    top_k = 8  # number of nearest neighbors
    # Sort by descending similarity; index 0 is skipped (the word itself is expected there).
    nearest = (-sim[i, :]).argsort()[1:top_k + 1]
    log_str = 'Nearest to %s: %s' % (valid_word, ", ".join([reverse_dictionary[k] for k in nearest]))
    print(log_str)
  # L2-normalised embedding matrix, kept as a numpy array for use after the session closes.
  final_embeddings = normalized_embeddings.eval()

  # Write corresponding labels for the embeddings.
#   with open(fp.log_dir + '/metadata.tsv', 'w') as f:
#     for i in range(vocabulary_size):
#       f.write(reverse_dictionary[i] + '\n')

  # Save the model for checkpoints.
#   saver.save(session, os.path.join(fp.log_dir, 'model.ckpt'))

  # Create a configuration for visualizing embeddings with the labels in TensorBoard.
#   config = projector.ProjectorConfig()
#   embedding_conf = config.embeddings.add()
#   embedding_conf.tensor_name = embeddings.name
#   embedding_conf.metadata_path = os.path.join(fp.log_dir, 'metadata.tsv')
#   projector.visualize_embeddings(writer, config)

# writer.close()

Steps to run: 597820
Initialized
Average loss at step 0 : 118.23272705078125
Average loss at step 2000 : 41.563430080890654
Average loss at step 4000 : 12.507865993499756
Average loss at step 6000 : 8.232627920150756
Average loss at step 8000 : 6.63601957499981
Average loss at step 10000 : 5.828361275911331
Average loss at step 12000 : 5.299916382312775
Average loss at step 14000 : 5.003721959590912
Average loss at step 16000 : 4.778860801815987
Average loss at step 18000 : 4.63048234963417
Average loss at step 20000 : 4.508250107765198
Average loss at step 22000 : 4.586850935339927
Average loss at step 24000 : 4.829594003200531
Average loss at step 26000 : 4.500870619297028
Average loss at step 28000 : 4.374320547699928
Average loss at step 30000 : 4.273100596547127
Average loss at step 32000 : 4.201676884174347
Average loss at step 34000 : 4.162560703158379
Average loss at step 36000 : 4.129571531176567
Average loss at step 38000 : 4.097367225527764
Average loss at step 40000 : 4.070

Average loss at step 340000 : 3.693199078083038
Average loss at step 342000 : 3.6892893110513687
Average loss at step 344000 : 3.6819405633211137
Average loss at step 346000 : 3.687534215450287
Average loss at step 348000 : 3.5831584013700484
Average loss at step 350000 : 3.5655936866998674
Average loss at step 352000 : 3.5745895270109176
Average loss at step 354000 : 3.748481830239296
Average loss at step 356000 : 3.6810733872652053
Average loss at step 358000 : 3.656140588283539
Average loss at step 360000 : 3.7237858241796493
Average loss at step 362000 : 3.7513633850812913
Average loss at step 364000 : 3.735221441268921
Average loss at step 366000 : 3.7264431278705596
Average loss at step 368000 : 3.7278698773384096
Average loss at step 370000 : 3.7258115334510804
Average loss at step 372000 : 3.723586210131645
Average loss at step 374000 : 3.7223733518123625
Average loss at step 376000 : 3.7164123785495757
Average loss at step 378000 : 3.711483700990677
Average loss at step 380000

In [36]:
def get_avg_title_vector(itemid, title):
  """Average the embeddings of the in-dictionary words of a title.

  Args:
    itemid: unused; accepted so the row-wise callers can keep passing it.
    title: whitespace-separated title text.

  Returns:
    A float vector of length `embedding_size`: the equally weighted mean of
    `final_embeddings` rows for every token found in `known_word_set`
    (repeated tokens count each time), or all zeros when no token is known.
  """
  known_tokens = [token for token in title.split() if token in known_word_set]
  accumulated = np.zeros(embedding_size)
  if not known_tokens:
    return accumulated

  # every word carries the same weight of 1.0
  uniform_weight = 1.0
  for token in known_tokens:
    accumulated += final_embeddings[dictionary[token]] * uniform_weight
  return accumulated / (uniform_weight * len(known_tokens))

# Attach the averaged word-embedding vector of each title, training frame first.
for frame in (input_data, validation_data):
  frame["title_vector"] = frame.apply(lambda row: get_avg_title_vector(row.itemid, row.title), axis=1)

In [3]:
def map_at_k(guesses, correct, k):
  """Score one ranked guess list against a single correct label.

  Args:
    guesses: ranked guesses, best first; may hold fewer than k entries.
    correct: the true label.
    k: number of leading guesses that count.

  Returns:
    (score, rank): rank is the 1-based position of `correct` within the first
    k guesses and score is 1 / rank; (0.0, 0) when `correct` is not among them.
  """
  # Slicing (instead of indexing range(k)) tolerates guess lists shorter than k;
  # max(k, 0) keeps a negative k meaning "no guesses".
  for rank, guess in enumerate(guesses[:max(k, 0)], start=1):
    if guess == correct:
      return 1 / rank, rank
  return 0.0, 0

def proba_to_guesses(probs, classes, k):
  """Return the k most probable classes, most probable first.

  Args:
    probs: one probability per class, aligned with `classes`.
    classes: class labels, indexable by position.
    k: number of guesses wanted; if it exceeds the number of classes, all
      classes are returned.

  Returns:
    A list of at most k entries of `classes`, ordered by descending probability.
  """
  if k <= 0:
    # Without this guard, k == 0 would slice [-0:], i.e. the whole array.
    return []
  top_indices = np.argsort(probs)[-k:][::-1]
  return [classes[idx] for idx in top_indices]

def score_model(proba_vector, ground_truth, classes, k):
  """Average the per-row top-k score and tally at which rank the truth was found.

  Args:
    proba_vector: 2-D array, one row of class probabilities per item.
    ground_truth: array of true labels, one per row of `proba_vector`.
    classes: label for each probability column.
    k: number of guesses taken per item.

  Returns:
    (mean_score, stat): mean of the map_at_k scores over all rows, and a list
    of length k + 1 where stat[r] counts rows whose truth was found at rank r
    (stat[0] counts rows where it was not among the guesses).
  """
  row_count, column_count = proba_vector.shape[0], proba_vector.shape[1]
  assert column_count == len(classes)
  assert row_count == ground_truth.shape[0]

  rank_histogram = [0 for _ in range(k + 1)]
  score_sum = 0.0
  for row_probs, truth in zip(proba_vector, ground_truth):
    top_guesses = proba_to_guesses(row_probs, classes, k)
    row_score, hit_rank = map_at_k(top_guesses, truth, k)
    score_sum += row_score
    rank_histogram[hit_rank] += 1
  return score_sum / row_count, rank_histogram

In [4]:
from sklearn.decomposition import PCA

# Reduce the ResNet50 feature vectors to 64 dimensions. The projection is fitted on the
# training vectors only and then applied unchanged to the validation vectors.
pca = PCA(n_components=64, random_state=12345)

# NOTE(review): np.stack presumably fails if any image_vector is still None (rows whose
# image could not be read) — confirm that all rows were filled before this cell runs.
reduced_image_vector = pca.fit_transform(np.stack(input_data.image_vector.values))

# vsplit into one section per row stores a (1, 64) array in each cell; the model cells
# later flatten these with reshape(-1, 64).
input_data["image_vector_64"] = np.vsplit(reduced_image_vector, indices_or_sections=reduced_image_vector.shape[0])

validation_reduced_image_vector = pca.transform(np.stack(validation_data.image_vector.values))

validation_data["image_vector_64"] = np.vsplit(validation_reduced_image_vector,
                                               indices_or_sections=validation_reduced_image_vector.shape[0])

In [5]:
#dry run
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hold-out evaluation on the first three label columns only: 80/20 split of the rows that
# carry a label (-1 means the attribute is missing), one random forest per column.
for y_col in input_data.columns.values[3:6]:
  labelled_rows = input_data[input_data[y_col] != -1]
  train, test = train_test_split(labelled_rows, test_size=0.2, random_state=12345)

  # With more than 100 distinct classes, train only on the 100 most frequent ones.
  if train[y_col].unique().shape[0] > 100:
    class_sizes = train[[y_col, "itemid"]].groupby(y_col).agg("count")
    significant = set(class_sizes.sort_values("itemid", ascending=False).head(100).index.values)
    train = train[train[y_col].isin(significant)]

  clf = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)

  # Features = title embedding concatenated with the 64-d image vector.
  train_features = np.concatenate(
    (np.stack(train.title_vector.values), np.stack(train.image_vector_64.values).reshape(-1, 64)),
    axis=1)
  clf.fit(train_features, train[y_col])

  test_features = np.concatenate(
    (np.stack(test.title_vector.values), np.stack(test.image_vector_64.values).reshape(-1, 64)),
    axis=1)
  test_proba = clf.predict_proba(test_features)

  # Prints the mean top-2 score and the histogram of hit ranks.
  print(y_col, score_model(test_proba, test[y_col].values, clf.classes_, 2))

Pattern (0.8795099951243296, [3005, 27913, 1898])
Collar Type (0.8339933122140092, [2063, 17245, 3420])
Fashion Trend (0.8938368970323283, [2104, 25275, 2038])


In [6]:
# actual run
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Vector-valued feature columns that earlier cells append after the label columns.
# They are model inputs, not prediction targets, so they are excluded explicitly
# instead of relying on a failing comparison inside a bare `except` to end the loop.
feature_columns = {"image_vector", "title_vector", "image_vector_64"}
label_columns = [col for col in input_data.columns.values[3:] if col not in feature_columns]

# The validation feature matrix does not depend on the target column, so build it once.
test = validation_data
test_features = np.concatenate(
  (np.stack(test.title_vector.values), np.stack(test.image_vector_64.values).reshape(-1, 64))
  , axis=1)

# One submission line per (validation item, attribute): "<itemid>_<attribute>,<guess1> <guess2>".
with open(fp.csv_folder+"submission/fashion_no_header.csv", "w") as sub_file:
  for y_col in label_columns:
    # -1 marks rows where this attribute is missing; train only on labelled rows.
    train = input_data[input_data[y_col] != -1]

    # With more than 100 distinct classes, train only on the 100 most frequent ones.
    if train[y_col].unique().shape[0] > 100:
      significant = set(train[[y_col, "itemid"]].groupby(y_col).agg("count")\
        .sort_values("itemid", ascending=False).head(100).index.values)
      train = train[train[y_col].isin(significant)]

    print(y_col, ": training model...", end="")

    clf = RandomForestClassifier(n_estimators=200, random_state=0, n_jobs=-1)
    clf.fit(np.concatenate(
      (np.stack(train.title_vector.values), np.stack(train.image_vector_64.values).reshape(-1, 64))
      , axis=1), train[y_col])

    proba_vector = clf.predict_proba(test_features)
    assert validation_data.itemid.values.shape[0] == proba_vector.shape[0]

    print("write to file...")

    # Top-2 classes per item, most probable first.
    for itemid, item_vec in zip(validation_data.itemid.values, proba_vector):
      guesses = proba_to_guesses(item_vec, clf.classes_, 2)
      sub_file.write(f"{itemid}_{y_col},{' '.join([str(g) for g in guesses])}\n")

Pattern : training model...write to file...
Collar Type : training model...write to file...
Fashion Trend : training model...write to file...
Clothing Material : training model...write to file...
Sleeves : training model...write to file...
