In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


import xgboost as xgb

from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

from collections import defaultdict

import re
import time
import math
import random

random.seed(2017)

In [53]:
# Filename template: every input below is resolved through `loc % name`.
# The bare '%s' form reads from the current working directory.
loc = '%s'
# Alternative absolute location, kept for reference:
#loc = '/home/ec2-user/data/hd/features/%s'

# NOTE(review): read_pickle unpickles the file, so only load files you trust.
# Feature table that is later joined onto the label frames.
queries = pd.read_pickle(loc % 'FEATURES_WITH_TEXT_1.data')

# Label frames for the training and test ids (a generator expression is used
# so no loop variable leaks into the notebook namespace).
idx_train, idx_test = (
    pd.read_pickle(loc % label_file)
    for label_file in ('LABELS_TRAIN.df', 'LABELS_TEST.df')
)

In [54]:
# Carve a 20% hold-out set out of the labelled training ids.  The fixed
# random_state keeps the split reproducible between runs.
IDX_TR, IDX_TE = train_test_split(idx_train.index, test_size=0.20, random_state=147)

IDX_TRAIN = idx_train.loc[IDX_TR]

# Take an explicit copy before adding/overwriting columns: assigning into the
# result of a .loc selection is the chained-assignment pattern that pandas
# warns about (SettingWithCopyWarning), and the copy makes it unambiguous that
# idx_train itself is left untouched.
IDX_TEST = idx_train.loc[IDX_TE].copy()
# Keep the true label aside, then mask it: rows with relevance <= 0 are
# treated as "unlabelled" by the word-pair statistics.
IDX_TEST['orig_rel'] = IDX_TEST['relevance']
IDX_TEST['relevance'] = -1

# The real test set has no labels; give it the same "unlabelled" marker.
# NOTE: this mutates idx_test in place, so the column is also present in the
# frames pickled from idx_test later in the notebook.
idx_test['relevance'] = -1

# One frame with every id (train, hold-out, test) plus the query features.
known_labels = pd.concat([IDX_TRAIN, IDX_TEST, idx_test]).join(queries)
# Variant without a hold-out set (all training labels visible):
#known_labels = pd.concat([idx_train, idx_test]).join(queries)

In [194]:
# Matches tokens that contain a digit or the substrings "units" / "xby";
# such tokens are excluded from the word-pair statistics (presumably because
# they encode measurements rather than vocabulary -- confirm against the
# text-normalisation step that produces them).
has_digit = re.compile('([0-9]|units|xby)')

def calculate_word_pairs(a1, a2, labels=None, offset=2.38):
    """Collect per-id co-occurrence statistics between words of two text columns.

    For every row, each kept word ``q`` of column ``a1`` is paired with each
    kept word ``t`` of column ``a2`` ("kept" = does not match ``has_digit``;
    empty text is replaced by the single token ``NOWORDS``).

    Parameters
    ----------
    a1, a2 : str
        Names of the "outer" and "inner" text columns.
    labels : pandas.DataFrame, optional
        Frame indexed by ``id`` that holds ``a1``, ``a2`` and ``relevance``.
        Defaults to the notebook-level ``known_labels`` frame, which is what
        the original two-argument call used.
    offset : float, optional
        Value subtracted from the relevance in the "offset" statistic.  The
        default 2.38 is the constant the notebook has always used (it is
        close to the mean training relevance printed at the end of the
        notebook).

    Returns
    -------
    tuple of five nested defaultdicts
        ``word_matches[id][q][t]``              -- number of (q, t) pairings
        ``word_matches_score_raw[id][q][t]``    -- sum of relevance, labelled rows only
        ``word_matches_score_offset[id][q][t]`` -- sum of (relevance - offset), labelled rows only
        ``word_counts[id][q]``                  -- number of inner words paired with q
        ``word_counts_score[id][q]``            -- same, but 0 for unlabelled rows

    A row counts as labelled when ``relevance > 0``.  Ids of unlabelled rows
    never appear in the two score tables.
    """
    if labels is None:
        labels = known_labels

    word_matches = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_matches_score_raw = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_matches_score_offset = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_counts = defaultdict(lambda : defaultdict(lambda : 0))
    word_counts_score = defaultdict(lambda : defaultdict(lambda : 0))

    def kept_words(text):
        # Empty text still contributes one sentinel token.
        if not text:
            text = "NOWORDS"
        return [w for w in text.split() if not has_digit.search(w)]

    def f(r):
        ID = r['id']
        qs = kept_words(r[a1])
        ts = kept_words(r[a2])

        score = r['relevance']
        labelled = score > 0
        for q in qs:
            for t in ts:
                word_matches[ID][q][t] += 1
                if labelled:
                    word_matches_score_raw[ID][q][t] += score
                    word_matches_score_offset[ID][q][t] += (score - offset)
            # Executed even when ts is empty so that q is registered with 0.
            word_counts[ID][q] += len(ts)
            word_counts_score[ID][q] += len(ts) if labelled else 0

    # Explicit row loop instead of DataFrame.apply(f, axis=1): f works purely
    # by side effect, and pandas releases before 1.1 may call the applied
    # function twice on the first row, which would double-count that row.
    for _, row in labels.reset_index().iterrows():
        f(row)

    return (word_matches, word_matches_score_raw, word_matches_score_offset, word_counts, word_counts_score)
    
# Forward direction: each query word paired with each product-title word.
(word_matches,
 word_matches_score_raw,
 word_matches_score_offset,
 word_counts,
 word_counts_score) = calculate_word_pairs('query', 'product_title')

# Reverse direction: each product-title word paired with each query word.
(word_matches2,
 word_matches_score_raw2,
 word_matches_score_offset2,
 word_counts2,
 word_counts_score2) = calculate_word_pairs('product_title', 'query')

In [207]:
def get_train_dict(given_word_matches):
    """Sum a per-id table ``{id: {q: {t: value}}}`` over all ids.

    Returns a plain ``{q: {t: total}}`` dict of dicts (no defaultdicts), so
    looking up an unknown word later cannot silently insert a new entry.
    A word ``q`` only appears in the result if it has at least one partner.
    """
    totals = {}
    for per_id in given_word_matches.values():
        for q, partners in per_id.items():
            for t, amount in partners.items():
                # Created lazily so that q with no partners stays absent.
                row = totals.setdefault(q, {})
                row[t] = row.get(t, 0) + amount
    return totals

def get_train_counts(given_word_count):
    """Sum a per-id count table ``{id: {q: count}}`` over all ids.

    Returns a plain ``{q: total}`` dict.  Words whose counts are all zero are
    still present (with total 0).
    """
    totals = {}
    for per_id in given_word_count.values():
        for q, amount in per_id.items():
            totals[q] = totals.get(q, 0) + amount
    return totals

# Collapse the per-id tables into corpus-wide totals.
# Forward direction (outer key = query word, inner key = title word).
(train_word_matches,
 train_word_matches_score_raw,
 train_word_matches_score_offset) = map(
    get_train_dict,
    (word_matches, word_matches_score_raw, word_matches_score_offset))
train_word_counts, train_word_counts_score = map(
    get_train_counts,
    (word_counts, word_counts_score))

# Reverse direction (outer key = title word, inner key = query word).
(train_word_matches2,
 train_word_matches_score_raw2,
 train_word_matches_score_offset2) = map(
    get_train_dict,
    (word_matches2, word_matches_score_raw2, word_matches_score_offset2))
train_word_counts2, train_word_counts_score2 = map(
    get_train_counts,
    (word_counts2, word_counts_score2))


#sorted([(v, k) for k,v in train_word_matches_score_offset['pt'].items()], reverse=True)[0:12], sorted([(v, k) for k,v in train_word_matches_score_offset2['showerhead'].items()], reverse=True)[0:12]

In [289]:
def trimmed_dict(zad, d, zawc, wc, no_id, qs):
    """Leave-one-out, count-normalised pair weights for the words in ``qs``.

    Parameters
    ----------
    zad : dict
        Corpus totals ``{q: {t: value}}`` (see ``get_train_dict``).
    d : dict
        Per-id tables ``{id: {q: {t: value}}}`` the totals were built from.
    zawc : dict
        Corpus count totals ``{q: count}`` (see ``get_train_counts``).
    wc : dict
        Per-id counts ``{id: {q: count}}``.
    no_id : hashable
        Id whose own contribution is subtracted from the totals.
    qs : iterable of str
        Words to build the result for; all other words are ignored.

    Returns
    -------
    dict
        ``{q: {t: (total - own) / (count - own_count + 1.0)}}``.  Entries
        whose adjusted total is below 1e-7 are dropped, and so are words left
        without any partner.  None of the inputs are modified.

    Compared with the earlier version this one
      * never deletes from a dict while iterating over it (RuntimeError on
        Python 3),
      * copies only the counts of the words in ``qs`` instead of the whole
        ``zawc`` dict on every call,
      * ignores per-id words that are not in ``qs`` and tolerates missing
        per-id counts instead of raising KeyError / inserting defaults.
    """
    own_matches = d.get(no_id, None)
    own_counts = wc.get(no_id, None) or {}

    # Private working copies, restricted to the requested words.
    ad = {}
    awc = {}
    for q in qs:
        if q in ad:
            continue
        ad[q] = dict(zad.get(q, None) or {})
        awc[q] = zawc.get(q, 0)

    # Remove what the row `no_id` itself contributed to the totals.
    if own_matches:
        for q, tc in own_matches.items():
            row = ad.get(q, None)
            if row is None:
                continue
            for t, c in tc.items():
                row[t] = row.get(t, 0) - c
            awc[q] -= own_counts.get(q, 0)

    # Normalise and prune into fresh dicts.
    # NOTE(review): the threshold also discards negative sums, which can
    # occur in the "offset" tables -- confirm that this is intended.
    result = {}
    for q, row in ad.items():
        denom = awc[q] + 1.0
        kept = {}
        for t, a in row.items():
            if not a < 0.0000001:
                kept[t] = a * 1.0 / denom
        if kept:
            result[q] = kept
    return result

def get_features_no_id(r):
    """Compute the six leave-one-out word-pair features for one row.

    ``r`` is a row holding ``id``, ``query`` and ``product_title``.  For each
    of the three statistics (pair counts, raw relevance sums, offset relevance
    sums) the row's own contribution is removed via ``trimmed_dict``, the
    remaining weights of every (query word, title word) pair are summed, and
    the sum is divided by ``number of query words + 1``.  The same is done in
    the reverse direction (title word -> query word), divided by
    ``number of title words + 1``.

    Returns ``pd.Series([id, a1, a2, a3, b1, b2, b3])`` where a* are the
    forward and b* the reverse features.
    """
    ID = r['id']

    def kept_words(text):
        # Empty text contributes the sentinel token; tokens matching
        # has_digit are skipped.
        if not text:
            text = "NOWORDS"
        return [w for w in text.split() if not has_digit.search(w)]

    qs = kept_words(r['query'])
    ts = kept_words(r['product_title'])

    # (totals, per-id table, total counts, per-id counts) for each statistic.
    forward_sources = (
        (train_word_matches, word_matches, train_word_counts, word_counts),
        (train_word_matches_score_raw, word_matches_score_raw, train_word_counts_score, word_counts_score),
        (train_word_matches_score_offset, word_matches_score_offset, train_word_counts_score, word_counts_score),
    )
    reverse_sources = (
        (train_word_matches2, word_matches2, train_word_counts2, word_counts2),
        (train_word_matches_score_raw2, word_matches_score_raw2, train_word_counts_score2, word_counts_score2),
        (train_word_matches_score_offset2, word_matches_score_offset2, train_word_counts_score2, word_counts_score2),
    )

    def averaged_weights(sources, outer, inner):
        # One feature per source: sum of pair weights over outer x inner,
        # divided by len(outer) + 1.
        features = []
        for totals, per_id, total_counts, per_id_counts in sources:
            table = trimmed_dict(totals, per_id, total_counts, per_id_counts, ID, outer)
            acc = 0.0
            for o in outer:
                partners = table.get(o, None)
                if not partners:
                    continue
                for i in inner:
                    acc += partners.get(i, 0)
            features.append(acc / (len(outer) + 1))
        return features

    forward = averaged_weights(forward_sources, qs, ts)
    reverse = averaged_weights(reverse_sources, ts, qs)
    return pd.Series([ID] + forward + reverse)

#myd1 = trimmed_dict(train_word_matches, word_matches, train_word_counts, word_counts, 214616)

In [290]:
# Spot check: leave-one-out raw-score weights for the single query word
# 'bracket', with id 1's own contribution removed.
myd2 = trimmed_dict(
    train_word_matches_score_raw,
    word_matches_score_raw,
    train_word_counts_score,
    word_counts_score,
    1,
    ['bracket'])

In [291]:
# Display the spot-check table computed in the previous cell.
myd2

{'bracket': {u'adjust': 0.0016933139534883721,
  u'adjusta': 0.0016933139534883721,
  u'adjustable': 0.0014534883720930232,
  u'adler': 0.0016933139534883721,
  u'aframe': 0.002180232558139535,
  u'air': 0.0014534883720930232,
  u'aluminum': 0.003393895348837209,
  u'american': 0.0014534883720930232,
  u'americano': 0.0029069767441860465,
  u'angle': 0.015385174418604652,
  u'anti': 0.001940406976744186,
  u'antique': 0.001940406976744186,
  u'antitip': 0.001940406976744186,
  u'arc': 0.0009665697674418605,
  u'arms': 0.0007267441860465116,
  u'articulating': 0.0055741279069767445,
  u'ashville': 0.0024273255813953487,
  u'assembly': 0.0016933139534883721,
  u'attract': 0.0009665697674418605,
  u'bale': 0.0029069767441860465,
  u'banbury': 0.0055668604651162796,
  u'bands': 0.0012136627906976743,
  u'bar': 0.002180232558139535,
  u'barton': 0.0016933139534883721,
  u'basin': 0.0009665697674418605,
  u'bathroom': 0.00532703488372093,
  u'bauhaus': 0.002180232558139535,
  u'bay': 0.00194

In [273]:
# Show the feature row whose index label is 1.
queries.loc[1]

query                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               90 unitsdegrees bracket
product_uid                                                                                                                                                 

In [268]:
# Inspect the per-id score tables for one id.
# NOTE: both objects are defaultdicts, so indexing an id that is missing
# inserts an empty entry as a side effect (trimmed_dict treats an empty entry
# the same as a missing one).
i = 1
word_counts_score[i], word_matches_score_raw[i]

(defaultdict(<function __main__.<lambda>>, {u'bracket': 0}),
 defaultdict(<function __main__.<lambda>>, {}))

In [270]:
# Raw-score pairs recorded for id 1 (forward direction); the table is a
# defaultdict, so this lookup creates the entry if it does not exist yet.
word_matches_score_raw[1]


defaultdict(<function __main__.<lambda>>, {})

In [282]:
# Reverse direction totals: summed relevance of labelled rows, keyed by the
# query words that were paired with the title word 'bracket'.
train_word_matches_score_raw2['bracket']

{u'ac': 2.0,
 u'aluminum': 4.0,
 u'anchor': 5.34,
 u'angle': 11.0,
 u'anodized': 4.0,
 u'anti': 2.67,
 u'around': 2.33,
 u'automatic': 2.33,
 u'awning': 8.01,
 u'banister': 5.67,
 u'bar': 2.67,
 u'barn': 3.0,
 u'bars': 3.33,
 u'beams': 12.0,
 u'bearings': 2.33,
 'bed': 2.33,
 u'bi': 14.34,
 u'bifold': 14.34,
 u'black': 14.33,
 u'blind': 11.67,
 'blinds': 4.34,
 u'board': 6.0,
 u'boltless': 16.33,
 u'box': 31.340000000000003,
 u'boxes': 6.99,
 u'bracket': 152.68,
 u'brackets': 164.35999999999996,
 u'brushed': 2.33,
 u'burning': 2.67,
 u'cabinet': 2.0,
 u'cedar': 5.34,
 u'ceiling': 28.33,
 u'cemetery': 3.33,
 u'center': 5.34,
 u'channel': 2.0,
 'chicken': 2.0,
 u'clamps': 12.67,
 u'closer': 4.67,
 u'closet': 36.35,
 'closetmaid': 25.33,
 u'clothes': 10.34,
 u'commercial': 2.67,
 u'composit': 5.67,
 u'composite': 6.67,
 u'copper': 2.33,
 u'corner': 28.989999999999995,
 u'countertop': 7.66,
 u'covers': 3.33,
 'curtain': 4.66,
 'curtains': 3.34,
 u'deck': 9.67,
 u'decks': 6.33,
 u'deere': 1

In [287]:
# Rows whose product title contains 'beechmont'.
queries[queries['product_title'].str.contains('beechmont')]

Unnamed: 0_level_0,query,product_uid,product_title,product_description,brand,attrs,brand_none,brand_unbranded,brand_hampton,brand_kohler,...,c34,c35,c36,c37,c38,c39,c40,c41,c42,c43
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
46803,3 unitsinches veranda fencing,111630,veranda beechmont standard duty standardduty 4 unitsfeet xby 3 unitsfeet black aluminum pre assembled preassembled fence gate,veranda black aluminum fence pre assembled preassembled gates come two widths 36 unitsinches 48 unitsinches pre assembled preassembled gates include hinges fence gate look great pool surround requires pool latch 73049459 sold separately check local pool codes goes fence kit 73017993made powder coate coated aluminum requires heavy duty gate post installation transferrable limite limited lifetime warranty powder coate coated aluminum rust corrode 5d8 unitsinches square pickets transferable limite limited lifetime warranty,none,,1.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0,0,15,0,0
206935,veranda 4 unitsfeet fence,194492,veranda 4 unitsfeet xby 6 unitsfeet beechmont black aluminum fence kit,veranda unassembled aluminum fence kits economical solution classic look wrought iron fence our patente patented pro locktm locking strips lock panels inside means screws used panel assembly unlike metal fence solutions coordinating gates come two widths 36 unitsinches 48 unitsinches gates posts sold separately coordinating posts veranda 2 unitsinches xby 2 unitsinches xby 5 7d8 unitsinches 4 xby 6 posts 73008710 73008711 73008712 73008713coordinating gates 3 unitsfeet 73009452 4 unitsfeet 73009453no screws used panel assembly each fence panel assembles minutes made powder coate coated aluminum transferrable limite limited lifetime warranty,veranda,bullet 05 made powder coate coated aluminum number rails panels 2 gauge 0 fencing product type ornamental metal fence panels bullet 06 transferrable limite limited lifetime warranty commercial residential use commercial residential actual panel thickness 625 nominal panel width ft 6 fence application decorative bullet 03 screws used panel assembly bullet 02 coordinating gates 3 unitsfeet 73009452 4 unitsfeet 73009453 fencing style ornamental bullet 04 each fence panel assembles minutes color family black structure type ornamental only actual panel width 72p625 hardware included color finish black cap top included yes nominal panel height ft 4 material metal mfg brand name veranda panel assembly unassembled panel pool approved yes bullet 01 coordinating posts veranda 2 unitsinches xby 2 unitsinches xby 5 7d8 unitsinches 4 xby 6 posts 73008710 73008711 73008712 73008713 actual panel height 47 product weight lb 10p5,0.0,0.0,0.0,0.0,...,0,0,1,0.25,7,1,7,15,0,0
209611,5 unitsinches 1d2 2 unitsfeet,196651,veranda beechmont 2 1d2 unitsinches xby 2 1d2 unitsinches xby 5 7d8 unitsfeet pewter heavy duty heavyduty aluminum fence line post,veranda aluminum fence posts pre punched prepunched except blank posts allow fast easy assembly line posts pre punched prepunched opposite sides join fence panels straight line visible screws after assembly post top hardware included each post cutting special tools required assembly made powder coate coated aluminum resist rust corrosion 1 post post top screws per package fence panel gate sold separately transferable limite limited lifetime warranty if mounting fence wood concrete surface use 2 1d2 unitsinches surface mount 73003962,none,,1.0,0.0,0.0,0.0,...,0,0,0,0.0,0,0,0,15,0,0


In [288]:
# Reverse-direction raw-score pairs recorded for id 46803; the table is a
# defaultdict, so this lookup creates the entry if it does not exist yet.
word_matches_score_raw2[46803]

defaultdict(<function __main__.<lambda>>, {})

In [292]:
# Trial run of the feature function on the first 1200 rows only.
woqta = known_labels.reset_index().iloc[0:1200].apply(get_features_no_id, axis=1)

In [230]:
# Full pass: one feature row per id in known_labels.
# (For a quick check, restrict the frame with .iloc[0:100] before .apply.)
woqta = known_labels.reset_index().apply(get_features_no_id, axis=1)
woqta.columns = ['id', 'woqt1', 'woqt2', 'woqt3', 'woqt4', 'woqt5', 'woqt6']
# The id column is cast back to int before it becomes the index, so that the
# joins below line up with the label frames.
woqta = woqta.assign(id=woqta['id'].astype(int)).set_index('id')

# Attach the features to the label frames and persist them.
WOQTA_TRAIN, WOQTA_TEST = idx_train.join(woqta), idx_test.join(woqta)

WOQTA_TRAIN.to_pickle('WOQTA_TRAIN')
WOQTA_TEST.to_pickle('WOQTA_TEST')

In [5]:
# Pre-built feature matrices and the matching xgboost buffers.
a = np.load(loc % 'train_data.npy')
b = np.load(loc % 'test_data.npy')
dtrain = xgb.DMatrix("train.buffer")
dtest = xgb.DMatrix("test.buffer")

# Same test_size / random_state (0.20, 147) as the id split used for the
# hold-out labels.
# NOTE(review): presumably this keeps the rows aligned with that split --
# verify that `a` and idx_train are in the same row order.
X_train, X_test, y_train, y_test = train_test_split(
    a,
    dtrain.get_label(),
    test_size=0.20,
    random_state=147)

In [None]:
# Same feature pass as the hold-out run, written to the *_ALL_* files.
# NOTE(review): presumably meant to be run after rebuilding known_labels
# without the hold-out masking -- confirm before relying on these files.
# (For a quick check, restrict the frame with .iloc[0:100] before .apply.)
woqta = known_labels.reset_index().apply(get_features_no_id, axis=1)
woqta.columns = ['id', 'woqt1', 'woqt2', 'woqt3', 'woqt4', 'woqt5', 'woqt6']
# Cast the id column back to int, then use it as the index for the joins.
woqta = woqta.assign(id=woqta['id'].astype(int)).set_index('id')

WOQTA_TRAIN, WOQTA_TEST = idx_train.join(woqta), idx_test.join(woqta)

WOQTA_TRAIN.to_pickle('WOQTA_ALL_TRAIN')
WOQTA_TEST.to_pickle('WOQTA_ALL_TEST')

In [263]:
# Mean relevance of the training part of the split; the 2.38 constant used
# for the "offset" statistic in calculate_word_pairs equals this value
# rounded to two decimals.
IDX_TRAIN.relevance.mean()

2.3806877288913637