In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt


import xgboost as xgb

from sklearn import linear_model
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor
#from sklearn import pipeline, model_selection
from sklearn import pipeline, grid_search
#from sklearn.feature_extraction import DictVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import TruncatedSVD
#from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, make_scorer

from collections import defaultdict

import re
import time
import math
import random

# Seed Python's built-in `random` module for repeatable runs.
# Note: this does not seed NumPy's generator (np.random).
random.seed(2017)

In [2]:
# Directory template for the feature store: `loc % name` gives a full file path.
# NOTE(review): absolute, machine-specific path; the files are pickles, so only
# load them from a trusted location.
loc = '/home/ec2-user/data/hd/features/%s'

# queries   : per-id feature/text frame (joined onto the labels further down)
# idx_train : labelled rows
# idx_test  : unlabelled rows
queries, idx_train, idx_test = (
    pd.read_pickle(loc % file_name)
    for file_name in ('FEATURES_WITH_TEXT_1.data', 'LABELS_TRAIN.df', 'LABELS_TEST.df')
)

In [3]:
# Ids of the fixed hold-out set; every other labelled id is used for training.
validation_idx = pd.read_csv(
    loc % 'valid_set.csv',
    index_col='id',
).index
train_idx = idx_train.index.difference(validation_idx)

# Cell output: sizes of the training and validation id sets.
tuple(ids.shape for ids in (train_idx, validation_idx))

((53907,), (20160,))

In [4]:
# A random 80/20 split was used at some point:
#   IDX_TR, IDX_TE = train_test_split(idx_train.index, test_size=0.20, random_state=117)
# The split now comes from the fixed validation id file instead.
IDX_TR = np.array(train_idx)
IDX_TE = np.array(validation_idx)

In [5]:
# Labelled rows used for training keep their true relevance.
IDX_TRAIN = idx_train.loc[IDX_TR]

# Validation rows: keep the true label in `orig_rel` and mask `relevance` with -1
# so they are treated exactly like unlabelled rows downstream.
# The explicit copy makes it unambiguous that the assignments below modify a new
# frame rather than a selection of `idx_train` (avoids SettingWithCopyWarning).
IDX_TEST = idx_train.loc[IDX_TE].copy()
IDX_TEST['orig_rel'] = IDX_TEST['relevance']
IDX_TEST['relevance'] = -1

# The real test rows have no label; they get the same -1 marker.
# Note: this adds the column to `idx_test` in place, and later cells that join on
# `idx_test` carry it along.
idx_test['relevance'] = -1

# One frame with every (id, relevance) row, enriched with the per-id columns of `queries`.
known_labels = pd.concat([IDX_TRAIN, IDX_TEST, idx_test]).join(queries)

In [7]:
# Label distribution; -1 marks rows whose relevance is masked or unknown.
known_labels['relevance'].value_counts()

-1.00    186853
 3.00     15129
 2.33     11382
 2.67     11140
 2.00      8053
 1.67      4659
 1.33      2042
 1.00      1459
 2.50        11
 2.75         9
 1.75         8
 2.25         7
 1.50         5
 1.25         3
Name: relevance, dtype: int64

In [8]:
# Matches any token that contains a digit or the substrings "units" / "xby";
# such tokens are excluded from the word-pair statistics.
has_digit = re.compile('([0-9]|units|xby)')

def calculate_word_pairs(a1, a2, labels=None, score_offset=2.38):
    """Collect per-record co-occurrence statistics between the words of two text columns.

    For every row, each word of column ``a1`` is paired with each word of column
    ``a2`` (tokens matching ``has_digit`` are skipped; an empty text is replaced
    by the single token "NOWORDS").

    Parameters
    ----------
    a1, a2 : str
        Names of the "outer" and "inner" text columns.
    labels : DataFrame, optional
        Frame indexed by ``id`` holding ``a1``, ``a2`` and ``relevance``.
        Defaults to the notebook-level ``known_labels``.
    score_offset : float, optional
        Value subtracted from the relevance for the offset table.
        # presumably the mean training relevance -- TODO confirm

    Returns
    -------
    tuple of five nested defaultdicts, all keyed by record id first:
        word_matches[id][w1][w2]              number of (w1, w2) pairings
        word_matches_score_raw[id][w1][w2]    sum of relevance over pairings (relevance > 0 only)
        word_matches_score_offset[id][w1][w2] sum of (relevance - score_offset), same rows
        word_counts[id][w1]                   number of a2 words paired with w1
        word_counts_score[id][w1]             same, counted only when relevance > 0
    """
    if labels is None:
        labels = known_labels
    frame = labels.reset_index()

    word_matches = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_matches_score_raw = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_matches_score_offset = defaultdict(lambda : defaultdict(lambda : defaultdict(lambda: 0)))
    word_counts = defaultdict(lambda : defaultdict(lambda : 0))
    word_counts_score = defaultdict(lambda : defaultdict(lambda : 0))

    # Explicit loop instead of DataFrame.apply(axis=1): the accumulation is a side
    # effect, and older pandas versions may invoke the applied function twice on
    # the first row, which would double-count that record.
    rows = zip(frame['id'], frame[a1], frame[a2], frame['relevance'])
    for ID, query, title, score in rows:
        if not query:
            query = "NOWORDS"
        qs = [q for q in query.split() if not has_digit.search(q)]

        if not title:
            title = "NOWORDS"
        ts = [t for t in title.split() if not has_digit.search(t)]

        # Rows without a usable label (relevance <= 0, e.g. the -1 marker) only
        # contribute to the unscored tables.
        labelled = score > 0
        for q in qs:
            c = 0
            c_score = 0
            for t in ts:
                word_matches[ID][q][t] += 1
                if labelled:
                    word_matches_score_raw[ID][q][t] += score
                    word_matches_score_offset[ID][q][t] += (score - score_offset)
                    c_score += 1
                c += 1
            word_counts[ID][q] += c
            word_counts_score[ID][q] += c_score

    return (word_matches, word_matches_score_raw, word_matches_score_offset, word_counts, word_counts_score)
    
# Direction 1: each query word paired with each product-title word.
(word_matches,
 word_matches_score_raw,
 word_matches_score_offset,
 word_counts,
 word_counts_score) = calculate_word_pairs('query', 'product_title')

# Direction 2: each product-title word paired with each query word.
(word_matches2,
 word_matches_score_raw2,
 word_matches_score_offset2,
 word_counts2,
 word_counts_score2) = calculate_word_pairs('product_title', 'query')

In [9]:
def get_train_dict(given_word_matches):
    """Sum per-id pair tables into one plain ``{word: {other_word: total}}`` dict.

    ``given_word_matches`` maps id -> word -> other_word -> value; the id level
    is collapsed by adding up the values of identical (word, other_word) pairs.
    """
    totals = {}
    for per_id in given_word_matches.values():
        for word, partners in per_id.items():
            for other, value in partners.items():
                row = totals.setdefault(word, {})
                row[other] = row.get(other, 0) + value
    return totals

def get_train_counts(given_word_count):
    """Sum per-id word counts into one plain ``{word: total}`` dict.

    ``given_word_count`` maps id -> word -> count; the id level is collapsed.
    """
    totals = {}
    for per_id in given_word_count.values():
        for word, count in per_id.items():
            totals[word] = totals.get(word, 0) + count
    return totals

# Corpus-wide totals, query-word -> title-word direction.
(train_word_matches,
 train_word_matches_score_raw,
 train_word_matches_score_offset) = map(
    get_train_dict,
    (word_matches, word_matches_score_raw, word_matches_score_offset),
)
train_word_counts, train_word_counts_score = map(
    get_train_counts,
    (word_counts, word_counts_score),
)

# Corpus-wide totals, title-word -> query-word direction.
(train_word_matches2,
 train_word_matches_score_raw2,
 train_word_matches_score_offset2) = map(
    get_train_dict,
    (word_matches2, word_matches_score_raw2, word_matches_score_offset2),
)
train_word_counts2, train_word_counts_score2 = map(
    get_train_counts,
    (word_counts2, word_counts_score2),
)


# Ad-hoc inspection of the strongest partners of a given word:
#sorted([(v, k) for k,v in train_word_matches_score_offset['pt'].items()], reverse=True)[0:12], 
#sorted([(v, k) for k,v in train_word_matches_score_offset2['showerhead'].items()], reverse=True)[0:12]

In [10]:
def trimmed_dict(zad, d, zawc, wc, no_id, qs):
    """Leave-one-out, normalised view of a corpus-wide pair table for one record.

    Parameters
    ----------
    zad : dict
        Corpus totals, ``{word: {other_word: value}}``. Not modified.
    d : dict
        Per-record contributions, ``{id: {word: {other_word: value}}}``.
    zawc : dict
        Corpus word counts, ``{word: count}``. Not modified.
    wc : dict
        Per-record word counts, ``{id: {word: count}}``.
    no_id : hashable
        Id of the record whose own contribution is subtracted.
    qs : iterable of str
        Words for which rows are produced. Every word that record ``no_id``
        contributed to in ``d`` is expected to be in ``qs``.

    Returns
    -------
    dict
        ``{word: {other_word: value / (count + 1.0)}}`` where ``value`` and
        ``count`` exclude the record's own contribution. Entries whose value
        is below 1e-7 and words left without entries are omitted.

    NOTE(review): the 1e-7 cut-off also removes negative values, so tables
    holding signed values (e.g. the score-offset tables) lose all their
    non-positive entries here -- confirm that is intended.
    """
    ded = d.get(no_id, None)
    dewc = wc.get(no_id, None)

    # Copy only the rows that are needed; the corpus tables stay untouched.
    ad = {}
    for q in qs:
        adq = zad.get(q, None)
        ad[q] = adq.copy() if adq else {}

    # Subtract this record's own contribution. Only the per-word counts that
    # actually change are remembered, instead of copying the whole count dict
    # on every call.
    removed = {}
    if ded:
        for q, tc in ded.items():
            for t, c in tc.items():
                ad[q][t] -= c
            removed[q] = dewc[q]

    # Build the result in a new dict: deleting from a dict while iterating over
    # it raises RuntimeError on Python 3.
    trimmed = {}
    for q, tc in ad.items():
        denom = (zawc.get(q, 0) - removed.get(q, 0)) + 1.0
        kept = {}
        for t, a in tc.items():
            if a < 0.0000001:
                continue
            kept[t] = a*1.0/denom
        if kept:
            trimmed[q] = kept
    return trimmed

def _pair_table_score(table, outer_words, inner_words):
    """Sum table[o][i] over all (outer, inner) word pairs, divided by len(outer_words) + 1."""
    total = 0.0
    for outer in outer_words:
        row = table.get(outer, None)
        if row:
            for inner in inner_words:
                total += row.get(inner, 0)
    return total / (len(outer_words) + 1)


def get_features_no_id(r):
    """Compute the six word co-occurrence features for one row.

    The row must provide ``id``, ``query`` and ``product_title``. For each of
    the three query->title tables and the three title->query tables, the
    row's own contribution is removed via ``trimmed_dict`` and the remaining
    normalised values are summed over all word pairs of the row.

    Returns a Series ``[id, a1, a2, a3, b1, b2, b3]`` where a* use the
    query->title tables (count, raw score, offset score) and b* the
    title->query tables in the same order.
    """
    ID = r['id']

    # Empty texts collapse to the placeholder token; tokens matching has_digit are ignored.
    qs = [w for w in (r['query'] or "NOWORDS").split() if not has_digit.search(w)]
    ts = [w for w in (r['product_title'] or "NOWORDS").split() if not has_digit.search(w)]

    # (corpus totals, per-id contributions, corpus counts, per-id counts)
    query_to_title = (
        (train_word_matches, word_matches, train_word_counts, word_counts),
        (train_word_matches_score_raw, word_matches_score_raw, train_word_counts_score, word_counts_score),
        (train_word_matches_score_offset, word_matches_score_offset, train_word_counts_score, word_counts_score),
    )
    title_to_query = (
        (train_word_matches2, word_matches2, train_word_counts2, word_counts2),
        (train_word_matches_score_raw2, word_matches_score_raw2, train_word_counts_score2, word_counts_score2),
        (train_word_matches_score_offset2, word_matches_score_offset2, train_word_counts_score2, word_counts_score2),
    )

    features = [ID]
    for sources, outer_words, inner_words in ((query_to_title, qs, ts), (title_to_query, ts, qs)):
        for totals, per_id, counts, per_id_counts in sources:
            table = trimmed_dict(totals, per_id, counts, per_id_counts, ID, outer_words)
            features.append(_pair_table_score(table, outer_words, inner_words))
    return pd.Series(features)

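# Debug example (trimmed_dict also needs the row's word list as a sixth argument):
#myd1 = trimmed_dict(train_word_matches, word_matches, train_word_counts, word_counts, 214616, ['some', 'query', 'words'])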

In [11]:
# Compute the six co-occurrence features for every row of known_labels.
woqta = known_labels.reset_index().apply(get_features_no_id, axis=1)
woqta.columns = ['id', 'woqt1', 'woqt2', 'woqt3', 'woqt4', 'woqt5', 'woqt6']

# The id comes back in the same Series as the float features; cast it back to
# int and use it as the index so the frames below can join on it.
woqta['id'] = woqta['id'].astype(int)
woqta = woqta.set_index('id')

# Full train / test feature sets, written as pickles to the working directory.
WOQTA_TRAIN = idx_train.join(woqta)
WOQTA_TRAIN.to_pickle('WOQTAL_TRAIN_ALEX')

WOQTA_TEST = idx_test.join(woqta)
WOQTA_TEST.to_pickle('WOQTAL_TEST_ALEX')

In [15]:
# Local train/validation variants of the feature sets.
# Note: these rebind WOQTA_TRAIN / WOQTA_TEST, replacing the full-data frames above.
WOQTA_TRAIN = IDX_TRAIN.join(woqta).reset_index()

# The masked `relevance` column (-1) is dropped; `orig_rel` keeps the true label.
WOQTA_TEST = IDX_TEST.drop('relevance', axis=1).join(woqta).reset_index()

WOQTA_TRAIN.to_csv('word_co_train_local.csv', index=False)
WOQTA_TEST.to_csv('word_co_validate_local.csv', index=False)

In [None]:
# Timing note from one run: started around 14:50,
# finished around 16:50 (roughly two hours).

In [14]:
# Display the validation frame: `relevance` is masked to -1, `orig_rel` holds the true label.
IDX_TEST

Unnamed: 0_level_0,relevance,orig_rel
id,Unnamed: 1_level_1,Unnamed: 2_level_1
2,-1,3.00
16,-1,2.33
18,-1,3.00
35,-1,3.00
69,-1,1.00
88,-1,1.33
113,-1,2.00
117,-1,2.67
123,-1,3.00
136,-1,2.00
