In [1]:
import pandas as pd
import numpy as np
from chainer import Variable, optimizers
from sklearn.cross_validation import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
data = pd.read_csv('article_train_clean.csv',index_col=False)
print(data.shape)
data.rename(columns={'Unnamed: 0':'article_id','shared':'n_shares',}, inplace=True)
#Set article_id as index of the data frame (its values are used as labels for rows)
data.set_index('article_id',inplace=True)

(20005, 60)


In [3]:
q_col = pd.read_excel('query_column.xlsx')
q_col = q_col['q_id'].tolist()
print(len(q_col))
data.insert(1, 'q_id',q_col)

20005


In [4]:
#making sure data is loaded correctly
#qid_1(1-24)
#qid_2(25-46)--last n_shares = 1100
#qid_3(47-..)--first n_shares = 776
art_id = 46
print(data.loc[art_id , ['n_shares', 'q_id','c1'] ])
nq = data['q_id'].nunique()  # 360 total queries

n_shares    1100.000000
q_id           2.000000
c1             0.333333
Name: 46, dtype: float64


In [5]:
print(nq) #Number of avialble queries in the dataset
dq = data[data['q_id']==2] 
dq = dq.iloc[:,:]
print(dq) #printing second query for data demonstration

360
            n_shares  q_id        c1        c2        c3       c4        c5  \
article_id                                                                    
25             16900     2  0.571429  0.098655  0.000689  0.00096  0.000907   
26              9500     2  0.571429  0.089686  0.000704  0.00096  0.001105   
27              1600     2  0.428571  0.000000  0.000000  0.00000  0.000000   
28              1900     2  0.428571  0.041893  0.000871  0.00096  0.001301   
29              1100     2  0.428571  0.041185  0.000762  0.00096  0.001058   
30              1400     2  0.285714  0.025372  0.000857  0.00096  0.001037   
31              1200     2  0.571429  0.054402  0.000703  0.00096  0.000998   
32              1600     2  0.523810  0.071159  0.000539  0.00096  0.000712   
33              1700     2  0.428571  0.053576  0.000932  0.00096  0.001160   
34             15000     2  0.285714  0.073283  0.000749  0.00096  0.001097   
35              1300     2  0.238095  0.058768  

In [6]:
# preparing the testing and training data, here I use 80% of the queries for training and reserve 20% for testing
# then the training and testing labels are separated from the training and testing features, labels are
# basically the values we need to predict and features are the values of the feature vector that describe 
#each record
nt_tst = 0.2*nq # 20% of the number of queries are used as test data (72)
nt_train = 0.8*nq # 80% of the number of queires are for training (288)
X_test = data.query('1 <= q_id <= 72')
X_train = data.query('73 <= q_id <= 288')
y_test =  X_test.loc[:,['n_shares','q_id']]
X_test.drop(["n_shares"], axis = 1, inplace = True)
y_train =  X_train.loc[:,['n_shares','q_id']]
X_train.drop(["n_shares"], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [7]:
from chainer import Variable, optimizers
import seaborn as sns
import matplotlib.pyplot as plt
from chainer import Chain
import chainer.functions as F
import chainer.links as L

class MLP(Chain):
   
    #n_in: the number of features  in the feature vector  
    #n_hidden: number of hidden layers passed to the model
    def __init__(self, n_in, n_hidden):
        super(MLP, self).__init__(
            l1=L.Linear(n_in, n_hidden),
            l2=L.Linear(n_hidden, n_hidden),
            l3=L.Linear(n_hidden, 1)
        )
        

    def __call__(self, x):
        h1 = F.relu(self.l1(x))
        h2 = F.relu(self.l2(h1))
        return self.l3(h2)


class RankNet(Chain):

    def __init__(self, predictor):
        super(RankNet, self).__init__(predictor=predictor)

    # the model works in a pairwise approach where each pair of articles are grouped together and the model
    # assigns initla score for each of them using the random inital weights, these scores are then 
    # mapped to a sigmoid function (F.log(1 + F.exp(-s_diff))) which is a continous function suitable for 
    # optimisation used for measuring th eporbability of ranking an article above the other based on the scores
    # that the model predicted, then we penalize the output of this predicted probability from the actual
    # ranking probability (1 if A>B, 0 if A < B, 0.5 if A=B) that we know based on the labels of the training data
    # this deviation from the actual porbability is measured using a cross entropy cost function
    # More detailed information about the model are found in the attached report
    def __call__(self, x_i, x_j, t_i, t_j):
        s_i = self.predictor(x_i)
        s_j = self.predictor(x_j)
        s_diff = s_i - s_j
        if t_i.data > t_j.data:
            S_ij = 1
        elif t_i.data < t_j.data:
            S_ij = -1
        else:
            S_ij = 0
        self.loss = (1 - S_ij) * s_diff / 2. + F.log(1 + F.exp(-s_diff))
        return self.loss



In [8]:
# this is the performance measurement factor, NDCG is used for measuring the perofmance of information 
# retreival models by comparing the ouput ranking with the actualr ranking, the way it works is by 
# having more peanlization on wrong rankings at the top and less weight on the wrong ranking at the bottom
def ndcg(y_true, y_score, k=10):
    y_true = y_true.ravel()
    y_score = y_score.ravel() 
    y_true_sorted = sorted(y_true, reverse=True)
    ideal_dcg = 0
    for i in range(k):
        ideal_dcg += (y_true_sorted[i]) / np.log2(i + 2)
    dcg = 0
    argsort_indices = np.argsort(y_score)[::-1]
    for i in range(k):
        dcg += (y_true[argsort_indices[i]]) / np.log2(i + 2)
    ndcg = dcg / ideal_dcg
    return ndcg

In [9]:
# visualizing query 37 records 
dtmp = X_train[X_train['q_id']==73]
print(dtmp.iloc[:,0:5])
dtmp = y_train[y_train['q_id']==73]
print(dtmp.iloc[:,0:5])

            q_id        c1        c2        c3       c4
article_id                                             
3864          73  0.380952  0.115294  0.000575  0.00096
3865          73  0.238095  0.028558  0.000901  0.00096
3866          73  0.428571  0.105499  0.000681  0.00096
3867          73  0.333333  0.104437  0.000693  0.00096
3868          73  0.380952  0.013335  0.001073  0.00096
3869          73  0.238095  0.025844  0.000936  0.00096
3870          73  0.476190  0.050153  0.000792  0.00096
3871          73  0.380952  0.056998  0.000786  0.00096
3872          73  0.380952  0.052514  0.000813  0.00096
3873          73  0.333333  0.025018  0.000912  0.00096
3874          73  0.190476  0.096413  0.000632  0.00096
3875          73  0.285714  0.084494  0.000683  0.00096
3876          73  0.428571  0.108449  0.000625  0.00096
3877          73  0.428571  0.080245  0.000693  0.00096
3878          73  0.428571  0.100425  0.000650  0.00096
3879          73  0.238095  0.065494  0.000757  

In [10]:
#Model parameters
n_dim = 58
n_iter = 10
n_hidden = 5
loss_step = 2
N_train = np.shape(X_train)[0] 
model = RankNet(MLP(n_dim, n_hidden))
optimizer = optimizers.Adam()
optimizer.setup(model)

N_train = np.shape(X_train)[0]
train_ndcgs = []
test_ndcgs = []

for step in range(n_iter):
    print("______________iteration_____________")
    print(step)
    print("______________train_____________")
    train_ndcg = 0
    test_ndcg = 0
    for qtmp in range(73,277):                                  #training over our training queries
        qx_train = X_train[X_train['q_id']==qtmp]   
        qx_train.drop(["q_id"], axis = 1, inplace = True)
        qx_train = np.float32(qx_train.values)                  #preparing traiing feature vectors
        n_q_items = np.shape(qx_train)[0]
        qy_train = y_train[y_train['q_id']==qtmp]
        qy_train.drop(["q_id"], axis = 1, inplace = True)
        qy_train = np.float32(qy_train.values)                  #preparing training labels
        for itmp in range(0,n_q_items):
             for jtmp in range(itmp+1,n_q_items):
                if itmp != jtmp:
                    x_i = Variable(qx_train[itmp].reshape(1, -1))
                    x_j = Variable(qx_train[jtmp].reshape(1, -1)) 
                    y_i = Variable(qy_train[itmp])
                    y_j = Variable(qy_train[jtmp])
                    loss = model(x_i, x_j, y_i, y_j)           #measuring loss (cost) function
                    model.zerograds()   #zero out gradients because backward propogation accumlates them
                    loss.backward()
                    optimizer.update()
        train_score = model.predictor(Variable(qx_train))
        tmp = ndcg(qy_train, train_score.data)
        train_ndcg = train_ndcg+ tmp
        print("train query")
        print(qtmp)
        print("train_ndcg")
        print(tmp)
    print("______________test_____________")    
    for qtst in range(1,71):
        qx_test = X_test[X_test['q_id']==qtst]
        qx_test.drop(["q_id"], axis = 1, inplace = True)
        n_q_items = np.shape(qx_test)[0]
        qy_test = y_test[y_test['q_id']==qtst]
        qy_test.drop(["q_id"], axis = 1, inplace = True)
        test_score = model.predictor(Variable(np.float32(qx_test.values)))
        tmp = ndcg(np.float32(qy_test.values), test_score.data)
        test_ndcg = test_ndcg + tmp
        print("test query")
        print(qtst)
        print("test_ndcg")
        print(tmp)
if (step + 1) % loss_step == 0:
    train_ndcgs.append(train_ndcg/216)
    test_ndcgs.append(test_ndcg/72)
    print("step: {}".format(step + 1))
    print("NDCG@10 | train: {}, test: {}".format(
        train_ndcg/216, test_ndcg/72))


sns.set_context("poster")
plt.plot(train_ndcgs, label="Train")
plt.plot(test_ndcgs, label="Test")
xx = np.linspace(0, n_iter / loss_step, num=n_iter / loss_step + 1)
labels = np.arange(loss_step, n_iter + 1, loss_step)
plt.xticks(xx, labels, rotation=45)
plt.legend(loc="best")
plt.xlabel("step")
plt.ylabel("NDCG@10")
plt.ylim(0, 1.1)
plt.tight_layout()
plt.show()



______________iteration_____________
0
______________train_____________
train query
73
train_ndcg
0.5926594353696825
train query
74
train_ndcg
0.6453661679467113
train query
75
train_ndcg
0.2275615973439507
train query
76
train_ndcg
0.6307773743003677
train query
77
train_ndcg
0.4065165318490817
train query
78
train_ndcg
0.1430822334687798
train query
79
train_ndcg
0.7255584150894336
train query
80
train_ndcg
0.1367877076751512
train query
81
train_ndcg
0.2251223783473866
train query
82
train_ndcg
0.8354526957709834


KeyboardInterrupt: 