In [1]:
import pandas as pd
import numpy as np
import scipy as sp
from scipy import stats

import os
import subprocess
import sys
import shutil

import codecs

### We prepare the dataset that will be used by the different Machine Learning models
We will use Logistic Regression and a Support Vector Machine to obtain the weights used in the Condorcet fusion.

In [29]:
# Project root and the sub-directories used by the rank-fusion pipeline.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = "C:\\Users\\DavideDP\\AnacondaProjects\\Project\\RankFusion"
dir_in = "input"
dir_w = "weights"
dir_norm = "norm"
dir_comb = "comb"

# NOTE(review): listFiles is not defined in the visible part of the notebook;
# presumably it returns the names of the run files found in path/dir_in -- confirm.
filename_list = listFiles(path, dir_in)

# Dataset maps (topicID, docID) -> {run file name: rank of the document in that run}.
Dataset = {}
run_columns = ["topicID", "q0", "docID", "rank", "score", "model"]
for filename_in in filename_list:
    path_in = os.path.join(path, dir_in, filename_in)
    in_file = pd.read_csv(path_in, delimiter=" ", header=None)
    in_file.columns = run_columns
    # The score could be used instead of the rank, but it would not make much sense.
    # Iterate the three needed columns in lockstep instead of indexing row by row.
    for topicID, documentID, rank in zip(in_file["topicID"], in_file["docID"], in_file["rank"]):
        Dataset.setdefault((topicID, documentID), {})[filename_in] = rank

In [30]:
# Relevance judgements (qrels) file.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_pool = "C:\\Users\\DavideDP\\AnacondaProjects\\Project\\terrier-core-4.2\\share\\TIPSTER\\pool\\qrels.trec7.txt"
in_file = pd.read_csv(path_pool, delimiter=" ", header=None)
in_file.columns = ["topicID", "q0", "docID", "state"]

# DataY maps (topicID, docID) -> value of the "state" column.
# If a pair appears more than once, the last row wins (same as a sequential loop).
DataY = {
    (topicID, documentID): state
    for topicID, documentID, state in zip(in_file["topicID"], in_file["docID"], in_file["state"])
}

In [31]:
# Build the feature matrix X (one row per judged (topic, doc) pair, one column per
# run file, values are ranks) and the label vector Y (the "state" from the qrels).
# Only pairs that were retrieved by every run and that have a judgement are kept.
n_runs = len(filename_list)
X = []
Y = []
for d, ranks_by_run in Dataset.items():
    if len(ranks_by_run) == n_runs and d in DataY:
        # Column order follows filename_list so coefficients can be mapped back to runs.
        X.append([ranks_by_run[f] for f in filename_list])
        Y.append(DataY[d])

X = np.asarray(X)
Y = np.asarray(Y)

# Class frequencies as rows of [label, count].
# scipy.stats.itemfreq is deprecated/removed; np.unique gives the same table.
labels, counts = np.unique(Y, return_counts=True)
frequencies = np.column_stack((labels, counts))
print(frequencies)

[[    0 10276]
 [    1   943]]


In [32]:
from sklearn import preprocessing

# Standardise every feature column of X with a StandardScaler fitted on X itself.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed in a later cell, so test-set statistics influence the scaling.
scaler = preprocessing.StandardScaler()
X = scaler.fit_transform(X)



### We split the dataset into train and test sets

In [33]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

### We train a Logistic Regression model on the training set
We print its training and test error, and then the learned coefficients (one per run file) that can be used as weights in Condorcet.

In [34]:
from sklearn import linear_model

# The SAGA solver shuffles the data, so the seed is fixed to make the fitted
# coefficients reproducible from one run to the next.
logreg = linear_model.LogisticRegression(solver='saga', n_jobs=-1, random_state=42)
logreg.fit(X_train, y_train)

# score() is the mean accuracy, so 1 - score is the misclassification rate.
training_error = 1. - logreg.score(X_train, y_train)
test_error = 1. - logreg.score(X_test, y_test)

print("Best logistic regression training error: %f" % training_error)
print("Best logistic regression test error: %f" % test_error)

Best logistic regression training error: 0.084740
Best logistic regression test error: 0.082353




In [35]:
# Show the run files next to the fitted logistic-regression coefficients.
# print() with parentheses works on both Python 2 and Python 3.
print(filename_list)
print(logreg.coef_)

['BB2c1.0_1.res', 'BM25b0.75_0.res', 'DFR_BM25c1.0_2.res', 'DLH_3.res', 'DPH_4.res', 'In_expB2c1.0_5.res', 'LemurTF_IDF_6.res', 'LGDc1.0_7.res', 'PL2c1.0_8.res', 'TF_IDF_9.res']
[[ 0.34881807  0.63175059  0.76863758  0.46072256 -0.04495694 -0.52890744
  -0.75808169 -0.94542233 -0.08658134 -0.52578205]]


### We train a linear Support Vector Machine on the dataset to obtain the weights that will be used

In [38]:
from sklearn.svm import LinearSVC

# Linear SVM trained on the same split as the logistic regression.
clf = LinearSVC(random_state=0)
clf.fit(X_train, y_train)

# score() is the mean accuracy, so 1 - score is the misclassification rate.
training_error = 1. - clf.score(X_train, y_train)
test_error = 1. - clf.score(X_test, y_test)

# Messages name the model actually evaluated here (they previously said
# "logistic regression", a copy-paste leftover).
print("Linear SVM training error: %f" % training_error)
print("Linear SVM test error: %f" % test_error)

Best logistic regression training error: 0.084621
Best logistic regression test error: 0.082353


### We take the weights from the linear SVM model and transform them into the final weights that will be used in CondorcetML

In [96]:
from sklearn import preprocessing

print(filename_list)

# SVM coefficients as a column vector (one row per run file); reshape(-1, 1)
# adapts to the number of features instead of assuming exactly 10.
v = (clf.coef_ * 100)[0].reshape(-1, 1)

# Standardise the coefficients, then shift and rescale them: (z + 1.2) * 50.
# Any coefficient more than 1.2 standard deviations below the mean ends up negative.
# NOTE(review): this rebinds `scaler`, replacing the feature scaler fitted earlier.
scaler = preprocessing.StandardScaler().fit(v)
scala = scaler.transform(v)
scala = (scala + 1.2) * 0.5 * 100
print(scala)

['n_BB2c1.0_1.res', 'n_BM25b0.75_0.res', 'n_DFR_BM25c1.0_2.res', 'n_DLH_3.res', 'n_DPH_4.res', 'n_In_expB2c1.0_5.res', 'n_LemurTF_IDF_6.res', 'n_LGDc1.0_7.res', 'n_PL2c1.0_8.res', 'n_TF_IDF_9.res']
[[109.80170621]
 [131.37245762]
 [127.43305118]
 [ 91.39328224]
 [ 60.49551915]
 [  5.12221583]
 [ 11.63204066]
 [  5.67932307]
 [ 56.91590495]
 [  0.15449908]]


### Here we try to reduce the impact of the largest weights by taking their logarithm
We print the values that will be used in CondorcetLog.

In [99]:
# Base-2 logarithm of the rescaled weights; entries of scala below 1 come out negative.
log_weights = np.log2(scala)
print(log_weights)

[[ 6.77875666]
 [ 7.03751903]
 [ 6.99359569]
 [ 6.51401622]
 [ 5.91875638]
 [ 2.35676804]
 [ 3.54003231]
 [ 2.50571898]
 [ 5.83075996]
 [-2.69432984]]
