Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Merged SVD Implicit Feature

  • Loading branch information...
commit 0039fa2dd710ccf5868d76d607df62d18c83cb75 1 parent c7a67eb
ChrisRackauckas authored
View
7 PreProcess/preProcess.py
@@ -23,6 +23,9 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
p.start()
processes.append(p)
+ ### Baidu Dataset Specific ###
+ ### Preprocesses the Baidu extra features data ###
+
if PROCESS_TAGS:
print('... Processing Movie Tag Data')
p=mproc.Process(target=processMovieTags,
@@ -41,9 +44,13 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
args=(utils.USER_HISTORY_PATH,utils.PROCESSED_HISTORY,utils.ORIGINAL_DATA_PATH))
p.start()
processes.append(p)
+
+
for p in processes:
p.join()
+ ### End Baidu Dataset Specific ###
+
# De-effects data file
if DE_EFFECT:
deEffectData(utils.ORIGINAL_DATA_CLEAN_PATH,
View
36 config.py
@@ -1,16 +1,17 @@
################### Select Models ##################
models = [
- ['basicFM','FM','Basic',['2']],
- ['basicMovTag','FM','BasicMovieTag',['2']],
- ['nearNeib', 'FM', 'NearestNeighbor',['2']],
- ['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
- ['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
- ['userHist','FM','UserHistory',['2']],
- ['userSocial','FM','UserSocial',['2']]
-
- #['basicSVD','SVD','Basic',[]]
- ]
+ ['basicFM','FM','Basic',['2']],
+ #['basicMovTag','FM','BasicMovieTag',['2']],
+ #['nearNeib', 'FM', 'NearestNeighbor',['2']],
+ #['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
+ #['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
+ #['userHist','FM','UserHistory',['2']],
+ #['userSocial','FM','UserSocial',['2']]
+
+ ['basicSVD','SVD','Basic',[]],
+ ['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
+ ]
# Defining models:
# Each element is a list:
@@ -28,8 +29,8 @@
['BRT','BRT',[]],
['BMAR','BMAR',[]],
#['RFR' ,'RFR' ,[]], # Large memory requirement
- ['Lasso', 'Lasso', []],
- ['GBRT','GBRT',['10']]
+ ['Lasso', 'Lasso', []]
+ #['GBRT','GBRT',['10']]
]
# Defining ensemble models:
@@ -42,16 +43,21 @@
synthModel = ['GBRT','GBRT',['10']]
################### Select Parts ##################
+LAPTOP_TEST = True # uses small data set to run features on laptop
TRIALS = 1
PRE_PROCESS = True
# ---- ---- PreProcess Selection ---- ---- #
TEST_SUBSET = True # uses small data set
-PROCESS_TAGS = True # generates new file for movie tag feature
+
+### Baidu Specific Preprocess ###
+PROCESS_TAGS = False # generates new file for movie tag feature
PROCESS_SOCIAL = True # cuts out all the extra social users not in data set
PROCESS_HISTORY = True
-DE_EFFECT = False # If De-effect is false, model predictions are correct
+### End Baidu Specific ####
+
+DE_EFFECT = False # If De-effect is false, intermittent predictions are correct
# ---- ---- ---- ---- ----- ---- ---- ---- #
SETUP_MODELS = True
RUN_MODELS = True
@@ -82,6 +88,8 @@
SVD_REGULARIZATION_ITEM = '.004'
SVD_REGULARIZATION_USER = '.004'
SVD_REGULARIZATION_GLOBAL = '.001'
+
+SVD_REGULARIZATION_FEEDBACK = '.004'
SVD_NUM_FACTOR = '64'
SVD_ACTIVE_TYPE = '0'
SVD_NUM_ITER = '40'
View
3  utils/FMModel.py
@@ -123,6 +123,8 @@ def setupFeatures(self):
self.addNearestNeighbor(self.bootCV,self.featCV,moviesRatedByUserDict,movieLocationDict,'CV')
self.addNearestNeighbor(self.bootTest,self.featTest,moviesRatedByUserDict,movieLocationDict,'test')
+ ### Baidu Dataset Specific Features ###
+
# ---- ---- Movie Tag Features ---- ---- #
elif self.featureSet == 'BasicMovieTag':
@@ -178,6 +180,7 @@ def setupFeatures(self):
self.userSocial(self.bootCV,self.featCV,userLocationDict,movieLocationDict,userSocialDict,'CV')
self.userSocial(self.bootTest,self.featTest,userLocationDict,movieLocationDict,userSocialDict,'test')
+ ### End Baidu Dataset Specific Features ###
def addNearestNeighbor(self,finPath, foutPath,moviesRatedByUserDict,movieLocationDict,step):
#-----------------------------------------------------------------
View
116 utils/ImplicitFeedbackFunctions.py
@@ -0,0 +1,116 @@
def reIndex_Implicit(fin):
    """Build dense 1-based ID maps from a ratings file and average its ratings.

    Every non-blank line of ``fin`` is split on whitespace and read as
    ``<user id> <item id> <rating> ...``.  The rating is truncated to an
    integer before it is added to the running total.

    Args:
        fin: path of the ratings file to scan.

    Returns:
        A tuple ``(uidDic, iidDic, avg)``.  ``uidDic`` and ``iidDic`` map each
        original user / item ID to a new ID handed out in order of first
        appearance, starting at 1.  ``avg`` is the mean of the truncated
        ratings, or ``0.0`` when the file contains no rating lines.
    """
    print("Reindexing Origin Data Set and Building the Correspondence Dics")
    uidDic = {}
    iidDic = {}
    total = 0.0   # running sum of ratings (named to avoid shadowing builtin sum)
    count = 0     # number of rating lines seen

    # The with-block guarantees the handle is released even if a line fails
    # to parse.
    with open(fin, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                # Blank line (e.g. a trailing newline at end of file).
                continue
            uid = int(arr[0])
            iid = int(arr[1])
            total += int(float(arr[2]))
            count += 1

            # New IDs are consecutive, so the next free ID is len(dict) + 1.
            if uid not in uidDic:
                uidDic[uid] = len(uidDic) + 1
            if iid not in iidDic:
                iidDic[iid] = len(iidDic) + 1

    # Guard against an empty file instead of dividing by zero.
    avg = total / count if count else 0.0
    print("Finished")
    return (uidDic, iidDic, avg)
+
+
def translate(fin, fout, Udic, ItemDic):
    """Rewrite ``fin`` into ``fout`` with user and item IDs re-indexed.

    Each non-blank input line is split on whitespace into
    ``<user id> <item id> [<rating> ...]``.  Lines whose user is missing from
    ``Udic`` or whose item is missing from ``ItemDic`` are dropped.  Surviving
    lines are written tab-separated and terminated with ``'\\r\\n'``; when a
    third field is present it is emitted as a truncated integer rating.

    Args:
        fin: path of the file to translate.
        fout: path of the translated file (overwritten).
        Udic: mapping from original user ID to new user ID.
        ItemDic: mapping from original item ID to new item ID.
    """
    print("Start Translation. Translating " +fin+" .")
    # Both handles are closed by the with-block even when a line fails to parse.
    with open(fin, 'r') as fi, open(fout, 'w') as fo:
        for line in fi:
            arr = line.split()
            if not arr:
                # Blank line: nothing to translate.
                continue
            uid = int(arr[0])
            iid = int(arr[1])
            has_rating = len(arr) > 2
            if has_rating:
                rating = str(int(float(arr[2])))
            if uid not in Udic or iid not in ItemDic:
                continue
            fields = [str(Udic[uid]), str(ItemDic[iid])]
            if has_rating:
                fields.append(rating)
            fo.write('\t'.join(fields) + '\r\n')
    print("Translation Finished.")
+
+
def userfeedback(fname):
    """Collect, per user, the items listed for that user in ``fname``.

    Each non-blank line is stripped and split on tabs; the first two fields
    are read as 1-based user and item IDs and converted to 0-based IDs.

    Args:
        fname: path of a tab-separated ``<user id>\\t<item id>[...]`` file.

    Returns:
        A dict mapping each 0-based user ID to the list of 0-based item IDs
        found for it, in file order (duplicates are kept).
    """
    feedback = {}
    # with-block: the handle is closed even if a line fails to parse.
    with open(fname, 'r') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                # Skip blank lines rather than failing on int('').
                continue
            attr = line.split('\t')
            uid = int(attr[0]) - 1
            iid = int(attr[1]) - 1
            feedback.setdefault(uid, []).append(iid)
    return feedback
+
def usergroup(fname):
    """Return the user order and per-user line counts of a grouped file.

    Each non-blank line is stripped and split on tabs; the first field is read
    as a 1-based user ID and converted to a 0-based ID.

    Args:
        fname: path of a tab-separated file whose first column is a user ID.

    Returns:
        A tuple ``(userorder, groupnum)``.  ``userorder`` lists a user ID each
        time the ID differs from the one on the previous line, so a user whose
        lines are not contiguous appears once per run.  ``groupnum`` maps each
        user ID to the total number of lines carrying it.
    """
    userorder = []
    groupnum = {}
    # None cannot collide with a real ID; the previous -1 sentinel matched the
    # 0-based ID produced by a raw ID of 0 and dropped that user.
    lastuid = None
    with open(fname, 'r') as fi:
        for line in fi:
            line = line.strip()
            if not line:
                # Skip blank lines rather than failing on int('').
                continue
            uid = int(line.split('\t')[0]) - 1
            groupnum[uid] = groupnum.get(uid, 0) + 1
            if uid != lastuid:
                userorder.append(uid)
                lastuid = uid
    return userorder, groupnum
+
def mkfeature(fout, userorder, groupnum, feedback):
    """Write the implicit-feedback feature file, one line per grouped user.

    Lines follow the order of ``userorder``.  Each line has the form::

        <lines in user group>\\t<number of feedback items>\\t<fid>:<value> ...\\n

    where every ``<value>`` is ``fnum ** -0.5`` (``fnum`` being the user's
    number of feedback items) printed with six decimals, and each
    ``<fid>:<value>`` pair is followed by a single space.

    Args:
        fout: path of the feature file to write (overwritten).
        userorder: user IDs in the order their groups must be written.
        groupnum: mapping from user ID to the size of that user's group.
        feedback: mapping from user ID to the list of item IDs to emit; a user
            absent from this mapping gets a line with a feedback count of 0
            and no features.
    """
    # with-block: the file is flushed and closed before the function returns,
    # so a later reader sees the complete file.
    with open(fout, 'w') as fo:
        for uid in userorder:
            items = feedback.get(uid, ())
            fnum = len(items)
            fo.write('%d\t%d\t' % (groupnum[uid], fnum))
            if fnum:
                # The weight depends only on fnum, so compute it once per user.
                weight = pow(fnum, -0.5)
                fo.write(''.join(['%d:%.6f ' % (i, weight) for i in items]))
            fo.write('\n')
+
+
def mkImplicitFeatureFile(ftrain,fgtrain,fout):
    """Build the implicit-feedback feature file ``fout``.

    usage:<training_file> <grouped training_file> <output>

    ``ftrain`` is read by ``userfeedback`` to obtain each user's item list,
    ``fgtrain`` is read by ``usergroup`` to obtain the user order and group
    sizes, and ``mkfeature`` writes the combined result to ``fout``.
    """
    per_user_items = userfeedback(ftrain)
    grouping = usergroup(fgtrain)
    # grouping is (userorder, groupnum); hand both to the writer with the
    # per-user item lists.
    mkfeature(fout, grouping[0], grouping[1], per_user_items)
View
5 utils/Model.py
@@ -10,19 +10,20 @@ def __init__(self,configModel,utils,strTrial):
self.bootCV = utils.MODEL_BOOT_PATH + \
'CV' + '_t' + strTrial
self.bootTest = utils.MODEL_BOOT_PATH + \
- 'test' + '_t' + strTrial
+ 'test' + '_t' + strTrial
self.featTrain = utils.MODEL_FEATURED_PATH + self.tag + \
'_train' + '_t' + strTrial
self.featCV = utils.MODEL_FEATURED_PATH + self.tag + \
'_CV' + '_t' + strTrial
self.featTest = utils.MODEL_FEATURED_PATH + self.tag + \
'_test' + '_t' + strTrial
+
self.tmpTrain = utils.MODEL_TMP_PATH + self.tag + \
'_train' + '_t' + strTrial
self.tmpCV = utils.MODEL_TMP_PATH + self.tag + \
'_CV' + '_t' + strTrial
self.tmpTest = utils.MODEL_TMP_PATH + self.tag + \
- '_test'+ '_t' + strTrial
+ '_test'+ '_t' + strTrial
self.runTrain = utils.MODEL_RUN_PATH + self.tag + \
'_train' + '_t' + strTrial
self.runCV = utils.MODEL_RUN_PATH + self.tag + \
View
147 utils/SVDModel.py
@@ -1,4 +1,5 @@
from Model import Model
+import ImplicitFeedbackFunctions as IFF #IFF for implicitFeedbackFunctions
class SVDModel(Model):
### Construct ###
@@ -7,8 +8,34 @@ def __init__(self,configModel,utils,config,strTrial):
Model.__init__(self,configModel,utils,strTrial)
self.configPath = utils.MODEL_CONFIG_PATH + self.tag + \
'_t' + strTrial
+ self.userHistoryReindexPath= utils.MODEL_TMP_PATH + self.tag + \
+ '_userHistoryReindex' + '_t' + strTrial
+
+ ### Baidu Specific ###
+ ### Implicit Feedback Files ###
+ #The following 3 files are implicit feature files
+ self.ImfeatTrain = utils.MODEL_FEATURED_PATH + self.tag + \
+ '_Imtrain' + '_t' + strTrial
+ self.ImfeatCV = utils.MODEL_FEATURED_PATH + self.tag + \
+ '_ImCV' + '_t' + strTrial
+ self.ImfeatTest = utils.MODEL_FEATURED_PATH + self.tag + \
+ '_Imtest' + '_t' + strTrial
+ #Gp for group training file, the test file is already in group format,so skip it
+ self.tmpGpTrain = utils.MODEL_TMP_PATH + self.tag + \
+ '_Gptrain' + '_t' + strTrial
+ self.tmpGpCV = utils.MODEL_TMP_PATH + self.tag + \
+ '_GpCV' + '_t' + strTrial
+ #for storing the line order of the group file
+ self.tmpLineOrder = utils.MODEL_TMP_PATH + self.tag + \
+ '_LineOrder' + '_t' + strTrial
+ ### End Implicit Feature Files ###
+
+ self.regularizationFeedback = config.SVD_REGULARIZATION_FEEDBACK
+ ### End Baidu Specific ###
+
self.numIter = config.SVD_NUM_ITER
self.SVDBufferPath = utils.SVDFEATURE_BUFFER_BINARY
+ self.SVDGroupBufferPath = utils.SVDFEATURE_GROUP_BUFFER_BINARY
self.learningRate = config.SVD_LEARNING_RATE
self.regularizationItem = config.SVD_REGULARIZATION_ITEM
self.regularizationUser = config.SVD_REGULARIZATION_USER
@@ -18,13 +45,22 @@ def __init__(self,configModel,utils,config,strTrial):
self.modelOutPath = utils.SVDFEATURE_MODEL_OUT_PATH
self.SVDFeatureBinary = utils.SVDFEATURE_BINARY
self.SVDFeatureInferBinary= utils.SVDFEATURE_INFER_BINARY
-
+ self.SVDFeatureLineReorder= utils.SVDFEATURE_LINE_REORDER
+ self.SVDFeatureSVDPPRandOrder = utils.SVDFEATURE_SVDPP_RANDORDER
+ self.formatType = 0
+ self.numUserFeedback = 0
+ self.originDataSet = utils.ORIGINAL_DATA_PATH
+ # 0 is the default value
+
### Setup Data ###
def setup(self):
- ### Boot to tmp ###
- print("Re-Indexing")
- values = self.reIndex()
+ import utils
+ import config
+ if self.featureSet == 'Basic':
+ ### Boot to tmp ###
+ print("Re-Indexing")
+ values = self.reIndex()
### Take tmp to feat ###
print("Setting Up Features")
self.setupFeatures()
@@ -48,6 +84,11 @@ def reIndex(self):
CVLines = bootCVFile.readlines()
testLines = bootTestFile.readlines()
+ fullInput = []
+ fullInput.append(trainLines)
+ fullInput.append(CVLines)
+ fullInput.append(testLines)
+
uidDic={}
iidDic={}
newuid=1
@@ -56,7 +97,7 @@ def reIndex(self):
sum=0.0
#Build dictionary
-
+
for line in trainLines:
arr = line.rsplit('\t')
uid = int(arr[0].strip())
@@ -66,15 +107,15 @@ def reIndex(self):
sum+=rating
ctr+=1
- #this part for reindexing the user ID
+ #this part for reindexing the user ID
if uid not in uidDic:
uidDic[uid]=newuid
newuid+=1
- #this part for reindexing the item ID
+ #this part for reindexing the item ID
if iid not in iidDic:
iidDic[iid]=newiid
newiid+=1
-
+
for line in CVLines:
arr = line.rsplit('\t')
uid = int(arr[0].strip())
@@ -83,20 +124,20 @@ def reIndex(self):
if uid not in uidDic:
uidDic[uid]=newuid
newuid+=1
- #this part for reindexing the item ID
+ #this part for reindexing the item ID
if iid not in iidDic:
iidDic[iid]=newiid
newiid+=1
-
+
for line in testLines:
arr = line.rsplit('\t')
uid = int(arr[0].strip())
iid = int(arr[1].strip())
- #this part for reindexing the user ID
+ #this part for reindexing the user ID
if uid not in uidDic:
uidDic[uid]=newuid
newuid+=1
- #this part for reindexing the item ID
+ #this part for reindexing the item ID
if iid not in iidDic:
iidDic[iid]=newiid
newiid+=1
@@ -122,12 +163,12 @@ def reIndex(self):
tmpTestFile.write('%d\t%d\t%d\n' %(uidDic[uid],iidDic[iid],rating))
#calculate different parameter.
-
+
self.numUser=len(uidDic)
self.numMovie=len(iidDic)
self.avg=sum/ctr
self.numGlobal = 0
-
+
#Close files
bootTrainFile.close()
bootTestFile.close()
@@ -138,13 +179,20 @@ def reIndex(self):
def dataConvert(self):
import os
- os.system(self.SVDBufferPath + ' ' +
- self.featTrain + ' ' + self.runTrain)
- os.system(self.SVDBufferPath + ' ' +
- self.featCV + ' ' + self.runCV )
- os.system(self.SVDBufferPath + ' ' +
- self.featTest + ' ' + self.runTest )
-
+ if self.featureSet == 'Basic':
+ os.system(self.SVDBufferPath + ' ' +
+ self.featTrain + ' ' + self.runTrain)
+ os.system(self.SVDBufferPath + ' ' +
+ self.featCV + ' ' + self.runCV )
+ os.system(self.SVDBufferPath + ' ' +
+ self.featTest + ' ' + self.runTest )
+ if self.featureSet == 'ImplicitFeedback':
+ os.system(self.SVDGroupBufferPath + ' ' + self.featTrain + \
+ ' ' + self.runTrain + ' ' + '-fd' + ' ' + self.ImfeatTrain)
+ os.system(self.SVDGroupBufferPath + ' ' + self.featCV + \
+ ' ' + self.runCV + ' ' + '-fd' + ' ' + self.ImfeatCV)
+ os.system(self.SVDGroupBufferPath + ' ' + self.featTest + \
+ ' ' + self.runTest + ' ' + '-fd' + ' ' + self.ImfeatTest)
def writeConfig(self):
import os
fout = open(self.configPath,'w')
@@ -170,17 +218,26 @@ def writeConfig(self):
fout.write('#Model save path\n')
fout.write('model_out_folder = \"' + self.modelOutPath
+ self.tag + '_t' + self.trial + '\"')
+ if self.featureSet == 'ImplicitFeedback':
+ fout.write('\n')
+ fout.write("format_type = " + str(self.formatType) + '\n')
+ fout.write("num_ufeedback = " + str(self.numUserFeedback) + '\n')
+ fout.write("wd_ufeedback = " + self.regularizationFeedback + '\n')
+
os.system('mkdir ' + self.modelOutPath
+ self.tag + '_t' + self.trial)
fout.close()
### Setup Features ###
-
def setupFeatures(self):
if self.featureSet == 'Basic':
self.basicConvert(self.tmpTrain,self.featTrain)
self.basicConvert(self.tmpCV, self.featCV)
self.basicConvert(self.tmpTest, self.featTest)
+ ### Baidu Specific Features ###
+ if self.featureSet == 'ImplicitFeedback':
+ self.setupImplicitFeatures()
+ ### End Baidu Specific Features ###
def basicConvert(self,fin,fout):
fi = open( fin , 'r' )
@@ -197,7 +254,51 @@ def basicConvert(self,fin,fout):
fi.close()
fo.close()
-### Run ###
+ def setupImplicitFeatures(self):
+ import os
+
+ #reindex the training files and build two dicts
+ Udic,ItemDic,avg=IFF.reIndex_Implicit(self.originDataSet)
+ #reindex the history
+ IFF.translate(self.userHistoryPath, self.userHistoryReindexPath, Udic, ItemDic)
+ #reindex CV file
+ IFF.translate(self.bootCV, self.tmpCV, Udic, ItemDic)
+ #reindex Testfile
+ IFF.translate(self.bootTest, self.tmpTest, Udic, ItemDic)
+ #reindex the training
+ IFF.translate(self.bootTrain,self.tmpTrain,Udic,ItemDic)
+
+ #make group training files
+ os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpTrain + ' ' + self.tmpLineOrder)
+ os.system(self.SVDFeatureLineReorder + ' ' + self.tmpTrain + ' ' + self.tmpLineOrder + ' ' + self.tmpGpTrain)
+
+ #make group training files of the CV set
+ os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpCV + \
+ ' '+ self.tmpLineOrder)
+ os.system(self.SVDFeatureLineReorder + ' ' + self.tmpCV + \
+ ' ' + self.tmpLineOrder + ' ' + self.tmpGpCV)
+
+ #make basic feature files
+ self.basicConvert(self.tmpGpTrain,self.featTrain)
+ self.basicConvert(self.tmpGpCV, self.featCV)
+ self.basicConvert(self.tmpTest, self.featTest)
+
+ #make implicit feature files
+ IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpTrain,self.ImfeatTrain)
+ IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpTest,self.ImfeatTest)
+ IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpCV,self.ImfeatCV)
+
+
+ #set different parameters
+ self.numUser=len(Udic)
+ self.numMovie=len(ItemDic)
+ self.avg=avg
+ self.numGlobal = 0
+ self.activeType = '0'
+ self.formatType = 1
+ self.numUserFeedback = len(ItemDic)
+
+ ### Run ###
def run(self,sproc,subprocesses):
p = sproc.Popen(self.SVDFeatureBinary + ' ' + self.configPath +
View
11 utils/utils.py
@@ -53,10 +53,13 @@
#### SVD Feature ###
-SVDFEATURE_BUFFER_BINARY = './Models/SVDFeature/tools/make_feature_buffer'
-SVDFEATURE_BINARY = './Models/SVDFeature/svd_feature'
-SVDFEATURE_INFER_BINARY = './Models/SVDFeature/svd_feature_infer'
-SVDFEATURE_MODEL_OUT_PATH = 'Data/ModelData/'
+SVDFEATURE_BUFFER_BINARY = './Models/SVDFeature/tools/make_feature_buffer'
+SVDFEATURE_GROUP_BUFFER_BINARY = './Models/SVDFeature/tools/make_ugroup_buffer'
+SVDFEATURE_LINE_REORDER = './Models/SVDFeature/tools/line_reorder'
+SVDFEATURE_SVDPP_RANDORDER = './Models/SVDFeature/tools/svdpp_randorder'
+SVDFEATURE_BINARY = './Models/SVDFeature/svd_feature'
+SVDFEATURE_INFER_BINARY = './Models/SVDFeature/svd_feature_infer'
+SVDFEATURE_MODEL_OUT_PATH = 'Data/ModelData/'
#### Utility Functions ####
Please sign in to comment.
Something went wrong with that request. Please try again.