From 0039fa2dd710ccf5868d76d607df62d18c83cb75 Mon Sep 17 00:00:00 2001 From: ChrisRackauckas Date: Thu, 1 Aug 2013 14:53:14 -0400 Subject: [PATCH] Merged SVD Implicit Feature --- PreProcess/preProcess.py | 7 ++ config.py | 36 ++++--- utils/FMModel.py | 3 + utils/ImplicitFeedbackFunctions.py | 116 +++++++++++++++++++++++ utils/Model.py | 5 +- utils/SVDModel.py | 147 ++++++++++++++++++++++++----- utils/utils.py | 11 ++- 7 files changed, 282 insertions(+), 43 deletions(-) create mode 100644 utils/ImplicitFeedbackFunctions.py diff --git a/PreProcess/preProcess.py b/PreProcess/preProcess.py index 63aab71..74a484a 100644 --- a/PreProcess/preProcess.py +++ b/PreProcess/preProcess.py @@ -23,6 +23,9 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG p.start() processes.append(p) + ### Baidu Dataset Specific ### + ### Preprocesses the Baidu extra features data ### + if PROCESS_TAGS: print('... Processing Movie Tag Data') p=mproc.Process(target=processMovieTags, @@ -41,9 +44,13 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG args=(utils.USER_HISTORY_PATH,utils.PROCESSED_HISTORY,utils.ORIGINAL_DATA_PATH)) p.start() processes.append(p) + + for p in processes: p.join() + ### End Baidu Dataset Specific ### + # De-effects data file if DE_EFFECT: deEffectData(utils.ORIGINAL_DATA_CLEAN_PATH, diff --git a/config.py b/config.py index 0307143..dcbe769 100644 --- a/config.py +++ b/config.py @@ -1,16 +1,17 @@ ################### Select Models ################## models = [ - ['basicFM','FM','Basic',['2']], - ['basicMovTag','FM','BasicMovieTag',['2']], - ['nearNeib', 'FM', 'NearestNeighbor',['2']], - ['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']], - ['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']], - ['userHist','FM','UserHistory',['2']], - ['userSocial','FM','UserSocial',['2']] - - #['basicSVD','SVD','Basic',[]] - ] + ['basicFM','FM','Basic',['2']], + #['basicMovTag','FM','BasicMovieTag',['2']], + #['nearNeib', 'FM', 'NearestNeighbor',['2']], + #['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']], + #['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']], + #['userHist','FM','UserHistory',['2']], + #['userSocial','FM','UserSocial',['2']] + + ['basicSVD','SVD','Basic',[]], + ['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]] + ] # Defining models: # Each element is a list: @@ -28,8 +29,8 @@ ['BRT','BRT',[]], ['BMAR','BMAR',[]], #['RFR' ,'RFR' ,[]], # Large memory requirement - ['Lasso', 'Lasso', []], - ['GBRT','GBRT',['10']] + ['Lasso', 'Lasso', []] + #['GBRT','GBRT',['10']] ] # Defining ensemble models: @@ -42,16 +43,21 @@ synthModel = ['GBRT','GBRT',['10']] ################### Select Parts ################## +LAPTOP_TEST = True # uses small data set to run features on laptop TRIALS = 1 PRE_PROCESS = True # ---- ---- PreProcess Selection ---- ---- # TEST_SUBSET = True # uses small data set -PROCESS_TAGS = True # generates new file for movie tag feature + +### Baidu Specific Preprocess ### +PROCESS_TAGS = False # generates new file for movie tag feature PROCESS_SOCIAL = True # cuts out all the extra social users not in data set PROCESS_HISTORY = True -DE_EFFECT = False # If De-effect is false, model predictions are correct +### End Baidu Specific #### + +DE_EFFECT = False # If De-effect is false, intermittent predictions are correct # ---- ---- ---- ---- ----- ---- ---- ---- # SETUP_MODELS = True RUN_MODELS = True @@ -82,6 +88,8 @@ SVD_REGULARIZATION_ITEM = '.004' SVD_REGULARIZATION_USER = '.004' SVD_REGULARIZATION_GLOBAL = '.001' + +SVD_REGULARIZATION_FEEDBACK = '.004' SVD_NUM_FACTOR = '64' SVD_ACTIVE_TYPE = '0' SVD_NUM_ITER = '40' diff --git a/utils/FMModel.py b/utils/FMModel.py index d25be92..dc1386b 100644 --- a/utils/FMModel.py +++ b/utils/FMModel.py @@ -123,6 +123,8 @@ def setupFeatures(self): self.addNearestNeighbor(self.bootCV,self.featCV,moviesRatedByUserDict,movieLocationDict,'CV') self.addNearestNeighbor(self.bootTest,self.featTest,moviesRatedByUserDict,movieLocationDict,'test') + ### Baidu Dataset Specific Features ### + # ---- ---- Movie Tag Features ---- ---- # elif self.featureSet == 'BasicMovieTag': @@ -178,6 +180,7 @@ def setupFeatures(self): self.userSocial(self.bootCV,self.featCV,userLocationDict,movieLocationDict,userSocialDict,'CV') self.userSocial(self.bootTest,self.featTest,userLocationDict,movieLocationDict,userSocialDict,'test') + ### End Baidu Dataset Specific Features ### def addNearestNeighbor(self,finPath, foutPath,moviesRatedByUserDict,movieLocationDict,step): #----------------------------------------------------------------- diff --git a/utils/ImplicitFeedbackFunctions.py b/utils/ImplicitFeedbackFunctions.py new file mode 100644 index 0000000..0025c81 --- /dev/null +++ b/utils/ImplicitFeedbackFunctions.py @@ -0,0 +1,116 @@ +def reIndex_Implicit(fin): + print("Reindexing Origin Data Set and Building the Correspondence Dics") + fi = open( fin, 'r' ) #training set + #extract from input file + uidDic={} + iidDic={} + newuid=1 + newiid=1 + ctr=0 # is the counter of the total number. + sum=0.0 + + for line in fi: + arr = line.split() + uid = int(arr[0].strip()) + iid = int(arr[1].strip()) + rating = int(float(arr[2].strip())) + #this part for calculating the average + sum+=rating + ctr+=1 + + #this part for reindexing the user ID + if uid not in uidDic: + uidDic[uid]=newuid + newuid+=1 + #this part for reindexing the item ID + if iid not in iidDic: + iidDic[iid]=newiid + newiid+=1 + + fi.close() + #calculate different parameter. + avg=sum/ctr + print("Finished") + return(uidDic,iidDic,avg) + + +def translate(fin,fout,Udic,ItemDic): + print("Start Translation. Translating " +fin+" .") + fi=open(fin,'r') + fo=open(fout,'w') + #translate the file + for line in fi: + arr=line.split() + uid=int(arr[0].strip()) + iid=int(arr[1].strip()) + if len(arr)>2: + rating=str(int(float(arr[2].strip()))) + if uid in Udic: + if iid in ItemDic: + if len(arr)>2: + writeline=str(Udic[uid])+'\t'+str(ItemDic[iid])+'\t'+rating+'\r\n' + else: + writeline=str(Udic[uid])+'\t'+str(ItemDic[iid])+'\r\n' + fo.write(writeline) + + fi.close() + fo.close() + print("Translation Finished.") + + +def userfeedback(fname): + fi = open(fname,'r') + feedback = {} + for line in fi: + attr = line.strip().split('\t') + uid = int(attr[0])-1 + iid = int(attr[1])-1 + if uid in feedback: + feedback[uid].append(iid) + else: + feedback[uid] = [iid] + fi.close() + return feedback + +#group num and order of the grouped training data + + +def usergroup(fname): + fi = open(fname,'r') + userorder = [] + groupnum = {} + lastuid = -1 + for line in fi: + attr = line.strip().split('\t') + uid = int(attr[0])-1 + if uid in groupnum: + groupnum[uid] += 1 + else: + groupnum[uid] = 1 + if uid != lastuid: + userorder.append(uid) + lastuid = uid + fi.close() + return userorder,groupnum + +#make implict feedback feature, one line for a user, wihch is in the order of the grouped training data +#the output format:rate \t number of user group \t number of user implicit feedback \t fid1:fvalue1, fid2:fvalue2 ... \n + + +def mkfeature(fout,userorder,groupnum,feedback): + fo = open(fout,'w') + for uid in userorder: + gnum = groupnum[uid] + fnum = len(feedback[uid]) + fo.write('%d\t%d\t' %(gnum,fnum)) + for i in feedback[uid]: + fo.write('%d:%.6f ' %(i,pow(fnum,-0.5))) + fo.write('\n') + + +def mkImplicitFeatureFile(ftrain,fgtrain,fout): + '''usage: ''' + feedback = userfeedback(ftrain) + userorder,groupnum = usergroup(fgtrain) + #make features and print them out in file fout + mkfeature(fout,userorder,groupnum,feedback) diff --git a/utils/Model.py b/utils/Model.py index fa3291e..ed1207b 100644 --- a/utils/Model.py +++ b/utils/Model.py @@ -10,19 +10,20 @@ def __init__(self,configModel,utils,strTrial): self.bootCV = utils.MODEL_BOOT_PATH + \ 'CV' + '_t' + strTrial self.bootTest = utils.MODEL_BOOT_PATH + \ - 'test' + '_t' + strTrial + 'test' + '_t' + strTrial self.featTrain = utils.MODEL_FEATURED_PATH + self.tag + \ '_train' + '_t' + strTrial self.featCV = utils.MODEL_FEATURED_PATH + self.tag + \ '_CV' + '_t' + strTrial self.featTest = utils.MODEL_FEATURED_PATH + self.tag + \ '_test' + '_t' + strTrial + self.tmpTrain = utils.MODEL_TMP_PATH + self.tag + \ '_train' + '_t' + strTrial self.tmpCV = utils.MODEL_TMP_PATH + self.tag + \ '_CV' + '_t' + strTrial self.tmpTest = utils.MODEL_TMP_PATH + self.tag + \ - '_test'+ '_t' + strTrial + '_test'+ '_t' + strTrial self.runTrain = utils.MODEL_RUN_PATH + self.tag + \ '_train' + '_t' + strTrial self.runCV = utils.MODEL_RUN_PATH + self.tag + \ diff --git a/utils/SVDModel.py b/utils/SVDModel.py index 504a129..fb5adb2 100644 --- a/utils/SVDModel.py +++ b/utils/SVDModel.py @@ -1,4 +1,5 @@ from Model import Model +import ImplicitFeedbackFunctions as IFF #IFF for implicitFeedbackFunctions class SVDModel(Model): ### Construct ### @@ -7,8 +8,34 @@ def __init__(self,configModel,utils,config,strTrial): Model.__init__(self,configModel,utils,strTrial) self.configPath = utils.MODEL_CONFIG_PATH + self.tag + \ '_t' + strTrial + self.userHistoryReindexPath= utils.MODEL_TMP_PATH + self.tag + \ + '_userHistoryReindex' + '_t' + strTrial + + ### Baidu Specific ### + ### Implicit Feedback Files ### + #The following 3 files are implicit feature files + self.ImfeatTrain = utils.MODEL_FEATURED_PATH + self.tag + \ + '_Imtrain' + '_t' + strTrial + self.ImfeatCV = utils.MODEL_FEATURED_PATH + self.tag + \ + '_ImCV' + '_t' + strTrial + self.ImfeatTest = utils.MODEL_FEATURED_PATH + self.tag + \ + '_Imtest' + '_t' + strTrial + #Gp for group training file, the test file is already in group format,so skip it + self.tmpGpTrain = utils.MODEL_TMP_PATH + self.tag + \ + '_Gptrain' + '_t' + strTrial + self.tmpGpCV = utils.MODEL_TMP_PATH + self.tag + \ + '_GpCV' + '_t' + strTrial + #for storing the line order of the group file + self.tmpLineOrder = utils.MODEL_TMP_PATH + self.tag + \ + '_LineOrder' + '_t' + strTrial + ### End Implicit Feature Files ### + + self.regularizationFeedback = config.SVD_REGULARIZATION_FEEDBACK + ### End Baidu Specific ### + self.numIter = config.SVD_NUM_ITER self.SVDBufferPath = utils.SVDFEATURE_BUFFER_BINARY + self.SVDGroupBufferPath = utils.SVDFEATURE_GROUP_BUFFER_BINARY self.learningRate = config.SVD_LEARNING_RATE self.regularizationItem = config.SVD_REGULARIZATION_ITEM self.regularizationUser = config.SVD_REGULARIZATION_USER @@ -18,13 +45,22 @@ def __init__(self,configModel,utils,config,strTrial): self.modelOutPath = utils.SVDFEATURE_MODEL_OUT_PATH self.SVDFeatureBinary = utils.SVDFEATURE_BINARY self.SVDFeatureInferBinary= utils.SVDFEATURE_INFER_BINARY - + self.SVDFeatureLineReorder= utils.SVDFEATURE_LINE_REORDER + self.SVDFeatureSVDPPRandOrder = utils.SVDFEATURE_SVDPP_RANDORDER + self.formatType = 0 + self.numUserFeedback = 0 + self.originDataSet = utils.ORIGINAL_DATA_PATH + # 0 is the default value + ### Setup Data ### def setup(self): - ### Boot to tmp ### - print("Re-Indexing") - values = self.reIndex() + import utils + import config + if self.featureSet == 'Basic': + ### Boot to tmp ### + print("Re-Indexing") + values = self.reIndex() ### Take tmp to feat ### print("Setting Up Features") self.setupFeatures() @@ -48,6 +84,11 @@ def reIndex(self): CVLines = bootCVFile.readlines() testLines = bootTestFile.readlines() + fullInput = [] + fullInput.append(trainLines) + fullInput.append(CVLines) + fullInput.append(testLines) + uidDic={} iidDic={} newuid=1 @@ -56,7 +97,7 @@ def reIndex(self): sum=0.0 #Build dictionary - + for line in trainLines: arr = line.rsplit('\t') uid = int(arr[0].strip()) @@ -66,15 +107,15 @@ def reIndex(self): sum+=rating ctr+=1 - #this part for reindexing the user ID + #this part for reindexing the user ID if uid not in uidDic: uidDic[uid]=newuid newuid+=1 - #this part for reindexing the item ID + #this part for reindexing the item ID if iid not in iidDic: iidDic[iid]=newiid newiid+=1 - + for line in CVLines: arr = line.rsplit('\t') uid = int(arr[0].strip()) @@ -83,20 +124,20 @@ def reIndex(self): if uid not in uidDic: uidDic[uid]=newuid newuid+=1 - #this part for reindexing the item ID + #this part for reindexing the item ID if iid not in iidDic: iidDic[iid]=newiid newiid+=1 - + for line in testLines: arr = line.rsplit('\t') uid = int(arr[0].strip()) iid = int(arr[1].strip()) - #this part for reindexing the user ID + #this part for reindexing the user ID if uid not in uidDic: uidDic[uid]=newuid newuid+=1 - #this part for reindexing the item ID + #this part for reindexing the item ID if iid not in iidDic: iidDic[iid]=newiid newiid+=1 @@ -122,12 +163,12 @@ def reIndex(self): tmpTestFile.write('%d\t%d\t%d\n' %(uidDic[uid],iidDic[iid],rating)) #calculate different parameter. - + self.numUser=len(uidDic) self.numMovie=len(iidDic) self.avg=sum/ctr self.numGlobal = 0 - + #Close files bootTrainFile.close() bootTestFile.close() @@ -138,13 +179,20 @@ def reIndex(self): def dataConvert(self): import os - os.system(self.SVDBufferPath + ' ' + - self.featTrain + ' ' + self.runTrain) - os.system(self.SVDBufferPath + ' ' + - self.featCV + ' ' + self.runCV ) - os.system(self.SVDBufferPath + ' ' + - self.featTest + ' ' + self.runTest ) - + if self.featureSet == 'Basic': + os.system(self.SVDBufferPath + ' ' + + self.featTrain + ' ' + self.runTrain) + os.system(self.SVDBufferPath + ' ' + + self.featCV + ' ' + self.runCV ) + os.system(self.SVDBufferPath + ' ' + + self.featTest + ' ' + self.runTest ) + if self.featureSet == 'ImplicitFeedback': + os.system(self.SVDGroupBufferPath + ' ' + self.featTrain + \ + ' ' + self.runTrain + ' ' + '-fd' + ' ' + self.ImfeatTrain) + os.system(self.SVDGroupBufferPath + ' ' + self.featCV + \ + ' ' + self.runCV + ' ' + '-fd' + ' ' + self.ImfeatCV) + os.system(self.SVDGroupBufferPath + ' ' + self.featTest + \ + ' ' + self.runTest + ' ' + '-fd' + ' ' + self.ImfeatTest) def writeConfig(self): import os fout = open(self.configPath,'w') @@ -170,17 +218,26 @@ def writeConfig(self): fout.write('#Model save path\n') fout.write('model_out_folder = \"' + self.modelOutPath + self.tag + '_t' + self.trial + '\"') + if self.featureSet == 'ImplicitFeedback': + fout.write('\n') + fout.write("format_type = " + str(self.formatType) + '\n') + fout.write("num_ufeedback = " + str(self.numUserFeedback) + '\n') + fout.write("wd_ufeedback = " + self.regularizationFeedback + '\n') + os.system('mkdir ' + self.modelOutPath + self.tag + '_t' + self.trial) fout.close() ### Setup Features ### - def setupFeatures(self): if self.featureSet == 'Basic': self.basicConvert(self.tmpTrain,self.featTrain) self.basicConvert(self.tmpCV, self.featCV) self.basicConvert(self.tmpTest, self.featTest) + ### Baidu Specific Features ### + if self.featureSet == 'ImplicitFeedback': + self.setupImplicitFeatures() + ### End Baidu Specific Features ### def basicConvert(self,fin,fout): fi = open( fin , 'r' ) @@ -197,7 +254,51 @@ def basicConvert(self,fin,fout): fi.close() fo.close() -### Run ### + def setupImplicitFeatures(self): + import os + + #reindex the training files and build two dicts + Udic,ItemDic,avg=IFF.reIndex_Implicit(self.originDataSet) + #reindex the history + IFF.translate(self.userHistoryPath, self.userHistoryReindexPath, Udic, ItemDic) + #reindex CV file + IFF.translate(self.bootCV, self.tmpCV, Udic, ItemDic) + #reindex Testfile + IFF.translate(self.bootTest, self.tmpTest, Udic, ItemDic) + #reindex the training + IFF.translate(self.bootTrain,self.tmpTrain,Udic,ItemDic) + + #make group training files + os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpTrain + ' ' + self.tmpLineOrder) + os.system(self.SVDFeatureLineReorder + ' ' + self.tmpTrain + ' ' + self.tmpLineOrder + ' ' + self.tmpGpTrain) + + #make group training files of the CV set + os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpCV + \ + ' '+ self.tmpLineOrder) + os.system(self.SVDFeatureLineReorder + ' ' + self.tmpCV + \ + ' ' + self.tmpLineOrder + ' ' + self.tmpGpCV) + + #make basic feature files + self.basicConvert(self.tmpGpTrain,self.featTrain) + self.basicConvert(self.tmpGpCV, self.featCV) + self.basicConvert(self.tmpTest, self.featTest) + + #make implicit feature files + IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpTrain,self.ImfeatTrain) + IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpTest,self.ImfeatTest) + IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpCV,self.ImfeatCV) + + + #set different parameters + self.numUser=len(Udic) + self.numMovie=len(ItemDic) + self.avg=avg + self.numGlobal = 0 + self.activeType = '0' + self.formatType = 1 + self.numUserFeedback = len(ItemDic) + + ### Run ### def run(self,sproc,subprocesses): p = sproc.Popen(self.SVDFeatureBinary + ' ' + self.configPath + diff --git a/utils/utils.py b/utils/utils.py index 5c6c988..ebfc028 100644 --- a/utils/utils.py +++ b/utils/utils.py @@ -53,10 +53,13 @@ #### SVD Feature ### -SVDFEATURE_BUFFER_BINARY = './Models/SVDFeature/tools/make_feature_buffer' -SVDFEATURE_BINARY = './Models/SVDFeature/svd_feature' -SVDFEATURE_INFER_BINARY = './Models/SVDFeature/svd_feature_infer' -SVDFEATURE_MODEL_OUT_PATH = 'Data/ModelData/' +SVDFEATURE_BUFFER_BINARY = './Models/SVDFeature/tools/make_feature_buffer' +SVDFEATURE_GROUP_BUFFER_BINARY = './Models/SVDFeature/tools/make_ugroup_buffer' +SVDFEATURE_LINE_REORDER = './Models/SVDFeature/tools/line_reorder' +SVDFEATURE_SVDPP_RANDORDER = './Models/SVDFeature/tools/svdpp_randorder' +SVDFEATURE_BINARY = './Models/SVDFeature/svd_feature' +SVDFEATURE_INFER_BINARY = './Models/SVDFeature/svd_feature_infer' +SVDFEATURE_MODEL_OUT_PATH = 'Data/ModelData/' #### Utility Functions ####