Merged SVD Implicit Feature

ChrisRackauckas · Aug 1, 2013 · 0039fa2 · 0039fa2
1 parent c7a67eb
commit 0039fa2
Show file tree

Hide file tree

Showing 7 changed files with 282 additions and 43 deletions.
diff --git a/PreProcess/preProcess.py b/PreProcess/preProcess.py
@@ -23,6 +23,9 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
     p.start()
     processes.append(p)
 
+    ### Baidu Dataset Specific ###
+    ### Preprocesses the Baidu extra features data ###
+
     if PROCESS_TAGS:
         print('... Processing Movie Tag Data')
         p=mproc.Process(target=processMovieTags,
@@ -41,9 +44,13 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
                         args=(utils.USER_HISTORY_PATH,utils.PROCESSED_HISTORY,utils.ORIGINAL_DATA_PATH))
         p.start()
         processes.append(p)
+
+
     for p in processes:
         p.join()
 
+    ### End Baidu Dataset Specific ###
+
     # De-effects data file
     if DE_EFFECT:
         deEffectData(utils.ORIGINAL_DATA_CLEAN_PATH,

diff --git a/config.py b/config.py
@@ -1,16 +1,17 @@
 ################### Select Models ##################
 
 models = [
-           ['basicFM','FM','Basic',['2']],
-           ['basicMovTag','FM','BasicMovieTag',['2']],
-           ['nearNeib', 'FM', 'NearestNeighbor',['2']],
-           ['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
-           ['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
-           ['userHist','FM','UserHistory',['2']],
-           ['userSocial','FM','UserSocial',['2']]
-
-          #['basicSVD','SVD','Basic',[]]
-         ] 
+          ['basicFM','FM','Basic',['2']],
+          #['basicMovTag','FM','BasicMovieTag',['2']],
+          #['nearNeib', 'FM', 'NearestNeighbor',['2']],
+          #['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
+          #['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
+          #['userHist','FM','UserHistory',['2']],
+          #['userSocial','FM','UserSocial',['2']]
+
+          ['basicSVD','SVD','Basic',[]],
+          ['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
+          ] 
 
 # Defining models:
 # Each element is a list: 
@@ -28,8 +29,8 @@
                   ['BRT','BRT',[]],
                   ['BMAR','BMAR',[]],
                  #['RFR' ,'RFR' ,[]],  # Large memory requirement
-                  ['Lasso', 'Lasso', []],
-                  ['GBRT','GBRT',['10']]
+                  ['Lasso', 'Lasso', []]
+                 #['GBRT','GBRT',['10']]
 ]
 
 # Defining ensemble models:
@@ -42,16 +43,21 @@
 synthModel = ['GBRT','GBRT',['10']]
 
 ################### Select Parts  ##################
+LAPTOP_TEST      = True # uses small data set to run features on laptop
 
 
 TRIALS           = 1
 PRE_PROCESS      = True
 # ---- ---- PreProcess Selection ---- ---- #
 TEST_SUBSET      = True   # uses small data set
-PROCESS_TAGS     = True  # generates new file for movie tag feature
+
+### Baidu Specific Preprocess ###
+PROCESS_TAGS     = False  # generates new file for movie tag feature
 PROCESS_SOCIAL   = True  # cuts out all the extra social users not in data set
 PROCESS_HISTORY  = True
-DE_EFFECT        = False  # If De-effect is false, model predictions are correct
+### End Baidu Specific ####
+
+DE_EFFECT        = False  # If De-effect is false, intermittent predictions are correct
 # ---- ---- ---- ---- ----- ---- ---- ---- #
 SETUP_MODELS     = True
 RUN_MODELS       = True
@@ -82,6 +88,8 @@
 SVD_REGULARIZATION_ITEM   = '.004'
 SVD_REGULARIZATION_USER   = '.004'
 SVD_REGULARIZATION_GLOBAL = '.001'
+
+SVD_REGULARIZATION_FEEDBACK = '.004'
 SVD_NUM_FACTOR            = '64'
 SVD_ACTIVE_TYPE           = '0'
 SVD_NUM_ITER              = '40'

diff --git a/utils/FMModel.py b/utils/FMModel.py
@@ -123,6 +123,8 @@ def setupFeatures(self):
             self.addNearestNeighbor(self.bootCV,self.featCV,moviesRatedByUserDict,movieLocationDict,'CV')
             self.addNearestNeighbor(self.bootTest,self.featTest,moviesRatedByUserDict,movieLocationDict,'test')
 
+        ### Baidu Dataset Specific Features ###
+
         # ---- ---- Movie Tag Features ---- ---- #
 
         elif self.featureSet == 'BasicMovieTag':
@@ -178,6 +180,7 @@ def setupFeatures(self):
             self.userSocial(self.bootCV,self.featCV,userLocationDict,movieLocationDict,userSocialDict,'CV')
             self.userSocial(self.bootTest,self.featTest,userLocationDict,movieLocationDict,userSocialDict,'test')
 
+        ### End Baidu Dataset Specific Features ###
 
     def addNearestNeighbor(self,finPath, foutPath,moviesRatedByUserDict,movieLocationDict,step):
         #-----------------------------------------------------------------

diff --git a/utils/ImplicitFeedbackFunctions.py b/utils/ImplicitFeedbackFunctions.py
@@ -0,0 +1,116 @@
+def reIndex_Implicit(fin):
+    print("Reindexing Origin Data Set and Building the Correspondence Dics")
+    fi = open( fin, 'r' ) #training set
+    #extract from input file
+    uidDic={}
+    iidDic={}
+    newuid=1
+    newiid=1
+    ctr=0  # is the counter of the total number.
+    sum=0.0
+
+    for line in fi:
+        arr = line.split()
+        uid = int(arr[0].strip())
+        iid = int(arr[1].strip())
+        rating = int(float(arr[2].strip()))
+        #this part for calculating the average
+        sum+=rating
+        ctr+=1
+
+        #this part for reindexing the user ID
+        if uid not in uidDic:
+            uidDic[uid]=newuid
+            newuid+=1
+        #this part for reindexing the item ID
+        if iid not in iidDic:
+            iidDic[iid]=newiid
+            newiid+=1
+
+    fi.close()
+    #calculate different parameter.
+    avg=sum/ctr
+    print("Finished")
+    return(uidDic,iidDic,avg)
+
+
+def translate(fin,fout,Udic,ItemDic):
+    print("Start Translation. Translating " +fin+" .")
+    fi=open(fin,'r')
+    fo=open(fout,'w')
+    #translate the file
+    for line in fi:
+        arr=line.split()
+        uid=int(arr[0].strip())
+        iid=int(arr[1].strip())
+        if len(arr)>2:
+            rating=str(int(float(arr[2].strip())))
+        if uid in Udic:
+            if iid in ItemDic:
+                if len(arr)>2:
+                    writeline=str(Udic[uid])+'\t'+str(ItemDic[iid])+'\t'+rating+'\r\n'
+                else:
+                    writeline=str(Udic[uid])+'\t'+str(ItemDic[iid])+'\r\n'
+                fo.write(writeline)
+
+    fi.close()
+    fo.close()
+    print("Translation Finished.")
+
+
+def userfeedback(fname):
+    fi = open(fname,'r')
+    feedback = {}
+    for line in fi:
+        attr = line.strip().split('\t')
+        uid = int(attr[0])-1
+        iid = int(attr[1])-1
+        if uid in feedback:
+            feedback[uid].append(iid)
+        else:
+            feedback[uid] = [iid]
+    fi.close()
+    return feedback
+
+#per-user group size (number of lines per user) and the order in which users appear in the grouped training data
+
+
+def usergroup(fname):
+    fi = open(fname,'r')
+    userorder = []
+    groupnum = {}
+    lastuid = -1
+    for line in fi:
+        attr = line.strip().split('\t')
+        uid = int(attr[0])-1
+        if uid in groupnum:
+            groupnum[uid] += 1
+        else:
+            groupnum[uid] = 1
+        if uid != lastuid:
+            userorder.append(uid)
+        lastuid = uid
+    fi.close()
+    return userorder,groupnum
+
+#make implicit feedback features, one line per user, in the same user order as the grouped training data
+#the output format: number of user group \t number of user implicit feedback \t fid1:fvalue1 fid2:fvalue2 ... \n
+
+
+def mkfeature(fout,userorder,groupnum,feedback):
+    fo = open(fout,'w')
+    for uid in userorder:
+        gnum = groupnum[uid]
+        fnum = len(feedback[uid])
+        fo.write('%d\t%d\t' %(gnum,fnum))
+        for i in feedback[uid]:
+            fo.write('%d:%.6f ' %(i,pow(fnum,-0.5)))
+        fo.write('\n')
+
+
+def mkImplicitFeatureFile(ftrain,fgtrain,fout):
+    '''usage:<training_file> <grouped training_file> <output>'''
+    feedback = userfeedback(ftrain)
+    userorder,groupnum = usergroup(fgtrain)
+    #make features and print them  out in file fout 
+    mkfeature(fout,userorder,groupnum,feedback)
diff --git a/utils/Model.py b/utils/Model.py
@@ -10,19 +10,20 @@ def __init__(self,configModel,utils,strTrial):
         self.bootCV     =  utils.MODEL_BOOT_PATH  +   \
                                       'CV' + '_t' + strTrial
         self.bootTest   =  utils.MODEL_BOOT_PATH + \
-                                    'test' + '_t' + strTrial  
+                                    'test' + '_t' + strTrial
         self.featTrain  = utils.MODEL_FEATURED_PATH + self.tag + \
                                         '_train' + '_t' + strTrial
         self.featCV     = utils.MODEL_FEATURED_PATH + self.tag + \
                                       '_CV' + '_t' + strTrial
         self.featTest   = utils.MODEL_FEATURED_PATH + self.tag + \
                                     '_test' + '_t' + strTrial
+
         self.tmpTrain   = utils.MODEL_TMP_PATH      + self.tag + \
                                    '_train' + '_t' + strTrial
         self.tmpCV      = utils.MODEL_TMP_PATH      + self.tag + \
                                       '_CV' + '_t' + strTrial
         self.tmpTest    = utils.MODEL_TMP_PATH      + self.tag + \
-                                     '_test'+ '_t' + strTrial 
+                          '_test'+ '_t' + strTrial
         self.runTrain   = utils.MODEL_RUN_PATH      + self.tag + \
                                    '_train' + '_t' + strTrial
         self.runCV      = utils.MODEL_RUN_PATH      + self.tag + \