Permalink
Browse files

modify implicit feedback to adapt to different file type

  • Loading branch information...
1 parent 41c7a18 commit 339145be1ab2066c8276a20a80b7ca5c3e90cbfd @Ykid Ykid committed Aug 2, 2013
Showing with 94 additions and 73 deletions.
  1. +2 −2 config.py
  2. +80 −12 utils/ImplicitFeedbackFunctions.py
  3. +12 −59 utils/SVDModel.py
View
@@ -8,8 +8,8 @@
#['rmt','FM','RelatedMovieTag',['2']]
#['basicSVD','SVD','Basic',[]]
- #['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
- ['NeighborhoodMovieTag', 'SVD' , 'Neighborhood' , ['MovieTag']]
+ ['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
+ #['NeighborhoodMovieTag', 'SVD' , 'Neighborhood' , ['MovieTag']]
]
# Defining models:
@@ -13,19 +13,36 @@
the output format: rate \t number of user group \t number of user implicit feedback \t fid1:fvalue1, fid2:fvalue2 ... \n
'''
-def reIndex_Implicit(fin):
- print("Reindexing Origin Data Set and Building the Correspondence Dics")
- fi = open( fin, 'r' ) #training set
- #extract from input file
+def reIndex_Implicit(ftrain,fCV,ftest,ftrainOut,fCVOut,ftestOut):
+ print("Reindexing Data Sets and Building the Correspondence Dics")
+ bootTrainFile = open(ftrain, 'r')
+ bootCVFile = open(fCV , 'r')
+ bootTestFile = open(ftest , 'r')
+ tmpTrainFile = open(ftrainOut, 'w')
+ tmpTestFile = open(ftestOut, 'w')
+ tmpCVFile = open(fCVOut, 'w')
+
+ ############# Write reindexed tmp files ###############
+ trainLines = bootTrainFile.readlines()
+ CVLines = bootCVFile.readlines()
+ testLines = bootTestFile.readlines()
+
+ fullInput = []
+ fullInput.append(trainLines)
+ fullInput.append(CVLines)
+ fullInput.append(testLines)
+
uidDic={}
iidDic={}
newuid=1
newiid=1
ctr=0 # is the counter of the total number.
sum=0.0
- for line in fi:
- arr = line.split()
+ #Build dictionary
+
+ for line in trainLines:
+ arr = line.rsplit('\t')
uid = int(arr[0].strip())
iid = int(arr[1].strip())
rating = int(float(arr[2].strip()))
@@ -41,10 +58,61 @@ def reIndex_Implicit(fin):
if iid not in iidDic:
iidDic[iid]=newiid
newiid+=1
-
- fi.close()
- #calculate different parameter.
+
+ for line in CVLines:
+ arr = line.rsplit('\t')
+ uid = int(arr[0].strip())
+ iid = int(arr[1].strip())
+ #this part for reindexing the user ID
+ if uid not in uidDic:
+ uidDic[uid]=newuid
+ newuid+=1
+ #this part for reindexing the item ID
+ if iid not in iidDic:
+ iidDic[iid]=newiid
+ newiid+=1
+
+ for line in testLines:
+ arr = line.rsplit('\t')
+ uid = int(arr[0].strip())
+ iid = int(arr[1].strip())
+ #this part for reindexing the user ID
+ if uid not in uidDic:
+ uidDic[uid]=newuid
+ newuid+=1
+ #this part for reindexing the item ID
+ if iid not in iidDic:
+ iidDic[iid]=newiid
+ newiid+=1
+
+ #Re-index
+ for line in trainLines:
+ arr = line.split()
+ uid = int(arr[0].strip())
+ iid = int(arr[1].strip())
+ rating = int(float(arr[2].strip()))
+ tmpTrainFile.write('%d\t%d\t%d\n' %(uidDic[uid],iidDic[iid],rating))
+ for line in CVLines:
+ arr = line.split()
+ uid = int(arr[0].strip())
+ iid = int(arr[1].strip())
+ rating = int(float(arr[2].strip()))
+ tmpCVFile.write('%d\t%d\t%d\n' %(uidDic[uid],iidDic[iid],rating))
+ for line in testLines:
+ arr = line.split()
+ uid = int(arr[0].strip())
+ iid = int(arr[1].strip())
+ rating = int(float(arr[2].strip()))
+ tmpTestFile.write('%d\t%d\t%d\n' %(uidDic[uid],iidDic[iid],rating))
+
avg=sum/ctr
+ #Close files
+ bootTrainFile.close()
+ bootTestFile.close()
+ bootCVFile.close()
+ tmpTrainFile.close()
+ tmpTestFile.close()
+ tmpCVFile.close()
print("Finished")
return(uidDic,iidDic,avg)
@@ -77,8 +145,8 @@ def userfeedback(fname):
feedback = {}
for line in fi:
attr = line.strip().split('\t')
- uid = int(attr[0])-1
- iid = int(attr[1])-1
+ uid = int(attr[0])-1#uid actually starts from 0
+ iid = int(attr[1])-1#iid actually starts from 0
if uid in feedback:
feedback[uid].append(iid)
else:
@@ -94,7 +162,7 @@ def usergroup(fname):
lastuid = -1
for line in fi:
attr = line.strip().split('\t')
- uid = int(attr[0])-1
+ uid = int(attr[0])-1 #uid actually starts from 0
if uid in groupnum:
groupnum[uid] += 1
else:
View
@@ -36,12 +36,13 @@ def __init__(self,configModel,utils,config,strTrial):
### Neighborhood Model Files###
- if self.misc[0] == "MovieTag":
- self.TagFilePath = self.movieTagPath
- self.TagFileReindexPath = utils.MODEL_TMP_PATH + self.tag + \
- '_' + self.misc[0] + '_t' + strTrial
- self.ShareTagPath = utils.MODEL_TMP_PATH + self.tag + \
- '_share_' + self.misc[0] + '_t' + strTrial
+ if len(self.misc) > 0:
+ if self.misc[0] == "MovieTag":
+ self.TagFilePath = self.movieTagPath
+ self.TagFileReindexPath = utils.MODEL_TMP_PATH + self.tag + \
+ '_' + self.misc[0] + '_t' + strTrial
+ self.ShareTagPath = utils.MODEL_TMP_PATH + self.tag + \
+ '_share_' + self.misc[0] + '_t' + strTrial
### End Neighborhood Model Files###
### End Baidu Specific ###
@@ -61,6 +62,10 @@ def __init__(self,configModel,utils,config,strTrial):
self.SVDFeatureSVDPPRandOrder = utils.SVDFEATURE_SVDPP_RANDORDER
self.formatType = 0
self.numUserFeedback = 0
+ self.numUser= 0
+ self.numMovie= 0
+ self.numGlobal = 0
+ self.avg= 0
self.originDataSet = utils.ORIGINAL_DATA_PATH
# 0 is the default value
@@ -271,17 +276,10 @@ def basicConvert(self,fin,fout):
def setupImplicitFeatures(self):
import os
-
#reindex the training files and build two dicts
- Udic,ItemDic,avg=IFF.reIndex_Implicit(self.originDataSet)
+ Udic,ItemDic,avg=IFF.reIndex_Implicit(self.bootTrain, self.bootCV, self.bootTest, self.tmpTrain, self.tmpCV, self.tmpTest)
#reindex the history
IFF.translate(self.userHistoryPath, self.userHistoryReindexPath, Udic, ItemDic)
- #reindex CV file
- IFF.translate(self.bootCV, self.tmpCV, Udic, ItemDic)
- #reindex Testfile
- IFF.translate(self.bootTest, self.tmpTest, Udic, ItemDic)
- #reindex the training
- IFF.translate(self.bootTrain,self.tmpTrain,Udic,ItemDic)
#make group training files
os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpTrain + ' ' + self.tmpLineOrder)
@@ -335,51 +333,6 @@ def fixRun(self):
self.prependUserMovieToPredictions(self.bootCV,self.predCVTmp,self.predCV)
self.prependUserMovieToPredictions(self.bootTest,self.predTestTmp,self.predTest)
- def setupImplicitFeatures(self):
- import os
- #reindex the training files and build two dicts
- Udic,ItemDic,avg=IFF.reIndex_Implicit(self.originDataSet)
- #reindex the history
- IFF.translate(self.userHistoryPath, self.userHistoryReindexPath, Udic, ItemDic)
- #reindex CV file
- IFF.translate(self.bootCV, self.tmpCV, Udic, ItemDic)
- #reindex Testfile
- IFF.translate(self.bootTest, self.tmpTest, Udic, ItemDic)
- #reindex the training files
- IFF.translate(self.bootTrain,self.tmpTrain,Udic,ItemDic)
-
- #make group training files
- os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpTrain + ' ' + self.tmpLineOrder)
- os.system(self.SVDFeatureLineReorder + ' ' + self.tmpTrain + ' ' + self.tmpLineOrder + ' ' + self.tmpGpTrain)
-
- #make group training files of the CV set
- os.system(self.SVDFeatureSVDPPRandOrder +' '+ self.tmpCV + \
- ' '+ self.tmpLineOrder)
- os.system(self.SVDFeatureLineReorder + ' ' + self.tmpCV + \
- ' ' + self.tmpLineOrder + ' ' + self.tmpGpCV)
-
- #make basic feature files
- self.basicConvert(self.tmpGpTrain,self.featTrain)
- self.basicConvert(self.tmpGpCV, self.featCV)
- self.basicConvert(self.tmpTest, self.featTest)
-
- #make implicit feature files
- IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpTrain,self.ImfeatTrain)
- IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpTest,self.ImfeatTest)
- IFF.mkImplicitFeatureFile(self.userHistoryReindexPath,self.tmpGpCV,self.ImfeatCV)
-
-
- #set different parameters
- self.numUser=len(Udic)
- self.numMovie=len(ItemDic)
- self.avg=avg
- self.numGlobal = 0
- self.activeType = '0'
- self.formatType = 1
- self.numUserFeedback = len(ItemDic)
-
-
-
def NeighborhoodSetup(self):
#second

0 comments on commit 339145b

Please sign in to comment.