Skip to content

Commit

Permalink
Merged SVD Implicit Feature
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisRackauckas committed Aug 1, 2013
1 parent c7a67eb commit 0039fa2
Show file tree
Hide file tree
Showing 7 changed files with 282 additions and 43 deletions.
7 changes: 7 additions & 0 deletions PreProcess/preProcess.py
Expand Up @@ -23,6 +23,9 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
p.start()
processes.append(p)

### Baidu Dataset Specific ###
### Preprocesses the Baidu extra features data ###

if PROCESS_TAGS:
print('... Processing Movie Tag Data')
p=mproc.Process(target=processMovieTags,
Expand All @@ -41,9 +44,13 @@ def preProcess(os,utils,random,DE_EFFECT,userMovieRating,TEST_SUBSET,PROCESS_TAG
args=(utils.USER_HISTORY_PATH,utils.PROCESSED_HISTORY,utils.ORIGINAL_DATA_PATH))
p.start()
processes.append(p)


for p in processes:
p.join()

### End Baidu Dataset Specific ###

# De-effects data file
if DE_EFFECT:
deEffectData(utils.ORIGINAL_DATA_CLEAN_PATH,
Expand Down
36 changes: 22 additions & 14 deletions config.py
@@ -1,16 +1,17 @@
################### Select Models ##################

models = [
['basicFM','FM','Basic',['2']],
['basicMovTag','FM','BasicMovieTag',['2']],
['nearNeib', 'FM', 'NearestNeighbor',['2']],
['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
['userHist','FM','UserHistory',['2']],
['userSocial','FM','UserSocial',['2']]

#['basicSVD','SVD','Basic',[]]
]
['basicFM','FM','Basic',['2']],
#['basicMovTag','FM','BasicMovieTag',['2']],
#['nearNeib', 'FM', 'NearestNeighbor',['2']],
#['rmtThresh5t','FM','RelatedMovieTagThreshold',['2']],
#['rmtThresh2','FM','RelatedMovieTagThreshold2',['2']],
#['userHist','FM','UserHistory',['2']],
#['userSocial','FM','UserSocial',['2']]

['basicSVD','SVD','Basic',[]],
['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
]

# Defining models:
# Each element is a list:
Expand All @@ -28,8 +29,8 @@
['BRT','BRT',[]],
['BMAR','BMAR',[]],
#['RFR' ,'RFR' ,[]], # Large memory requirement
['Lasso', 'Lasso', []],
['GBRT','GBRT',['10']]
['Lasso', 'Lasso', []]
#['GBRT','GBRT',['10']]
]

# Defining ensemble models:
Expand All @@ -42,16 +43,21 @@
synthModel = ['GBRT','GBRT',['10']]

################### Select Parts ##################
LAPTOP_TEST = True # uses small data set to run features on laptop


TRIALS = 1
PRE_PROCESS = True
# ---- ---- PreProcess Selection ---- ---- #
TEST_SUBSET = True # uses small data set
PROCESS_TAGS = True # generates new file for movie tag feature

### Baidu Specific Preprocess ###
PROCESS_TAGS = False # generates new file for movie tag feature
PROCESS_SOCIAL = True # cuts out all the extra social users not in data set
PROCESS_HISTORY = True
DE_EFFECT = False # If De-effect is false, model predictions are correct
### End Baidu Specific ####

DE_EFFECT = False # If De-effect is false, intermittent predictions are correct
# ---- ---- ---- ---- ----- ---- ---- ---- #
SETUP_MODELS = True
RUN_MODELS = True
Expand Down Expand Up @@ -82,6 +88,8 @@
SVD_REGULARIZATION_ITEM = '.004'
SVD_REGULARIZATION_USER = '.004'
SVD_REGULARIZATION_GLOBAL = '.001'

SVD_REGULARIZATION_FEEDBACK = '.004'
SVD_NUM_FACTOR = '64'
SVD_ACTIVE_TYPE = '0'
SVD_NUM_ITER = '40'
Expand Down
3 changes: 3 additions & 0 deletions utils/FMModel.py
Expand Up @@ -123,6 +123,8 @@ def setupFeatures(self):
self.addNearestNeighbor(self.bootCV,self.featCV,moviesRatedByUserDict,movieLocationDict,'CV')
self.addNearestNeighbor(self.bootTest,self.featTest,moviesRatedByUserDict,movieLocationDict,'test')

### Baidu Dataset Specific Features ###

# ---- ---- Movie Tag Features ---- ---- #

elif self.featureSet == 'BasicMovieTag':
Expand Down Expand Up @@ -178,6 +180,7 @@ def setupFeatures(self):
self.userSocial(self.bootCV,self.featCV,userLocationDict,movieLocationDict,userSocialDict,'CV')
self.userSocial(self.bootTest,self.featTest,userLocationDict,movieLocationDict,userSocialDict,'test')

### End Baidu Dataset Specific Features ###

def addNearestNeighbor(self,finPath, foutPath,moviesRatedByUserDict,movieLocationDict,step):
#-----------------------------------------------------------------
Expand Down
116 changes: 116 additions & 0 deletions utils/ImplicitFeedbackFunctions.py
@@ -0,0 +1,116 @@
def reIndex_Implicit(fin):
    """Build compact 1-based id maps and the mean rating of a ratings file.

    Every non-blank line of ``fin`` is split on whitespace and must begin
    with ``<user id> <item id> <rating>``. Ratings are truncated to an
    integer (``int(float(...))``) before they are averaged.

    Args:
        fin: path of the ratings (training) file to scan.

    Returns:
        A tuple ``(uidDic, iidDic, avg)`` where ``uidDic`` / ``iidDic`` map
        each original user / item id to a new id assigned in order of first
        appearance starting at 1, and ``avg`` is the mean of the truncated
        ratings (``0.0`` when the file holds no rating lines).

    Raises:
        IndexError: a non-blank line has fewer than three fields.
        ValueError: a field cannot be parsed as a number.
    """
    print("Reindexing Origin Data Set and Building the Correspondence Dics")
    uidDic = {}
    iidDic = {}
    total = 0.0  # running sum of ratings (named to avoid shadowing sum())
    ctr = 0      # number of rating lines seen

    # The context manager closes the file even if a line fails to parse.
    with open(fin, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            uid = int(arr[0])
            iid = int(arr[1])
            total += int(float(arr[2]))
            ctr += 1

            # New ids are dense and 1-based: the next id is always the
            # current number of known ids plus one.
            if uid not in uidDic:
                uidDic[uid] = len(uidDic) + 1
            if iid not in iidDic:
                iidDic[iid] = len(iidDic) + 1

    # An empty file has no meaningful mean; report 0.0 instead of dividing
    # by zero.
    avg = total / ctr if ctr else 0.0
    print("Finished")
    return (uidDic, iidDic, avg)


def translate(fin,fout,Udic,ItemDic):
    """Rewrite a ratings file using re-indexed user and item ids.

    Each non-blank line of ``fin`` is split on whitespace into
    ``<user id> <item id> [<rating>]``. Lines whose user id is in ``Udic``
    and whose item id is in ``ItemDic`` are written to ``fout`` as
    tab-separated ``<new user id>\\t<new item id>[\\t<rating>]`` terminated
    by ``\\r\\n``; all other lines are dropped. A rating, when present, is
    truncated to an integer.

    Args:
        fin: path of the file to translate.
        fout: path of the translated file to create (overwritten).
        Udic: mapping from original user id (int) to new user id.
        ItemDic: mapping from original item id (int) to new item id.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: a field cannot be parsed as a number.
    """
    print("Start Translation. Translating " +fin+" .")
    # Both handles are closed on exit, including when a line fails to parse.
    with open(fin, 'r') as fi, open(fout, 'w') as fo:
        for line in fi:
            arr = line.split()
            if not arr:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            uid = int(arr[0])
            iid = int(arr[1])
            # Parse the rating before filtering so malformed ratings are
            # reported even on lines that would be dropped.
            rating = str(int(float(arr[2]))) if len(arr) > 2 else None
            if uid not in Udic or iid not in ItemDic:
                continue
            fields = [str(Udic[uid]), str(ItemDic[iid])]
            if rating is not None:
                fields.append(rating)
            fo.write('\t'.join(fields) + '\r\n')
    print("Translation Finished.")


def userfeedback(fname):
    """Collect, per user, the items that appear with that user in a file.

    Each non-blank line of ``fname`` is tab-separated and starts with
    ``<user id>\\t<item id>``. Both ids are shifted down by one before being
    stored.

    Args:
        fname: path of the tab-separated file to read.

    Returns:
        A dict mapping ``user id - 1`` to the list of ``item id - 1`` values
        found for that user, in file order (duplicates are kept).

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: an id field is not an integer.
    """
    feedback = {}
    # The context manager closes the file even if a line fails to parse.
    with open(fname, 'r') as fi:
        for line in fi:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            attr = record.split('\t')
            uid = int(attr[0]) - 1
            iid = int(attr[1]) - 1
            feedback.setdefault(uid, []).append(iid)
    return feedback

#count the lines per user and record the order in which users appear in the grouped training data


def usergroup(fname):
    """Return the user run order and per-user line counts of a file.

    Each non-blank line of ``fname`` is tab-separated and starts with a user
    id, which is shifted down by one before being used.

    Args:
        fname: path of the tab-separated (grouped) file to read.

    Returns:
        A tuple ``(userorder, groupnum)``. ``userorder`` lists a user id each
        time it differs from the id on the previous line, so a user whose
        lines are not contiguous appears more than once. ``groupnum`` maps
        each user id to its total number of lines in the file.

    Raises:
        ValueError: the first field of a non-blank line is not an integer.
    """
    userorder = []
    groupnum = {}
    # None cannot collide with a real id; the previous -1 sentinel silently
    # dropped a leading user whose shifted id was -1 (raw id 0).
    lastuid = None
    # The context manager closes the file even if a line fails to parse.
    with open(fname, 'r') as fi:
        for line in fi:
            record = line.strip()
            if not record:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            uid = int(record.split('\t')[0]) - 1
            groupnum[uid] = groupnum.get(uid, 0) + 1
            if uid != lastuid:
                userorder.append(uid)
                lastuid = uid
    return userorder, groupnum

#make implicit feedback features, one line per user, in the order the users appear in the grouped training data
#the output format: number of lines in the user's group \t number of user implicit feedback items \t fid1:fvalue1 fid2:fvalue2 ... \n


def mkfeature(fout,userorder,groupnum,feedback):
    """Write one implicit-feedback feature line per entry of ``userorder``.

    Each output line has the form
    ``<group size>\\t<feedback count>\\t<fid>:<value> <fid>:<value> ... \\n``
    where every value is ``feedback count ** -0.5`` printed with six
    decimals, and each ``fid:value`` pair is followed by a single space.

    Args:
        fout: path of the feature file to create (overwritten).
        userorder: iterable of user ids, in the order lines are written.
        groupnum: mapping from user id to that user's group size.
        feedback: mapping from user id to the list of feature (item) ids.

    Raises:
        KeyError: a user id in ``userorder`` is missing from ``groupnum`` or
            ``feedback``.
    """
    # The original never closed the output file; the context manager makes
    # sure it is flushed and closed, even on error.
    with open(fout, 'w') as fo:
        for uid in userorder:
            gnum = groupnum[uid]
            items = feedback[uid]
            fnum = len(items)
            # The weight is the same for all of a user's items, so compute it
            # once. With no items it is never used, and pow(0, -0.5) would
            # raise, so skip the computation in that case.
            weight = pow(fnum, -0.5) if fnum else 0.0
            fo.write('%d\t%d\t' % (gnum, fnum))
            fo.write(''.join('%d:%.6f ' % (i, weight) for i in items))
            fo.write('\n')


def mkImplicitFeatureFile(ftrain,fgtrain,fout):
    """Generate the implicit-feedback feature file for a training set.

    Reads the per-user feedback lists from ``ftrain`` with ``userfeedback``,
    reads the user order and group sizes from ``fgtrain`` with ``usergroup``,
    and hands all three to ``mkfeature`` to write ``fout``.

    Args:
        ftrain: path of the training file.
        fgtrain: path of the grouped training file.
        fout: path of the feature file to write.
    """
    # Read the feedback first, then the grouping, as two separate passes.
    per_user_items = userfeedback(ftrain)
    grouping = usergroup(fgtrain)
    ordered_users = grouping[0]
    group_sizes = grouping[1]
    # Emit the features to fout.
    mkfeature(fout, ordered_users, group_sizes, per_user_items)
5 changes: 3 additions & 2 deletions utils/Model.py
Expand Up @@ -10,19 +10,20 @@ def __init__(self,configModel,utils,strTrial):
self.bootCV = utils.MODEL_BOOT_PATH + \
'CV' + '_t' + strTrial
self.bootTest = utils.MODEL_BOOT_PATH + \
'test' + '_t' + strTrial
'test' + '_t' + strTrial
self.featTrain = utils.MODEL_FEATURED_PATH + self.tag + \
'_train' + '_t' + strTrial
self.featCV = utils.MODEL_FEATURED_PATH + self.tag + \
'_CV' + '_t' + strTrial
self.featTest = utils.MODEL_FEATURED_PATH + self.tag + \
'_test' + '_t' + strTrial

self.tmpTrain = utils.MODEL_TMP_PATH + self.tag + \
'_train' + '_t' + strTrial
self.tmpCV = utils.MODEL_TMP_PATH + self.tag + \
'_CV' + '_t' + strTrial
self.tmpTest = utils.MODEL_TMP_PATH + self.tag + \
'_test'+ '_t' + strTrial
'_test'+ '_t' + strTrial
self.runTrain = utils.MODEL_RUN_PATH + self.tag + \
'_train' + '_t' + strTrial
self.runCV = utils.MODEL_RUN_PATH + self.tag + \
Expand Down

0 comments on commit 0039fa2

Please sign in to comment.