Permalink
Browse files

1. Add neighborhood model

2. The functions in the neighborhood model can handle different situations
  • Loading branch information...
1 parent 17a8daa commit 41c7a18ea3cbdedafb7bb40487d645f543784118 @Ykid Ykid committed Aug 2, 2013
Showing with 321 additions and 262 deletions.
  1. +2 −1 config.py
  2. +40 −6 utils/SVDModel.py
  3. +0 −255 utils/neighborhood Functions.py
  4. +279 −0 utils/neighborhoodFunctions.py
View
@@ -8,7 +8,8 @@
#['rmt','FM','RelatedMovieTag',['2']]
#['basicSVD','SVD','Basic',[]]
- ['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
+ #['ImplicitFeedbackSVD','SVD','ImplicitFeedback',[]]
+ ['NeighborhoodMovieTag', 'SVD' , 'Neighborhood' , ['MovieTag']]
]
# Defining models:
View
@@ -1,19 +1,20 @@
from Model import Model
import ImplicitFeedbackFunctions as IFF #IFF for implicitFeedbackFunctions
+import neighborhoodFunctions as NF
class SVDModel(Model):
### Construct ###
def __init__(self,configModel,utils,config,strTrial):
- #This function is to set up different parameters only.
- Model.__init__(self,configModel,utils,strTrial) #come from model!it's another class object.
+ #This function is to set up different parameters.
+ Model.__init__(self,configModel,utils,strTrial)
self.configPath = utils.MODEL_CONFIG_PATH + self.tag + \
'_t' + strTrial
- self.userHistoryReindexPath= utils.MODEL_TMP_PATH + self.tag + \
- '_userHistoryReindex' + '_t' + strTrial
-
+
### Baidu Specific ###
### Implicit Feedback Files ###
+ self.userHistoryReindexPath= utils.MODEL_TMP_PATH + self.tag + \
+ '_userHistoryReindex' + '_t' + strTrial
#The following 3 files are implicit feature files
self.ImfeatTrain = utils.MODEL_FEATURED_PATH + self.tag + \
'_Imtrain' + '_t' + strTrial
@@ -32,6 +33,16 @@ def __init__(self,configModel,utils,config,strTrial):
### End Implicit Feature Files ###
self.regularizationFeedback = config.SVD_REGULARIZATION_FEEDBACK
+
+
+ ### Neighborhood Model Files###
+ if self.misc[0] == "MovieTag":
+ self.TagFilePath = self.movieTagPath
+ self.TagFileReindexPath = utils.MODEL_TMP_PATH + self.tag + \
+ '_' + self.misc[0] + '_t' + strTrial
+ self.ShareTagPath = utils.MODEL_TMP_PATH + self.tag + \
+ '_share_' + self.misc[0] + '_t' + strTrial
+ ### End Neighborhood Model Files###
### End Baidu Specific ###
self.numIter = config.SVD_NUM_ITER
@@ -180,7 +191,8 @@ def reIndex(self):
def dataConvert(self):
import os
- if self.featureSet == 'Basic':
+ if self.featureSet == 'Basic' or \
+ self.featureSet == 'Neighborhood':
os.system(self.SVDBufferPath + ' ' +
self.featTrain + ' ' + self.runTrain)
os.system(self.SVDBufferPath + ' ' +
@@ -238,6 +250,8 @@ def setupFeatures(self):
### Baidu Specific Features ###
if self.featureSet == 'ImplicitFeedback':
self.setupImplicitFeatures()
+ if self.featureSet == 'Neighborhood':
+ self.NeighborhoodSetup()
### End Baidu Specific Features ###
def basicConvert(self,fin,fout):
@@ -364,3 +378,23 @@ def setupImplicitFeatures(self):
self.formatType = 1
self.numUserFeedback = len(ItemDic)
+
+
+
+ def NeighborhoodSetup(self):
+ # Build the feature files for the 'Neighborhood' feature set by chaining three
+ # helpers from neighborhoodFunctions (imported as NF), then store the sizes
+ # they return on self (numUser, numMovie, numGlobal, avg).
+ # NOTE(review): the NF module added by this commit is not visible here; the
+ # step descriptions below are inferred from the helper names and arguments.
+ #second: NF.reIndex takes the boot train / tag / test / CV paths plus the tmp and
+ # reindex output paths; its three return values are unpacked below.
+ NSnoUser,NSnoMovie,NSAvg = NF.reIndex(self.bootTrain, self.TagFilePath, self.bootTest, self.bootCV, self.tmpTrain, self.TagFileReindexPath, self.tmpTest, self.tmpCV)
+
+ #third: NF.share reads the reindexed tag file and writes self.ShareTagPath
+ # (presumably the movie pairs that share tags -- confirm against NF).
+ NF.share(self.TagFileReindexPath,self.ShareTagPath)
+
+ #fourth: NF.neighborhood is called once with the train file and once with the CV
+ # file; only the first call's return value is kept.
+ NumGlobal = NF.neighborhood(self.tmpTrain, self.ShareTagPath, self.tmpTest, self.featTrain, self.featTest)
+ # NOTE(review): this second call passes self.tmpTest / self.featTest again. If
+ # NF.neighborhood writes its fifth argument (the pre-commit version did),
+ # featTest is rewritten from the CV data here -- confirm this is intended.
+ NF.neighborhood(self.tmpCV, self.ShareTagPath, self.tmpTest, self.featCV, self.featTest)
+
+ # set the parameters.
+ self.numUser = NSnoUser
+ self.numMovie = NSnoMovie
+ # one more than the count returned by the first NF.neighborhood call
+ self.numGlobal = NumGlobal + 1
+ self.avg = NSAvg
+
+
@@ -1,255 +0,0 @@
-import sys
def _iter_rating_rows(path):
    """Yield (uid, mid, rating) integer triples from a "uid mid rating" file."""
    with open(path, 'r') as src:
        for line in src:
            arr = line.split()
            if not arr:  # tolerate blank lines
                continue
            # Ratings may be written as "4.0"; they are truncated to int.
            yield int(arr[0]), int(arr[1]), int(float(arr[2]))


def _rewrite_with_known_ids(srcPath, dstPath, uidDic, midDic):
    """Copy a rating file, replacing ids with the ones already in the maps.

    Raises KeyError when a row mentions a user or movie that never appeared
    in the training file.
    """
    with open(dstPath, 'w') as dst:
        for uid, mid, rating in _iter_rating_rows(srcPath):
            dst.write('%d\t%d\t%d\n' % (uidDic[uid], midDic[mid], rating))


def reIndex(fin,gin,hin,CVfin,fout,gout,hout,CVfout):
    """Renumber users, movies and tags with consecutive ids starting at 0.

    Inputs (whitespace separated):
        fin   -- training file,   "uid mid rating"
        gin   -- movie-tag file,  "mid tag1,tag2,..."
        hin   -- prediction file, "uid mid rating"
        CVfin -- CV file,         "uid mid rating"
    Outputs (tab separated, same layouts with the new ids):
        fout, gout, hout, CVfout

    User and movie ids are assigned in order of first appearance in the
    training file; tag ids in order of first appearance in the movie-tag
    file. Movies that do not occur in the training file are dropped from the
    tag output, as are tag lines without a tag column.

    Returns:
        (noUser, noMovie) -- number of distinct users and movies in training.

    Raises:
        KeyError -- if the CV or prediction file mentions a user or movie
                    that is absent from the training file.
    """
    uidDic = {}  # original uid -> reindexed uid
    midDic = {}  # original mid -> reindexed mid
    tidDic = {}  # original tag -> reindexed tag id

    # Training file: the only place where user and movie ids are assigned.
    with open(fout, 'w') as TrainFileReindex:
        for uid, mid, rating in _iter_rating_rows(fin):
            newuid = uidDic.setdefault(uid, len(uidDic))
            newmid = midDic.setdefault(mid, len(midDic))
            TrainFileReindex.write('%d\t%d\t%d\n' % (newuid, newmid, rating))

    # CV file (the original opened the undefined name CVin here).
    _rewrite_with_known_ids(CVfin, CVfout, uidDic, midDic)

    # Movie-tag file.
    with open(gin, 'r') as MovieTagFile, open(gout, 'w') as MovieTagFileReindex:
        for line in MovieTagFile:
            arr = line.split()
            if not arr:
                continue
            mid = int(arr[0])
            if mid not in midDic or len(arr) < 2:
                continue
            tags = [tidDic.setdefault(tid, len(tidDic))
                    for tid in arr[1].split(',')]
            # join() puts a comma between every tag, including repeated ones.
            MovieTagFileReindex.write(
                '%d\t%s\n' % (midDic[mid], ','.join(str(t) for t in tags)))

    # Prediction file.
    _rewrite_with_known_ids(hin, hout, uidDic, midDic)

    return (len(uidDic), len(midDic))
-
def share(fin,fout,minShared=10):
    """Write the movie pairs that have at least minShared tags in common.

    fin  -- reindexed movie-tag file, one "mid tag1,tag2,..." line per movie
    fout -- output file, one "mid1<TAB>mid2" line per qualifying pair, with
            mid1 < mid2 and pairs written in ascending order
    minShared -- minimum number of common tags for a pair to be written
                 (default 10, the value that used to be hard-coded)

    Every pair of movies present in fin is compared, so gaps in the movie
    ids do not hide any pair. Returns None.
    """
    from itertools import combinations

    tagSets = {}  # mid -> set of its tag strings
    with open(fin, 'r') as fi:
        for line in fi:
            arr = line.split()
            if not arr:  # tolerate blank lines
                continue
            # A movie listed without a tag column simply has no tags.
            tags = arr[1].split(',') if len(arr) > 1 else []
            tagSets[int(arr[0])] = set(tags)

    with open(fout, 'w') as fo:
        # combinations() over the sorted ids yields each unordered pair once,
        # smaller id first.
        for mid, other in combinations(sorted(tagSets), 2):
            if len(tagSets[mid] & tagSets[other]) >= minShared:
                fo.write('%s\t%s\n' % (mid, other))
-
def neighborhood(fin,gin,hin,fout,gout):
    """Convert rating files into feature lines with neighbourhood features.

    fin  -- training file, "uid mid rating"
    gin  -- shared-tag movie pairs, "mid1 mid2" (as written by share())
    hin  -- test file, "uid mid rating"
    fout -- reformatted training set
    gout -- reformatted test set

    Each output line is
        rating <TAB> k <TAB> 1 <TAB> 1 <TAB> [idx:value <TAB>]*k uid:1 <TAB> mid:1
    where the k neighbourhood features belong to the movies listed next to
    `mid` in gin that the same user also rated in the training file, and
    value is that rating minus the user's mean training rating.

    Training lines are written grouped by user, not in input order. A test
    row whose user never occurs in the training file gets zero features.

    Returns:
        the total number of neighbourhood features emitted for the training
        set (also printed).
    """
    TagDic = {}          # mid -> movies listed as its neighbours in gin
    UserMovieDic = {}    # uid -> movies rated in training, in file order
    WatchedDic = {}      # uid -> the same movies as a set, for fast lookup
    RatingDic = {}       # (uid, mid) -> training rating
    RatingSum = {}       # uid -> sum of training ratings
    AvgDic = {}          # uid -> mean training rating
    IndexCorresDic = {}  # (mid, neighbour) -> last feature index assigned

    with open(gin, 'r') as ShareTag:
        for line in ShareTag:
            arr = line.split()
            if not arr:
                continue
            TagDic.setdefault(int(arr[0]), []).append(int(arr[1]))

    with open(fin, 'r') as TrainingFile:
        for line in TrainingFile:
            arr = line.split()
            if not arr:
                continue
            uid = int(arr[0])
            mid = int(arr[1])
            rating = int(float(arr[2]))
            RatingDic[(uid, mid)] = rating
            UserMovieDic.setdefault(uid, []).append(mid)
            RatingSum[uid] = RatingSum.get(uid, 0) + rating

    for uid, mids in UserMovieDic.items():
        WatchedDic[uid] = set(mids)
        # float() keeps the mean exact under Python 2 integer division.
        AvgDic[uid] = float(RatingSum[uid]) / len(mids)

    b = 0  # running feature index == number of features emitted so far
    with open(fout, 'w') as TrainingFile_reformated:
        for uid, mids in UserMovieDic.items():
            watched = WatchedDic[uid]
            avg = AvgDic[uid]
            for mid in mids:
                feats = []
                for movie in TagDic.get(mid, ()):
                    if movie in watched:
                        # The index grows with every (user, movie, neighbour)
                        # occurrence, so a pair seen for several users gets
                        # several ids and the map keeps the last one. Kept
                        # exactly as the original numbering.
                        b += 1
                        IndexCorresDic[(mid, movie)] = b
                        feats.append(
                            '%d:%f\t' % (b, RatingDic[(uid, movie)] - avg))
                TrainingFile_reformated.write(
                    '%d\t%d\t1\t1\t' % (RatingDic[(uid, mid)], len(feats)))
                TrainingFile_reformated.write(''.join(feats))
                TrainingFile_reformated.write('%d:1\t%d:1\n' % (uid, mid))

    with open(hin, 'r') as TestFile, open(gout, 'w') as TestFile_reformated:
        for line in TestFile:
            arr = line.split()
            if not arr:
                continue
            uid = int(arr[0])
            mid = int(arr[1])
            rating = int(float(arr[2]))
            # Users unseen in training have no history, hence no features.
            watched = WatchedDic.get(uid, ())
            feats = []
            for movie in TagDic.get(mid, ()):
                if (mid, movie) in IndexCorresDic and movie in watched:
                    feats.append('%d:%f\t' % (
                        IndexCorresDic[(mid, movie)],
                        RatingDic[(uid, movie)] - AvgDic[uid]))
            TestFile_reformated.write(
                '%d\t%d\t1\t1\t' % (rating, len(feats)))
            TestFile_reformated.write(''.join(feats))
            TestFile_reformated.write('%d:1\t%d:1\n' % (uid, mid))

    print('The number of global feature is %d' %b)
    return(b)
-
Oops, something went wrong.

0 comments on commit 41c7a18

Please sign in to comment.