In [1]:
import gzip
import numpy as np
from collections import defaultdict

def readGz(f):
    for l in gzip.open(f):
        yield eval(l)

### Rating baseline: compute averages for each user, or return the global average if we've never seen the user before

allRatings = []
userRatings = defaultdict(list)
for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    allRatings.append(l['rating'])
    userRatings[user].append(l['rating'])

globalAverage = sum(allRatings) / len(allRatings)
userAverage = {}
for u in userRatings:
    userAverage[u] = sum(userRatings[u]) / len(userRatings[u])

predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if u in userAverage:
        predictions.write(u + '-' + i + ',' + str(userAverage[u]) + '\n')
    else:
        predictions.write(u + '-' + i + ',' + str(globalAverage) + '\n')

predictions.close()

In [2]:
### Would-visit baseline: just rank which businesses are popular and which are not, and return '1' if a business is among the top-ranked
businessCount = defaultdict(int)
totalPurchases = 0

for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    businessCount[business] += 1
    totalPurchases += 1

mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()

return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPurchases/2: break

predictions = open("predictions_Visit.txt", 'w')
for l in open("pairs_Visit.txt"):
    if l.startswith("userID"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    if i in return1:
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")

predictions.close()

In [3]:
### Category prediction baseline: Just consider some of the most common words from each category

catDict = {
  "American Restaurant": 0,
  "Bar": 1,
  "Asian Restaurant": 2,
  "European Restaurant": 3,
  "Italian Restaurant": 4,
  "Fast Food Restaurant": 5,
  "Mexican Restaurant": 6,
  "Seafood Restaurant": 7,
  "Coffee Shop": 8,
  "Sandwich Shop": 9
}

predictions = open("predictions_Category.txt", 'w')
predictions.write("userID-reviewHash,category\n")
for l in readGz("test_Category.json.gz"):
    cat = catDict['American Restaurant'] # If there's no evidence, just choose the most common category in the dataset
    words = l['reviewText'].lower()
    if 'america' in words:
        cat = catDict['American Restaurant']
    if 'bar' in words or 'beer' in words:
        cat = catDict['Bar']
    if 'asia' in words:
        cat = catDict['Asian Restaurant']
    if 'europe' in words:
        cat = catDict['European Restaurant']
    if 'italian' in words:
        cat = catDict['Italian Restaurant']
    if 'fast' in words:
        cat = catDict['Fast Food Restaurant']
    if 'mexic' in words:
        cat = catDict['Mexican Restaurant']
    if 'coffee' in words:
        cat = catDict['Coffee Shop']
    if 'sandwich' in words:
        cat = catDict['Sandwich Shop']
    predictions.write(l['userID'] + '-' + l['reviewHash'] + "," + str(cat) + "\n")

predictions.close()


In [2]:
########Q1
data=list(readGz("train.json.gz"))
trainset=data[0:100000]
validationset=data[100000:200000]

In [3]:
###baseline 
businessCount = defaultdict(int)
userCount=defaultdict(int)
totalPurchases = 0

for l in readGz("train.json.gz"):
    user,business = l['userID'],l['businessID']
    businessCount[business] += 1
    userCount[user]+=1
    totalPurchases += 1

mostPopular = [(businessCount[x], x) for x in businessCount]
mostPopular.sort()
mostPopular.reverse()

In [4]:
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPurchases/2: break

In [5]:
data1=[]
temp=[]
for l in data:
    dic={}
    user,business = l['userID'],l['businessID']
    dic['userID']=l['userID']
    dic['businessID']=l['businessID']
    dic['isvisit']=1
    data1.append(dic)
    temp.append([user,business ])

In [6]:
businesslist = []
userlist=[]
for l in data:
    user,business = l['userID'],l['businessID']
    businesslist.append(business)
    userlist.append(user)
businesslist=list(set(businesslist))  
userlist=list(set(userlist))    

In [8]:
###random create validation set
import random
validset=[]
while len(validset)<100000:
    dic={}
    i=random.randint(0,len(userlist)-1)
    user=userlist[i]
    j=random.randint(0,len(businesslist)-1)
    business=businesslist[j]
    temp1=[user,business]
    if temp1 not in temp:
        dic['userID']=user
        dic['businessID']=business
        dic['isvisit']=0
        validset.append(dic)
#combine validset
valid=data1[100000:200000]+validset        
    

In [11]:
##calculate accuracy of baseline on validation set
num=0
for l in valid:
    u=l['businessID']
    if u in return1:
        y=1
    else:
        y=0
    if y==l['isvisit']:  #count the correct pairs
        num+=1
acc=num*1.0/len(valid)
print "when threshold is 50th percentile"
print "accuracy is",acc

when threshold is 50th percentile
accuracy is 0.650585


In [30]:
# change threshold from 50th percentile of popularity to 67th percentile of popularity
return1 = set()
count = 0
for ic, i in mostPopular:
    count += ic
    return1.add(i)
    if count > totalPurchases/1.5: break
##calculate accuracy of baseline on validation set
num=0
for l in valid:
    u=l['businessID']
    if u in return1:
        y=1
    else:
        y=0
    if y==l['isvisit']:  #count the correct pairs
        num+=1
acc=num*1.0/len(valid)
print "when threshold is 67th percentile"
print "accuracy is",acc

when threshold is 67th percentile
accuracy is 0.653975


I change the threshold from 50th percentile of popularity to 67th percentile of popularity. The accuracy slightly increases.

In [7]:
#####Q3
usercategorydic=defaultdict(list)
categorydic = defaultdict(list)
for l in data:
    user,business,categories = l['userID'],l['businessID'],l['categories']
    usercategorydic[user].append(categories)
    categorydic[business]=(categories)
 

In [8]:
y=[1]*100000+[0]*100000 #create y for valid set
y_pred=[0]*len(valid)
for i,l in enumerate(valid):
    user,business = l['userID'],l['businessID']
    #print user,business
    if type(usercategorydic[user])!=int and type(categorydic[business])!=int:
        for cat in (categorydic[business]):
            for i in range(len(usercategorydic[user])):
                if cat in usercategorydic[user][i]:
                    y_pred[i]=1  #calculate y_pred
                

NameError: name 'valid' is not defined

In [22]:
#calculate accuracy
correct = [(a==b) for (a,b) in zip(y_pred,y)]
acc = sum(correct) * 1.0 / len(correct)
print "accuracy is",acc


accuracy is 0.502855


In [24]:
#####Q4 input the model to pairs_Visit.txt and submit on Kaggle
predictions = open("predictions_Visit.txt", 'w')
c=0
for l in open("pairs_Visit.txt"):
    if l.startswith("userID"):
    #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    y=0
    #if type(usercategorydic[u])!=int and type(categorydic[i])!=int:
    for cat in (categorydic[i]):
        for j in range(len(usercategorydic[u])):
            if cat in usercategorydic[u][j]:
                 y=1
    if y==1:
        predictions.write(u + '-' + i + ",1\n")
    else:
        predictions.write(u + '-' + i + ",0\n")
    c+=1

predictions.close()

My Kaggle name is Chao Yu

In [9]:
####### Q5
import pandas as pd
sumofrating=0
for l in trainset:
    rating=l['rating']
    sumofrating+=rating
alpha=sumofrating/len(trainset)
print "alpha is",alpha

alpha is 4.18703


In [10]:
import math
MSE=0
for l in validationset:
    rating=l['rating']
    MSE+=(alpha-rating)**2
MSE=MSE/len(validationset)
print "MSE on the validation set is",MSE

MSE on the validation set is 0.748343744499


In [11]:
businesslist = []
userlist=[]
for l in validationset:
    user,business = l['userID'],l['businessID']
    businesslist.append(business)
    userlist.append(user)
businesslist=list(set(businesslist))  
userlist=list(set(userlist))    

In [96]:
#######Q6
import random
df=pd.DataFrame(trainset)
userRatings_train = defaultdict(lambda: defaultdict(int))
businessRatings_train = defaultdict(lambda: defaultdict(int))
beta_u = defaultdict(float)
beta_i = defaultdict(float)
gamma_u= defaultdict(float)##gamma
gamma_i= defaultdict(float)##gamma

for l in trainset:
    user,business,rating = l['userID'],l['businessID'],l['rating']
    userRatings_train[user][business] = rating
    businessRatings_train[business][user] = rating
    #random initialization
    beta_u[user] = random.randint(1,100)/1000.0  
    beta_i[business] = random.randint(1,100)/1000.0
    gamma_u[user] = random.randint(1,100)/1000.0  ##gamma
    gamma_i[business] = random.randint(1,100)/1000.0  ###gamma


  

In [58]:
df=pd.DataFrame(validationset)
sum(df["rating"])


418194.0

In [44]:
df=pd.DataFrame(data)

sum(df["rating"])


836897.0

In [60]:
lamda = 1
iteration = 0
while iteration <= 100:
    # update alpha
    
    alpha=sum(df["rating"])
    for user in userRatings_train:
        for business in userRatings_train[user]:
            alpha -= (beta_u[user] + beta_i[business])
    alpha /= len(trainset)
    # update beta_u
    for user in userRatings_train:
        beta_u[user] = 0
        for business in userRatings_train[user]:
            beta_u[user] += userRatings_train[user][business] \
                    - (alpha + beta_i[business])
        beta_u[user] /= (lamda + len(userRatings_train[user])) 
    # update beta_i
    for business in businessRatings_train:
        beta_i[business] = 0
        for user in businessRatings_train[business]:
            beta_i[business] += businessRatings_train[business][user] \
                    - (alpha + beta_u[user])
        beta_i[business] /= (lamda + len(businessRatings_train[business]))
    # update gamma_u
    iteration += 1

In [61]:
alpha

4.2247239894318023

In [64]:
###add beta gamma
lamda = 1
iteration = 0
while iteration <= 100:
    # update alpha
    alpha=sum(df["rating"])
    for user in userRatings_train:
        for business in userRatings_train[user]:
            alpha -= (beta_u[user] + beta_i[business]+gamma_u[user]*gamma_i[business])
    alpha /= len(trainset)
    # update beta_u
    for user in userRatings_train:
        beta_u[user] = 0
        for business in userRatings_train[user]:
            beta_u[user] += userRatings_train[user][business] \
                    - (alpha + beta_i[business]+gamma_u[user]*gamma_i[business])
        beta_u[user] /= (lamda + len(userRatings_train[user])) 
    # update beta_i
    for business in businessRatings_train:
        beta_i[business] = 0
        for user in businessRatings_train[business]:
            beta_i[business] += businessRatings_train[business][user] \
                    - (alpha + beta_u[user])
        beta_i[business] /= (lamda + len(businessRatings_train[business]))
    # update gamma_u
    for user in userRatings_train:
        gamma_u[user]=0
        sumofgamma_i=0
        for business in userRatings_train[user]:
            sumofgamma_i+=gamma_i[business]**2
            gamma_u[user] += (userRatings_train[user][business] \
                -(alpha + beta_i[business]+beta_u[business]))*gamma_i[business]
            gamma_u[user]/=(lamda+sumofgamma_i)
    # update gamma_i
    for business in businessRatings_train:
        gamma_i[business]=0
        sumofgamma_u=0
        for user in businessRatings_train[business]:
            sumofgamma_u+=gamma_u[user]**2
            gamma_i[business] += (userRatings_train[user][business] \
                -(alpha + beta_i[business]+beta_u[business]))*gamma_u[user]
            gamma_i[business]/=(lamda+sumofgamma_u)
    iteration += 1

In [62]:
### calculate MSE on validation set
userRatings_valid = defaultdict(lambda: defaultdict(int))
businessRatings_valid = defaultdict(lambda: defaultdict(int))

for l in validationset:
    user,business,rating = l['userID'],l['businessID'],l['rating']
    userRatings_valid[user][business] = rating
    businessRatings_valid[business][user] = rating
MSE = 0
for user in userRatings_valid:
    for business in userRatings_valid[user]:
        x=alpha
        if user in beta_u:
            x+=beta_u[user]
        if business in beta_i:
            x+=beta_i[business]
        MSE+=((x-userRatings_valid[user][business])**2)

MSE /= len(validationset)
print "MSE on validation set",MSE


MSE on validation set 0.645692842023


In [65]:
### calculate MSE on validation set with gamma
userRatings_valid = defaultdict(lambda: defaultdict(int))
businessRatings_valid = defaultdict(lambda: defaultdict(int))

for l in validationset:
    user,business,rating = l['userID'],l['businessID'],l['rating']
    userRatings_valid[user][business] = rating
    businessRatings_valid[business][user] = rating
MSE = 0
for user in userRatings_valid:
    for business in userRatings_valid[user]:
        x=alpha
        if user in beta_u:
            x+=beta_u[user]
        if business in beta_i:
            x+=beta_i[business]
        if business in gamma_i and user in gamma_u:
            x+=gamma_i[business]*gamma_u[user]
        MSE+=((x-userRatings_valid[user][business])**2)

MSE /= len(validationset)
print "MSE on validation set",MSE

MSE on validation set 0.644908729734


In [63]:
####Q7
import operator
sorted_beta_u=sorted(beta_u.items(), key=operator.itemgetter(1))
sorted_beta_i=sorted(beta_i.items(), key=operator.itemgetter(1))
print "user id with maximum beta is",sorted_beta_u[-1][0]
print "maximum beta is",sorted_beta_u[-1][1]
print "user id with minimum beta is",sorted_beta_u[0][0]
print "minimum beta is",sorted_beta_u[0][1]
print "business id with maximum beta is",sorted_beta_i[-1][0]
print "maximum beta is",sorted_beta_i[-1][1]
print "business id with minimum beta is",sorted_beta_i[0][0]
print "minimum beta is",sorted_beta_i[0][1]

user id with maximum beta is U357799541
maximum beta is 1.16295334636
user id with minimum beta is U417838537
minimum beta is -2.83446271589
business id with maximum beta is B093985406
maximum beta is 1.16839558382
business id with minimum beta is B241777680
minimum beta is -2.23561053847


In [97]:
lamdaset=[4.15,4.1]
for lamda in lamdaset:
    iteration = 0
    while iteration <= 100:
    # update alpha
        alpha=sum(df["rating"])
        for user in userRatings_train:
            for business in userRatings_train[user]:
                alpha -= (beta_u[user] + beta_i[business])
        alpha /= len(trainset)
        # update beta_u
        for user in userRatings_train:
            beta_u[user] = 0
            for business in userRatings_train[user]:
                beta_u[user] += userRatings_train[user][business] \
                        - (alpha + beta_i[business])
            beta_u[user] /= (lamda + len(userRatings_train[user])) 
        # update beta_i
        for business in businessRatings_train:
            beta_i[business] = 0
            for user in businessRatings_train[business]:
                beta_i[business] += businessRatings_train[business][user] \
                        - (alpha + beta_u[user])
            beta_i[business] /= (lamda + len(businessRatings_train[business]))
        # MSE
        MSE = 0
        for user in userRatings_train:
            for business in userRatings_train[user]:
                MSE += (alpha + beta_u[user] + beta_i[business] 
                        - userRatings_train[user][business]) **2
        MSE /= len(trainset)
        if iteration==100:
            print "lamda is", lamda
            print "MSE for training set is", MSE
        iteration+=1
    ### calculate MSE on validation set
    userRatings_valid = defaultdict(lambda: defaultdict(int))
    businessRatings_valid = defaultdict(lambda: defaultdict(int))

    for l in validationset:
        user,business,rating = l['userID'],l['businessID'],l['rating']
        userRatings_valid[user][business] = rating
        businessRatings_valid[business][user] = rating
    MSE = 0
    for user in userRatings_valid:
        for business in userRatings_valid[user]:
            x=alpha
            if user in beta_u:
                x+=beta_u[user]
            if business in beta_i:
                x+=beta_i[business]
            MSE+=(x-userRatings_valid[user][business])**2

    MSE /= len(validationset)
    print "MSE for validation set is",MSE
  

lamda is 4.15
MSE for training set is 0.410408887916
MSE for validation set is 0.607999541742
lamda is 4.1
MSE for training set is 0.409451883308
MSE for validation set is 0.607994406249


In [89]:
lamdaset=[4.2,4.1]
for lamda in lamdaset:
    ###add beta gamma
    iteration = 0
    while iteration <= 100:
        # update alpha
        alpha=sum(df["rating"])
        for user in userRatings_train:
            for business in userRatings_train[user]:
                alpha -= (beta_u[user] + beta_i[business]+gamma_u[user]*gamma_i[business])
        alpha /= len(trainset)
        # update beta_u
        for user in userRatings_train:
            beta_u[user] = 0
            for business in userRatings_train[user]:
                beta_u[user] += userRatings_train[user][business] \
                        - (alpha + beta_i[business]+gamma_u[user]*gamma_i[business])
            beta_u[user] /= (lamda + len(userRatings_train[user])) 
        # update beta_i
        for business in businessRatings_train:
            beta_i[business] = 0
            for user in businessRatings_train[business]:
                beta_i[business] += businessRatings_train[business][user] \
                        - (alpha + beta_u[user])
            beta_i[business] /= (lamda + len(businessRatings_train[business]))
        # update gamma_u
        for user in userRatings_train:
            gamma_u[user]=0
            sumofgamma_i=0
            for business in userRatings_train[user]:
                sumofgamma_i+=gamma_i[business]**2
                gamma_u[user] += (userRatings_train[user][business] \
                    -(alpha + beta_i[business]+beta_u[business]))*gamma_i[business]
                gamma_u[user]/=(lamda+sumofgamma_i)
        # update gamma_i
        for business in businessRatings_train:
            gamma_i[business]=0
            sumofgamma_u=0
            for user in businessRatings_train[business]:
                sumofgamma_u+=gamma_u[user]**2
                gamma_i[business] += (userRatings_train[user][business] \
                    -(alpha + beta_i[business]+beta_u[business]))*gamma_u[user]
                gamma_i[business]/=(lamda+sumofgamma_u)
        # MSE
        MSE = 0
        for user in userRatings_train:
            for business in userRatings_train[user]:
                MSE += (alpha + beta_u[user] + beta_i[business]+gamma_u[user]*gamma_i[business]\
                        - userRatings_train[user][business]) **2
        MSE /= len(trainset)
        if iteration==100:
            print "lamda is", lamda
            print "MSE for training set is", MSE
        iteration+=1
    ### calculate MSE on validation set
    userRatings_valid = defaultdict(lambda: defaultdict(int))
    businessRatings_valid = defaultdict(lambda: defaultdict(int))

    for l in validationset:
        user,business,rating = l['userID'],l['businessID'],l['rating']
        userRatings_valid[user][business] = rating
        businessRatings_valid[business][user] = rating
    MSE = 0
    for user in userRatings_valid:
        for business in userRatings_valid[user]:
            x=alpha
            if user in beta_u:
                x+=beta_u[user]
            if business in beta_i:
                x+=beta_i[business]
            if business in gamma_i and user in gamma_u:
                x+=gamma_i[business]*gamma_u[user]
            
            MSE+=(x-userRatings_valid[user][business])**2

    MSE /= len(validationset)
    print "MSE for validation set is",MSE
  

lamda is 4.2
MSE for training set is 0.411358466701
MSE for validation set is 0.608009992471
lamda is 4.1
MSE for training set is 0.409451883308
MSE for validation set is 0.607994406249


We can see when lamda is 10, the MSE for validation set is lowest, which is 0.621537959215.

In [25]:
lamda = 7
iteration = 0
while iteration <= 100:
    # update alpha
    
    alpha=sum(df["rating"])
    for user in userRatings_train:
        for business in userRatings_train[user]:
            alpha -= (beta_u[user] + beta_i[business])
    alpha /= len(trainset)
    # update beta_u
    for user in userRatings_train:
        beta_u[user] = 0
        for business in userRatings_train[user]:
            beta_u[user] += userRatings_train[user][business] \
                    - (alpha + beta_i[business])
        beta_u[user] /= (lamda + len(userRatings_train[user])) 
    # update beta_i
    for business in businessRatings_train:
        beta_i[business] = 0
        for user in businessRatings_train[business]:
            beta_i[business] += businessRatings_train[business][user] \
                    - (alpha + beta_u[user])
        beta_i[business] /= (lamda + len(businessRatings_train[business]))
    # MSE
    MSE = 0
    for user in userRatings_train:
        for business in userRatings_train[user]:
            MSE += (alpha + beta_u[user] + beta_i[business] 
                    - userRatings_train[user][business]) **2
    MSE /= len(trainset)
    iteration += 1


In [39]:
predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    x=alpha
    if u in beta_u:
        x += beta_u[u]
    if i in beta_i:
        x += beta_i[i] 
        
    predictions.write(u + '-' + i + ',' + str(x) + '\n')
    
predictions.close()

In [91]:
userRatings = defaultdict(lambda: defaultdict(int))
businessRatings = defaultdict(lambda: defaultdict(int))
beta_u = defaultdict(float)
beta_i = defaultdict(float)
gamma_u= defaultdict(float)##gamma
gamma_i= defaultdict(float)##gamma

for l in data:
    user,business,rating = l['userID'],l['businessID'],l['rating']
    userRatings[user][business] = rating
    businessRatings[business][user] = rating
    #random initialization
    beta_u[user] = random.randint(1,100)/1000.0  
    beta_i[business] = random.randint(1,100)/1000.0
    gamma_u[user] = random.randint(1,100)/1000.0  ##gamma
    gamma_i[business] = random.randint(1,100)/1000.0  ###gamma

In [39]:
###### Use all data to make prediction
df=pd.DataFrame(data)
lamda = 4
iteration = 0
while iteration <= 200:
    # update alpha
    
    alpha=sum(df["rating"])
    for user in userRatings:
        for business in userRatings[user]:
            alpha -= (beta_u[user] + beta_i[business])
    alpha /= len(data)
    # update beta_u
    for user in userRatings:
        beta_u[user] = 0
        for business in userRatings[user]:
            beta_u[user] += userRatings[user][business] \
                    - (alpha + beta_i[business])
        beta_u[user] /= (lamda + len(userRatings[user])) 
    # update beta_i
    for business in businessRatings:
        beta_i[business] = 0
        for user in businessRatings[business]:
            beta_i[business] += businessRatings[business][user] \
                    - (alpha + beta_u[user])
        beta_i[business] /= (lamda + len(businessRatings[business]))
    iteration += 1

In [41]:
predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    x=alpha
    if u in beta_u:
        x += beta_u[u]
    if i in beta_i:
        x += beta_i[i] 
        
    predictions.write(u + '-' + i + ',' + str(x) + '\n')
    
predictions.close()

In [92]:
###### Use all data to make prediction gamma
df=pd.DataFrame(data)
lamda = 4.1
iteration = 0
while iteration <= 200:
    # update alpha
    alpha=sum(df["rating"])
    for user in userRatings:
        for business in userRatings[user]:
            alpha -= (beta_u[user] + beta_i[business]+gamma_u[user]*gamma_i[business])
    alpha /= len(data)
    # update beta_u
    for user in userRatings:
        beta_u[user] = 0
        for business in userRatings[user]:
            beta_u[user] += userRatings[user][business] \
                    - (alpha + beta_i[business]+gamma_u[user]*gamma_i[business])
        beta_u[user] /= (lamda + len(userRatings[user])) 
    # update beta_i
    for business in businessRatings:
        beta_i[business] = 0
        for user in businessRatings[business]:
            beta_i[business] += businessRatings[business][user] \
                    - (alpha + beta_u[user])
        beta_i[business] /= (lamda + len(businessRatings[business]))
    # update gamma_u
    for user in userRatings:
        gamma_u[user]=0
        sumofgamma_i=0
        for business in userRatings[user]:
            sumofgamma_i+=gamma_i[business]**2
            gamma_u[user] += (userRatings[user][business] \
                -(alpha + beta_i[business]+beta_u[business]))*gamma_i[business]
            gamma_u[user]/=(lamda+sumofgamma_i)
    # update gamma_i
    for business in businessRatings:
        gamma_i[business]=0
        sumofgamma_u=0
        for user in businessRatings[business]:
            sumofgamma_u+=gamma_u[user]**2
            gamma_i[business] += (userRatings[user][business] \
                -(alpha + beta_i[business]+beta_u[business]))*gamma_u[user]
            gamma_i[business]/=(lamda+sumofgamma_u)
    iteration += 1
    

In [93]:
alpha

4.2249640440391723

In [94]:
beta_u

defaultdict(float,
            {'U187629220': 0.31310733045237638,
             'B807454692': 0.0,
             'U774608854': -0.14005309848945927,
             'B922664643': 0.0,
             'U482484447': -0.13629152706192238,
             'B940798761': 0.0,
             'B690263576': 0.0,
             'U377191451': -0.064534615188329203,
             'B264089981': 0.0,
             'B805213437': 0.0,
             'U624884435': 0.14343844360589839,
             'B562730709': 0.0,
             'U745072930': 0.27925624645109715,
             'U999540200': -0.22990085848496597,
             'U653154217': -0.16276909957655186,
             'U282622976': -0.28874197585692851,
             'B534912182': 0.0,
             'B308112546': 0.0,
             'B687352747': 0.0,
             'B738688496': 0.0,
             'U824003814': -0.29917191288777983,
             'U283552244': 0.39616317087100789,
             'B873107816': 0.0,
             'B556879074': 0.0,
             'U568566804': -0

In [95]:
predictions = open("predictions_Rating.txt", 'w')
for l in open("pairs_Rating.txt"):
    if l.startswith("userID"):
        #header
        predictions.write(l)
        continue
    u,i = l.strip().split('-')
    x=alpha
    if u in beta_u:
        x += beta_u[u]
    if i in beta_i:
        x += beta_i[i] 
    if i in gamma_i and u in gamma_u:
        x+=gamma_i[i]*gamma_u[u]
        
    predictions.write(u + '-' + i + ',' + str(x) + '\n')
    
predictions.close()