In [1]:
import pandas as pd
from matplotlib.pyplot import figure
import itertools
from tqdm import tqdm
from collections import defaultdict
import numpy as np
from random import shuffle
from surprise.model_selection import KFold

In [3]:
def get_data(filename=  'data_pos_neg_neu.txt'):
    '''
    read the list of dict for postive,negative,neutral feedback
    '''
    data = []
    with open(filename) as f:
        for i, line in enumerate(f):
            try:
                d = eval(line)
            except:
                print("*****Error*****")
                print(i,line)
                print("*********END of ERROR *******")
                break
            else:
                data.append(d)
    return data

In [4]:
data = get_data()

In [5]:
result = []
for j in range(len(data)):    
    for i in data[j]['pos']:
        result.append(('user'+str(j),i,1))
    for i in data[j]['neg']:
        result.append(('user'+str(j),i,-1))
    for i in data[j]['neu']:
        result.append(('user'+str(j),i,0))

In [6]:
shuffle(result) # 打乱数据

In [7]:
# 用dict表示
c = []
for line in tqdm(result):
    d = dict(zip(['user','product','rating'],[]))
    d['user'] = line[0]
    d['product'] = line[1]
    d['rating'] = line[2]
    c.append(d)

100%|████████████████████████████| 1859626/1859626 [00:01<00:00, 1280887.57it/s]


In [8]:
# 抄的老师的workbook4
productsPerUser = defaultdict(set)
itemsPerUser = defaultdict(set)
usersPerItem = defaultdict(set)
ratingDict = {}

In [9]:
# 把一位用户对应的所有产品整理出来
for d in c:
    user,item,rating = d['user'],d['product'],d['rating']
    productsPerUser[user].add((item,rating))
    itemsPerUser[user].add(item)
    usersPerItem[item].add(user)
    ratingDict[(user,item)] = rating

In [10]:
# 计算user average 和 item average
userAverages = {}
itemAverages = {}

for u in itemsPerUser:
    rs = [ratingDict[(u,i)] for i in itemsPerUser[u]]
    userAverages[u] = sum(rs) / len(rs)
    
for i in usersPerItem:
    rs = [ratingDict[(u,i)] for u in usersPerItem[i]]
    itemAverages[i] = sum(rs) / len(rs)

In [11]:
# 查看一共有多少pos，neg，neu
pos = 0
neg = 0
neu = 0
for i in tqdm([i for i in productsPerUser]):
    for j in productsPerUser[i]:
        if j[1] == 1:
            pos += 1
        elif j[1] == -1:
            neg += 1
        else:
            neu += 1

100%|█████████████████████████████████| 83384/83384 [00:00<00:00, 195443.56it/s]


In [12]:
pos,neg,neu

(859773, 572505, 427348)

In [13]:
# similarity functions
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

def CosineSet(s1, s2):
    # Not a proper implementation, operates on sets so correct for interactions only
    numer = len(s1.intersection(s2))
    denom = math.sqrt(len(s1)) * math.sqrt(len(s2))
    if denom == 0:
        return 0
    return numer / denom

def Cosine(i1, i2):
    # Between two items
    inter = usersPerItem[i1].intersection(usersPerItem[i2])
    numer = 0
    denom1 = 0
    denom2 = 0
    for u in inter:
        numer += ratingDict[(u,i1)]*ratingDict[(u,i2)]
    for u in usersPerItem[i1]:
        denom1 += ratingDict[(u,i1)]**2
    for u in usersPerItem[i2]:
        denom2 += ratingDict[(u,i2)]**2
    denom = math.sqrt(denom1) * math.sqrt(denom2)
    if denom == 0: return 0
    return numer / denom


In [14]:
# define the similarity function
def mostSimilar(i, N):
    similarities = []
    items = itemsPerUser[i]
    for i2 in itemsPerUser:
        if i2 == i: continue
        sim = Jaccard(items, itemsPerUser[i2])
        #sim = Pearson(i, i2) # Could use alternate similarity metrics straightforwardly
        similarities.append((sim,i2))
    similarities.sort(reverse=True)
    return similarities[:N]

In [15]:
# find top 10 similar users with user10
mostSimilar('user2',10)

[(0.6190476190476191, 'user16778'),
 (0.5909090909090909, 'user5807'),
 (0.5652173913043478, 'user22033'),
 (0.5517241379310345, 'user82333'),
 (0.5454545454545454, 'user19530'),
 (0.5357142857142857, 'user76767'),
 (0.5357142857142857, 'user70477'),
 (0.5217391304347826, 'user29118'),
 (0.52, 'user80000'),
 (0.52, 'user38407')]

In [16]:
# 所有的rating mean
ratingMean = sum([d[2] for d in result])/len(result)

In [17]:

ratingMean

0.15447622263831545

In [18]:
# 老师的代码
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

In [19]:
# 老师的代码
for d in c:
    user,item = d['user'], d['product']
    reviewsPerUser[user].append(d)
    reviewsPerItem[item].append(d)

In [20]:
# 老师的代码
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['product']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        return ratingMean

In [21]:
# 预测用户1对python 的评分
predictRating('user1','Python')

-0.6175758828930579

In [22]:
# 找出所有的产品
all_products = np.array([i[1] for i in result])

In [23]:
# 找出所有产品的unique，去掉重复的
all_products = np.unique(all_products)

In [24]:
# 预测用户1对所有产品的评分
prediction = []
for i in all_products:
    a = predictRating('user1',i)
    prediction.append([i,a])

In [25]:
# 排列一下
prediction.sort(key=lambda x: x[1],reverse=True)

In [26]:
prediction

[['Visual Studio Code', -0.35749547504777124],
 ['Git', -0.43826036140278235],
 ['Docker', -0.5022289201799928],
 ['.NET Core / .NET 5', -0.5134162228895682],
 ['Neovim', -0.5277036525390558],
 ['ASP.NET Core', -0.5651410661117948],
 ['Python', -0.6175758828930579],
 ['PostgreSQL', -0.6400188885900393],
 ['IntelliJ', -0.64218004096276],
 ['TypeScript', -0.645872754210213],
 ['Vim', -0.6631877697668532],
 ['Rider', -0.6659900318114015],
 ['JavaScript', -0.6825125779383301],
 ['Clojure', -0.6964279214497755],
 ['C#', -0.697910119070307],
 ['React.js', -0.7010166259579661],
 ['Visual Studio', -0.7048208683180195],
 ['NumPy', -0.7108500499242555],
 ['Emacs', -0.7173477371575965],
 ['Pandas', -0.7180276587021757],
 ['AWS', -0.7290812795055218],
 ['SQL', -0.7349365502351376],
 ['IPython/Jupyter', -0.7489457284596933],
 ['HTML/CSS', -0.751913745237541],
 ['Redis', -0.7542616957693993],
 ['Rust', -0.7599665424975555],
 ['Yarn', -0.7642908799925527],
 ['Node.js', -0.7710178695612085],
 ['Notepa

In [27]:
# 写成一个function
def predict_all(user):
    #user = str(user)
    prediction = []
    for i in all_products:
        a = predictRating(user,i)
        prediction.append([i,a])
        prediction.sort(key=lambda x: x[1],reverse=True)
    return prediction

## SVM with surprise

In [28]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import GridSearchCV
from surprise.model_selection import train_test_split
from surprise import accuracy

In [29]:
# split the data into 0.9 and 0.1
train = int(len(productsPerUser)*0.84)
#test = int(len(productsPerUser)*0.84)

In [30]:
# split to train and valid
all_user = [i for i in productsPerUser]
train_user = all_user[:train]
#test_user = all_user[train:test]
valid_user = all_user[train:]

In [31]:
# get the train data
train_data = []
for i in train_user:
    for j in productsPerUser[i]:
        train_data.append((i,j[0],j[1]))

In [32]:
# get the valid data
valid_data = []
for i in valid_user:
    for j in productsPerUser[i]:
        valid_data.append((i,j[0],j[1]))

In [33]:
len(train_data),len(valid_data)

(1675602, 184024)

In [34]:
# transform the data to dataframe, which is easy to handle
train_df = pd.DataFrame(train_data,columns = ['userID', 'itemID', 'rating'])

In [35]:
# scale, from -1 to 1
reader = Reader(rating_scale=(-1,1))

In [82]:
# transform the data so that surprise package can handle the data
data = Dataset.load_from_df(train_df[['userID', 'itemID', 'rating']], reader)

In [37]:
# split the train data to train and test
trainset, testset = train_test_split(data, test_size=.25,random_state=88)

In [88]:
# define the svd function 
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)

# Then compute RMSE
accuracy.rmse(predictions)

RMSE: 0.7242


0.7242416528642747

In [108]:
predictions = algo.test(valid_data)

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=0)

print('average precision@5 is: ', sum(prec for prec in precisions.values()) / len(precisions))
print('average recall@5 is: ',sum(rec for rec in recalls.values()) / len(recalls))
print('average f1 score @5 is ', sum(prec*recall for prec, recall in zip(precisions.values(),recalls.values()))/len(precisions))

average precision@5 is:  0.742517363713597
average recall@5 is:  0.38397436317428796
average f1 score @5 is  0.3274214711095368


In [107]:
sum(prec*recall for prec, recall in zip(precisions.values(),recalls.values()))/len(precisions)

0.3274214711095368

In [73]:
len([i[1] for i in predictions if i[2] > 0 and i[3]>0])/len(testset)

0.19885844149333615

In [77]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k metrics for each user"""

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls

In [84]:
# use kfold with 5 folds and find precision and recall for 5 times
kf = KFold(n_splits=5)
algo = SVD()

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=0)

    # Precision and recall can then be averaged over all users
    print(sum(prec for prec in precisions.values()) / len(precisions))
    print(sum(rec for rec in recalls.values()) / len(recalls))

0.6913043904254834
0.6699878741339812
0.6880389361628823
0.6619463124800661
0.6901556244823995
0.6642843525601966
0.6897264266470409
0.6624363267484099
0.6911000621807948
0.6652445920940238
