In [1]:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from sqlalchemy import Table, select
from sqlalchemy.ext.hybrid import hybrid_property
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime

from Models.helper import *

import pandas as pd
import bcrypt
import re

In [2]:
# Flask application object that hosts the SQLAlchemy extension for this notebook.
app = Flask(__name__)

# Database settings: a SQLite file addressed by a relative path, with
# modification tracking switched off.
# NOTE(review): the relative path presumably resolves against the kernel's
# working directory — confirm the notebook is launched from the project root.
db_config = {
    'SQLALCHEMY_DATABASE_URI' : 'sqlite:///Database/doubi_database.db',
    'SQLALCHEMY_TRACK_MODIFICATIONS' : False
}

app.config.update(db_config)
# Bind the ORM to the app; `db` backs every model and session call in this notebook.
db = SQLAlchemy(app)

In [3]:
# Association table for the self-referential "user follows user" relationship.
# follower_id is the user doing the following, followed_id the user being followed.
followers = Table('followers', db.metadata,
    db.Column('followed_id', db.String(32), db.ForeignKey('user.u_id')),
    db.Column('follower_id', db.String(32), db.ForeignKey('user.u_id'))
)

# Association table for the self-referential "user blocks user" relationship.
# blocker_id is the user who blocks, blocked_id the user being blocked.
blocked_users = Table('blocked_users', db.metadata,
    db.Column('blocked_id', db.String(32), db.ForeignKey('user.u_id')),
    db.Column('blocker_id', db.String(32), db.ForeignKey('user.u_id')),
)

# Association table linking users to the films on their wish list.
users_wish_film = Table('users_wish_film', db.metadata,
    db.Column('user_id', db.String(32), db.ForeignKey('user.u_id')),
    db.Column('film_id', db.String(32), db.ForeignKey('film.f_id'))
)

# Standalone table of words with an auto-incrementing integer key.
# NOTE(review): presumably the list used to set Review.bad_word — confirm against
# the code that writes that flag. Unlike the tables above it is built with
# db.Table rather than sqlalchemy.Table.
bad_word = db.Table('bad_word', db.metadata,
    db.Column('w_id', db.Integer, primary_key=True, autoincrement=True),
    db.Column('word', db.String(32), nullable=False))

class User(db.Model):
    """Application account, with social links (follow/block) and review activity.

    The plain-text password is never stored: assigning to ``password`` writes a
    bcrypt hash to ``password_hash``, and reading ``password`` raises.
    """
    __tablename__ = 'user'

    # Primary key; generated by u_id_generator (imported from Models.helper).
    u_id = db.Column(db.String(32), primary_key=True, nullable=False, unique=True, default=u_id_generator)
    username = db.Column(db.String(80), nullable=False, unique=True)
    # bcrypt hash written by the `password` setter.
    password_hash = db.Column(db.Text, nullable=False)
    email = db.Column(db.String(80), nullable=False, unique=True)
    url_photo = db.Column(db.Text, nullable=True)
    is_admin = db.Column(db.Boolean, nullable=False, default=False)
    is_blocked = db.Column(db.Boolean, nullable=False, default=False)

    created_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
    updated_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)

    # Users this user follows; the backref `followers` is the reverse direction.
    followed = db.relationship('User',
                                secondary=followers,
                                primaryjoin=(followers.c.follower_id == u_id),
                                secondaryjoin=(followers.c.followed_id == u_id),
                                backref=db.backref('followers', lazy='dynamic'),
                                lazy='dynamic')

    # Users this user has blocked; the backref `blockers` is the reverse direction.
    blocked = db.relationship('User',
                                secondary=blocked_users,
                                primaryjoin=(blocked_users.c.blocker_id == u_id),
                                secondaryjoin=(blocked_users.c.blocked_id == u_id),
                                backref=db.backref('blockers', lazy='dynamic'),
                                lazy='dynamic')

    reviews = db.relationship('Review', backref='user', lazy='dynamic')

    review_likes = db.relationship('Review_Like', backref='user', lazy='dynamic')
    review_dislikes = db.relationship('Review_Dislike', backref='user', lazy='dynamic')

    # Films on the user's wish list (many-to-many through users_wish_film).
    wish = db.relationship('Film', secondary=users_wish_film, backref='user', lazy='dynamic')

    @property
    def password(self):
        """Write-only attribute: reading it always raises AttributeError."""
        raise AttributeError('password is not a readable attribute')

    @password.setter
    def password(self, password):
        """Hash ``password`` with a fresh bcrypt salt and store the result."""
        salt = bcrypt.gensalt()
        self.password_hash = bcrypt.hashpw(password.encode('utf-8'), salt)

    def verify_password(self, password):
        """Return True when ``password`` matches the stored bcrypt hash.

        bcrypt.checkpw requires bytes for both arguments, but the hash lives in
        a Text column and may therefore be read back as ``str``. A ``str`` hash
        is encoded before comparison instead of raising TypeError.
        """
        stored_hash = self.password_hash
        if isinstance(stored_hash, str):
            stored_hash = stored_hash.encode('utf-8')
        return bcrypt.checkpw(password.encode('utf-8'), stored_hash)

    def __repr__(self):
        return '<User %r>' % self.username


    

class Film(db.Model):
    """A film in the catalogue, with helpers that aggregate its review ratings."""
    __tablename__ = 'film'

    # Primary key; generated by f_id_generator (imported from Models.helper).
    f_id = db.Column(db.String(32), primary_key=True, nullable=False, unique=True, default=f_id_generator)
    title = db.Column(db.String(80), nullable=False)
    # Comma-separated genre names (see `genres`).
    genre = db.Column(db.String(80), nullable=False)
    year = db.Column(db.Integer, nullable=True)
    run_time = db.Column(db.String(16), nullable=True)
    rating_imdb = db.Column(db.Float, nullable=True)
    overview = db.Column(db.String(500), nullable=True)
    director = db.Column(db.String(80), nullable=True)
    # Comma-separated actor names (see `actors`).
    actor = db.Column(db.String(200), nullable=True)
    url_poster = db.Column(db.Text, nullable=True)
    # Stored copy of `rating`; this notebook fills it from the `rating` property.
    rating_doubi = db.Column(db.Float, nullable=True)

    created_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
    updated_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)

    reviews = db.relationship('Review', backref='film', lazy='dynamic')

    @staticmethod
    def _average_rating(reviews):
        """Mean rating of ``reviews`` rounded to one decimal; 0 when there are none."""
        if not reviews:
            return 0
        return round(sum(review.rating for review in reviews) / len(reviews), 1)

    @staticmethod
    def _rating_histogram(reviews):
        """Count ``reviews`` per rating value, with buckets 0..5 always present.

        The original buckets stopped at 4, so the key 5 only appeared when a
        five-star review existed; consumers now always see the same keys.
        """
        histogram = {score: 0 for score in range(0, 6)}
        for review in reviews:
            histogram[review.rating] = histogram.get(review.rating, 0) + 1
        return histogram

    def _reviews_visible_to(self, current_user):
        """Reviews of this film whose author ``current_user`` has not blocked."""
        blocked_ids = {user.u_id for user in current_user.blocked.all()}
        return [review for review in self.reviews.all() if review.u_id not in blocked_ids]

    @hybrid_property
    def rating(self):
        """Average review rating (one decimal), or 0 when the film has no reviews."""
        return self._average_rating(self.reviews.all())

    @rating.expression
    def rating(cls):
        """SQL form of `rating`: a correlated scalar subquery over this film's reviews.

        Uses ``db.func`` because the visible imports do not bring in ``func``, and
        ``scalar_subquery()`` so the result can be used as a column expression
        (ordering, filtering). COALESCE mirrors the 0 returned on the instance
        side for films without reviews; unlike the instance side, the SQL
        average is not rounded.
        """
        return (
            select(db.func.coalesce(db.func.avg(Review.rating), 0))
            .where(Review.f_id == cls.f_id)
            .scalar_subquery()
        )

    @property
    def rating_distribution(self):
        """Number of reviews per rating value, as a dict keyed by rating."""
        return self._rating_histogram(self.reviews.all())

    @property
    def genres(self):
        """Genre names split out of the comma-separated `genre` column."""
        return [genre.strip() for genre in self.genre.split(',')]

    @property
    def actors(self):
        """Actor names split out of the comma-separated `actor` column.

        `actor` is nullable, so a film without actors yields an empty list
        instead of raising AttributeError on ``None.split``.
        """
        if not self.actor:
            return []
        return [actor.strip() for actor in self.actor.split(',')]

    def rating_customized(self, current_user):
        """Average rating that ignores reviews written by users ``current_user`` blocked."""
        return self._average_rating(self._reviews_visible_to(current_user))

    def rating_distribution_customized(self, current_user):
        """Rating histogram that ignores reviews written by users ``current_user`` blocked."""
        return self._rating_histogram(self._reviews_visible_to(current_user))

    def __repr__(self):
        return '<Film %r>' % self.title
    


class Review(db.Model):
    """One user's rating of one film, with optional text content."""
    __tablename__ = 'review'

    # Primary key; generated by r_id_generator (imported from Models.helper).
    r_id = db.Column(db.String(32), primary_key=True, nullable=False, unique=True, default=r_id_generator)
    # Author of the review.
    u_id = db.Column(db.String(32), db.ForeignKey('user.u_id'), nullable=False)
    # Film being reviewed.
    f_id = db.Column(db.String(32), db.ForeignKey('film.f_id'), nullable=False)
    content = db.Column(db.String(500), nullable=True)
    rating = db.Column(db.Integer, nullable=False)
    # NOTE(review): presumably set when the content contains a word from the
    # bad_word table — the code that sets it is not in this notebook; confirm.
    bad_word = db.Column(db.Boolean, nullable=False, default=False)

    created_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)
    updated_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow, onupdate=datetime.utcnow)

    likes = db.relationship('Review_Like', backref='review', lazy='dynamic')
    dislikes = db.relationship('Review_Dislike', backref='review', lazy='dynamic')
    
    @property
    def like(self):
        """Number of Review_Like rows attached to this review (issues a COUNT query)."""
        return self.likes.count()
    
    @property
    def dislike(self):
        """Number of Review_Dislike rows attached to this review (issues a COUNT query)."""
        return self.dislikes.count()


class Review_Like(db.Model):
    """A like given by a user to a review.

    The composite primary key (r_id, u_id) allows at most one like per user
    per review.
    """
    __tablename__ = 'review_like'

    r_id = db.Column(db.String(32), db.ForeignKey('review.r_id'), primary_key=True, nullable=False)
    u_id = db.Column(db.String(32), db.ForeignKey('user.u_id'), primary_key=True, nullable=False)

    created_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)


class Review_Dislike(db.Model):
    """A dislike given by a user to a review.

    The composite primary key (r_id, u_id) allows at most one dislike per user
    per review.
    """
    __tablename__ = 'review_dislike'

    r_id = db.Column(db.String(32), db.ForeignKey('review.r_id'), primary_key=True, nullable=False)
    u_id = db.Column(db.String(32), db.ForeignKey('user.u_id'), primary_key=True, nullable=False)

    created_time = db.Column(db.DateTime, nullable=False, default=datetime.utcnow)

In [4]:
# Words dropped from film overviews before keyword extraction.
# The historical misspellings 'fify' and 'amoungst' are kept so the set stays a
# superset of the previous one; 'fifty' is added because the typo meant it was
# never filtered, unlike 'forty', 'sixty' and 'twenty'.
ENGLISH_STOP_WORDS = set("""
    a about above across after afterwards again against ain all almost alone
    along already also although always am among amongst amoungst amount an and
    another any anyhow anyone anything anyway anywhere are aren around as at
    back be became because become becomes becoming been before beforehand behind
    being below beside besides between beyond bill both bottom but by
    call can cannot cant co con could couldn couldnt cry
    d de describe detail did didn do does doesn doing don done down due during
    each eg eight either eleven else elsewhere empty enough etc even ever every
    everyone everything everywhere except
    few fifteen fifty fify fill find fire first five for former formerly forty
    found four from front full further
    get give go
    had hadn has hasn hasnt have haven having he hence her here hereafter
    hereby herein hereupon hers herself him himself his how however hundred
    i ie if in inc indeed interest into is isn it its itself
    just keep
    last latter latterly least less ll ltd
    m ma made many may me meanwhile might mightn mill mine more moreover most
    mostly move much must mustn my myself
    name namely needn neither never nevertheless next nine no nobody none noone
    nor not nothing now nowhere
    o of off often on once one only onto or other others otherwise our ours
    ourselves out over own
    part per perhaps please put
    rather re
    s same see seem seemed seeming seems serious several shan she should shouldn
    show side since sincere six sixty so some somehow someone something sometime
    sometimes somewhere still such system
    t take ten than that the their theirs them themselves then thence there
    thereafter thereby therefore therein thereupon these they thick thin third
    this those though three through throughout thru thus to together too top
    toward towards twelve twenty two
    un under until up upon us
    ve very via
    was wasn we well were weren what whatever when whence whenever where
    whereafter whereas whereby wherein whereupon wherever whether which while
    whither who whoever whole whom whose why will with within without won
    would wouldn
    y yet you your yours yourself yourselves
""".split())

In [5]:
# Load every film and flatten the text attributes used for content-based
# similarity into one DataFrame row per film.
films = [
    [film.f_id, film.genre, film.director, film.actor, film.title, film.overview]
    for film in Film.query.all()
]
df = pd.DataFrame(data=films, columns=['f_id', 'genre', 'director', 'actor', 'title', 'overview'])

In [6]:
def _extract_keywords(overview):
    """Return the lower-cased, punctuation-free, non-stop-word tokens of an overview.

    A missing overview (the column is nullable) yields an empty list instead of
    making ``re.sub`` raise TypeError.
    """
    if not isinstance(overview, str):
        return []
    # remove punctuation, then lowercase
    cleaned = re.sub(r'[^\w\s]', '', overview).lower()
    # remove stopwords
    return [word for word in cleaned.split() if word not in ENGLISH_STOP_WORDS]


# Assign the whole column at once: writing into the rows yielded by
# DataFrame.iterrows() is not guaranteed to reach the DataFrame.
df['kwd'] = df['overview'].map(_extract_keywords)

df['kwd'].head()

0    [imprisoned, men, bond, number, years, finding...
1    [organized, crime, dynastys, aging, patriarch,...
2    [menace, known, joker, wreaks, havoc, chaos, p...
3    [early, life, career, vito, corleone, 1920s, n...
4    [jury, holdout, attempts, prevent, miscarriage...
Name: kwd, dtype: object

In [7]:
def _split_and_normalize(raw, limit=None):
    """Split a comma-separated string into lower-cased tokens without spaces.

    ``limit`` keeps only the first ``limit`` entries. A value that is already a
    list is normalised again, which makes the cell safe to re-run, and a missing
    value yields an empty list.
    """
    if isinstance(raw, list):
        parts = raw
    elif isinstance(raw, str):
        parts = raw.split(',')
    else:
        parts = []
    return [part.lower().replace(' ', '') for part in parts[:limit]]


# Column-wise assignment instead of writing into iterrows() rows, which is not
# guaranteed to modify the DataFrame.
df['genre'] = df['genre'].map(_split_and_normalize)
# Only the first three actors take part in the similarity computation.
df['actor'] = df['actor'].map(lambda raw: _split_and_normalize(raw, limit=3))
df['director'] = df['director'].map(_split_and_normalize)
df

Unnamed: 0,f_id,genre,director,actor,title,overview,kwd
0,b6c27b9bba493012834fa0f4f64dd519,[drama],[frankdarabont],"[timrobbins, morganfreeman, bobgunton]",The Shawshank Redemption,Two imprisoned men bond over a number of years...,"[imprisoned, men, bond, number, years, finding..."
1,9e0f2212f7b43f0299ed273e076ae572,"[crime, drama]",[francisfordcoppola],"[marlonbrando, alpacino, jamescaan]",The Godfather,An organized crime dynasty's aging patriarch t...,"[organized, crime, dynastys, aging, patriarch,..."
2,1e6393419260322cb338ae629d3c0cb9,"[action, crime, drama]",[christophernolan],"[christianbale, heathledger, aaroneckhart]",The Dark Knight,When the menace known as the Joker wreaks havo...,"[menace, known, joker, wreaks, havoc, chaos, p..."
3,f64262d36cfb3737b2107a6973584c41,"[crime, drama]",[francisfordcoppola],"[alpacino, robertdeniro, robertduvall]",The Godfather: Part II,The early life and career of Vito Corleone in ...,"[early, life, career, vito, corleone, 1920s, n..."
4,d8de113e940e37cc981c514bb53d81e0,"[crime, drama]",[sidneylumet],"[henryfonda, leej.cobb, martinbalsam]",12 Angry Men,A jury holdout attempts to prevent a miscarria...,"[jury, holdout, attempts, prevent, miscarriage..."
...,...,...,...,...,...,...,...
996,39940adf0eb5375b96cb8748cf061c2a,"[drama, western]",[georgestevens],"[elizabethtaylor, rockhudson, jamesdean]",Giant,Sprawling epic covering the life of a Texas ca...,"[sprawling, epic, covering, life, texas, cattl..."
997,d547c00412023be5bef54a6ae1371263,"[drama, romance, war]",[fredzinnemann],"[burtlancaster, montgomeryclift, deborahkerr]",From Here to Eternity,"In Hawaii in 1941, a private is cruelly punish...","[hawaii, 1941, private, cruelly, punished, box..."
998,673913d08a18381ca646656f14c0e84d,"[drama, war]",[alfredhitchcock],"[tallulahbankhead, johnhodiak, walterslezak]",Lifeboat,Several survivors of a torpedoed merchant ship...,"[survivors, torpedoed, merchant, ship, world, ..."
999,9f20fe9de8173834a1e76a114a9e2c26,"[crime, mystery, thriller]",[alfredhitchcock],"[robertdonat, madeleinecarroll, luciemannheim]",The 39 Steps,A man in London tries to help a counter-espion...,"[man, london, tries, help, counterespionage, a..."


In [8]:
# Concatenate the token lists of each film into one space-separated string
# (every token list is followed by a single space, as before).
# The column is assigned in one step because writing into the rows yielded by
# DataFrame.iterrows() is not guaranteed to reach the DataFrame.
columns = ['genre', 'director', 'actor', 'kwd']
df['bag_of_words'] = [
    ''.join(' '.join(tokens) + ' ' for tokens in row_tokens)
    for row_tokens in zip(*(df[col] for col in columns))
]

# Keep only what the vectoriser and the title lookup need.
df = df[['title','bag_of_words']]
df

Unnamed: 0,title,bag_of_words
0,The Shawshank Redemption,drama frankdarabont timrobbins morganfreeman b...
1,The Godfather,crime drama francisfordcoppola marlonbrando al...
2,The Dark Knight,action crime drama christophernolan christianb...
3,The Godfather: Part II,crime drama francisfordcoppola alpacino robert...
4,12 Angry Men,crime drama sidneylumet henryfonda leej.cobb m...
...,...,...
996,Giant,drama western georgestevens elizabethtaylor ro...
997,From Here to Eternity,drama romance war fredzinnemann burtlancaster ...
998,Lifeboat,drama war alfredhitchcock tallulahbankhead joh...
999,The 39 Steps,crime mystery thriller alfredhitchcock robertd...


In [9]:
# Turn each film's bag of words into a term-count vector, then compare every
# pair of films; cosine_sim[i][j] is the similarity between rows i and j of df.
count = CountVectorizer()
count_matrix = count.fit_transform(df['bag_of_words'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print(cosine_sim)


[[1.         0.05564149 0.04950738 ... 0.05057217 0.         0.        ]
 [0.05564149 1.         0.14048787 ... 0.04783649 0.08240856 0.        ]
 [0.04950738 0.14048787 1.         ... 0.04256283 0.03666178 0.        ]
 ...
 [0.05057217 0.04783649 0.04256283 ... 1.         0.03745029 0.        ]
 [0.         0.08240856 0.03666178 ... 0.03745029 1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [10]:
# Title lookup: the integer index of this series is the row/column position of
# the film in cosine_sim.
indices = pd.Series(df['title'])
indices

0       The Shawshank Redemption
1                  The Godfather
2                The Dark Knight
3         The Godfather: Part II
4                   12 Angry Men
                  ...           
996                        Giant
997        From Here to Eternity
998                     Lifeboat
999                 The 39 Steps
1000                          sb
Name: title, Length: 1001, dtype: object

In [11]:
def recommend(title, cosine_sim = cosine_sim, top_n = 10):
    """Return the titles of the films most similar to ``title``.

    :param title: exact title of a film present in ``indices``
    :param cosine_sim: square similarity matrix whose row/column positions
        match the index of ``indices``
    :param top_n: number of recommendations to return (default 10)
    :return: list of up to ``top_n`` titles, most similar first
    :raises IndexError: when ``title`` is not in ``indices``
    """
    matches = indices[indices == title].index
    if len(matches) == 0:
        # Same exception type as the original `.index[0]` failure, with a usable message.
        raise IndexError('no film titled %r in the similarity index' % title)
    idx = matches[0]

    # Drop the film itself explicitly: relying on it being ranked first breaks
    # when another film ties with a similarity of 1.0.
    score_series = pd.Series(cosine_sim[idx]).drop(idx).sort_values(ascending = False)

    # Titles are read from `indices` rather than the global `df`, which a later
    # cell rebinds to the ratings DataFrame.
    return [indices.iloc[i] for i in score_series.index[:top_n]]


In [12]:
# Smoke test: list the films ranked most similar to one known title.
recommend('The Dark Knight Rises')

['Batman Begins',
 'The Incredibles',
 'The Dark Knight',
 'Die Hard: With a Vengeance',
 'Shichinin no samurai',
 'Interstellar',
 'Yip Man',
 'The Blues Brothers',
 'First Blood',
 'Mad Max 2']

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Flatten every stored review into (film, user, rating) rows for collaborative filtering.
reviews = [
    [review.f_id, review.u_id, review.rating]
    for review in Review.query.all()
]

df = pd.DataFrame(data=reviews, columns=['f_id', 'u_id', 'rating'])

In [15]:
# User x film rating matrix; cells are NaN where a user has not rated a film.
matrix = df.pivot_table(index='u_id', columns='f_id', values='rating')

In [16]:
# Pairwise correlation between users (DataFrame.corr works column-wise, hence
# the transpose). Pairs without enough co-rated films come out as NaN.
user_similarity = matrix.T.corr()
user_similarity

u_id,008ef55e4cbb3347a7816d9145011d53,008fdf556dc23c1aa9e8ad5c2a6c60db,00cca7767cba3ae3a797f46eb1ff4140,00ccd6470c0f3f8d96f3f554401407a7,00e85ef45c9e32169a16e576af71020d,00ead050a7e13c1d8de6a65ba316a7d5,0103f36967bc3656be5d8e59c5917945,01390892caaa311f8b5e071b3dd4bd5a,01bd4421f5d536f997d46f6ea3fe9b92,025bc11a2c64356bba7f72344f539d57,...,fd44142ea86138f29ed3fd545775a480,fd7616a0f81c3cd1a8da2c065a42bf3f,fda7f019076438978ee7e751eab5b2d3,fe0bfde5fecd34dea20e249e7316a9af,fe2a2f541b80370a9fe1b2d9eaa61e3e,feff35e6bd5c32fe8d6fd5b8c253442d,ff44a07034fa32399278d051ca2dafa4,ff6eed84dfab39109d36ef7e0893d552,ff95195eb23c3d43b06d165876e2a99d,ffe1a4d1d0023589b14424541fe65a91
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
008ef55e4cbb3347a7816d9145011d53,1.0,,,,,,,,,,...,,,,,,,,,,
008fdf556dc23c1aa9e8ad5c2a6c60db,,1.0,,,,,,,,,...,,,,,,,,,,
00cca7767cba3ae3a797f46eb1ff4140,,,1.0,,,,,,,,...,,,,,,,,,,
00ccd6470c0f3f8d96f3f554401407a7,,,,1.0,,,,,,,...,,,,,,,,,,
00e85ef45c9e32169a16e576af71020d,,,,,1.000000,,,,,,...,,,,,1.0,-0.891042,,-0.296432,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
feff35e6bd5c32fe8d6fd5b8c253442d,,,,,-0.891042,,,,,,...,,,,,-1.0,1.000000,,1.000000,,
ff44a07034fa32399278d051ca2dafa4,,,,,,,,,,,...,,,,,,,1.0,,,
ff6eed84dfab39109d36ef7e0893d552,,,,,-0.296432,,,,,,...,,,,,-0.5,1.000000,,1.000000,,
ff95195eb23c3d43b06d165876e2a99d,,,,,,,,,,,...,,,,,,,,,1.0,


In [17]:
def predict(uid, iid, ratings_matrix, user_similar):
    '''
    Predict the rating a user would give an item with user-based collaborative
    filtering: the similarity-weighted mean of the ratings given to the item by
    the user's positively correlated neighbours.

    :param uid: user id (a label of ``user_similar`` and of the rows of ``ratings_matrix``)
    :param iid: item id (a column label of ``ratings_matrix``)
    :param ratings_matrix: user x item rating matrix, NaN where unrated
    :param user_similar: user x user similarity matrix
    :return: predicted rating rounded to two decimals
    :raises Exception: when the user has no positively correlated neighbour
    :raises ZeroDivisionError: when no such neighbour has rated the item
    '''
    print("开始预测用户<%s>对电影<%s>的评分..."%(uid, iid))
    # 1. Neighbours of uid: everyone else with a defined, positive similarity.
    similar_users = user_similar[uid].drop([uid]).dropna()
    similar_users = similar_users[similar_users > 0]
    if similar_users.empty:
        raise Exception("用户<%s>没有相似的用户" % uid)

    # 2. Keep only the neighbours that actually rated iid.
    item_ratings = ratings_matrix[iid].dropna()
    neighbor_ids = item_ratings.index.intersection(similar_users.index)
    if neighbor_ids.empty:
        # The original fell through to an opaque 0/0 here; the exception type is
        # kept so existing handlers still work, but the message now says why.
        raise ZeroDivisionError("用户<%s>的相似用户均未对电影<%s>评分" % (uid, iid))

    # 3. Similarity-weighted mean of the neighbours' ratings. Computed on the
    #    aligned Series instead of Series.iteritems(), which pandas 2.0 removed.
    weights = similar_users.loc[neighbor_ids]
    neighbor_ratings = item_ratings.loc[neighbor_ids]
    predict_rating = (weights * neighbor_ratings).sum() / weights.sum()

    print("预测出用户<%s>对电影<%s>的评分：%0.2f" % (uid, iid, predict_rating))
    return round(predict_rating, 2)

In [18]:
# Load the two MovieLens CSV exports from the current working directory.
movielens_rating = pd.read_csv('ratings.csv')
# Use movielens_rating.head() to inspect the ratings frame.
movielens_movie = pd.read_csv('movies.csv')

In [19]:
# Lookup from film title to the application's film id.
films = dict((film.title, film.f_id) for film in Film.query.all())

In [20]:
# Keep only the MovieLens movies whose title also exists in the application
# catalogue. A MovieLens title is cut at its first '(' (which drops the
# trailing year) and stripped of surrounding whitespace before matching.
# Built in one pass instead of appending to a DataFrame row by row and writing
# into iterrows() rows.
clean_titles = movielens_movie['title'].map(lambda raw: raw.split('(')[0].strip())
in_catalogue = clean_titles.isin(list(films))

new_film = pd.DataFrame({
    'movieId': movielens_movie.loc[in_catalogue, 'movieId'],
    'title': clean_titles[in_catalogue],
}).reset_index(drop=True)

# Kept for compatibility with the original cell: number of matched movies.
new_film_index = len(new_film)

In [21]:
# Renumber the matched movies 0..n-1 (in place) and display them.
new_film.reset_index(drop=True, inplace=True)
new_film

Unnamed: 0,movieId,title
0,1,Toy Story
1,6,Heat
2,7,Sabrina
3,16,Casino
4,17,Sense and Sensibility
...,...,...
565,180263,The Shining
566,183897,Isle of Dogs
567,187541,Incredibles 2
568,187593,Deadpool 2


In [53]:
# Keep only the MovieLens ratings that refer to a matched movie.
# A single vectorised filter replaces the row-by-row .loc append, which
# re-scanned new_film['movieId'] for every rating row.
new_reviews = (
    movielens_rating
    .loc[
        movielens_rating['movieId'].isin(new_film['movieId']),
        ['userId', 'movieId', 'rating', 'timestamp'],
    ]
    .reset_index(drop=True)
)
# Kept for compatibility with the original cell: number of retained ratings.
new_reviews_index = len(new_reviews)

new_reviews.shape

(23932, 4)

In [54]:
# Fix the column dtypes in one step, then turn the epoch-second timestamps into datetimes.
new_reviews = new_reviews.astype({
    'userId': int,
    'movieId': int,
    'rating': float,
    'timestamp': int,
})
new_reviews['timestamp'] = pd.to_datetime(new_reviews['timestamp'], unit='s')

In [55]:
# User x movie rating matrix for the MovieLens subset, and the pairwise user
# correlation derived from it. This rebinds `user_similarity`, replacing the
# matrix computed earlier from the application's own reviews.
user_matrix = new_reviews.pivot_table(index='userId', columns='movieId', values='rating')
user_similarity = user_matrix.T.corr()
user_similarity

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,,,0.284379,-0.433861,1.582806e-17,-0.390322,3.208445e-01,1.000000,0.086258,...,4.550560e-16,-0.127710,-0.181531,-0.438529,0.821460,-0.059042,0.115642,0.136484,-0.447214,-0.045030
2,,1.000000,,,,,,,,1.000000,...,-3.818813e-01,,-1.000000,,,0.500000,,1.000000,,0.843387
3,,,,,,,,,,,...,,,,,,,,,,
4,0.284379,,,1.000000,-0.534522,-8.420325e-03,0.284747,-1.000000e+00,,0.468165,...,-3.012440e-01,0.243187,0.001489,-0.790569,0.885454,0.040058,-0.238310,-0.368443,,-0.476075
5,-0.433861,,,-0.534522,1.000000,1.294831e-01,0.033408,5.345225e-01,,-1.000000,...,1.000000e+00,-0.275487,-0.128990,0.462910,-0.579365,0.661477,0.160422,0.471477,0.866025,-0.382546
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-0.059042,0.500000,,0.040058,0.661477,-4.839440e-01,-0.114694,3.680497e-01,-0.200446,-0.471893,...,2.421994e-01,0.191943,0.181444,0.804658,-0.196475,1.000000,-0.328595,0.050533,0.804030,-0.011783
607,0.115642,,,-0.238310,0.160422,2.617917e-01,0.299803,-1.316563e-16,,1.000000,...,9.799579e-01,-0.172459,-0.004059,-0.845154,0.154091,-0.328595,1.000000,-0.061432,-0.759257,0.008402
608,0.136484,1.000000,,-0.368443,0.471477,-4.628529e-01,-0.181757,2.345208e-01,-0.121867,-0.682434,...,4.168637e-01,0.441942,0.136525,0.519896,-0.366619,0.050533,-0.061432,1.000000,0.429700,0.039114
609,-0.447214,,,,0.866025,-1.099853e-01,0.377415,2.425356e-01,,,...,,-0.495074,0.430820,1.000000,-0.605058,0.804030,-0.759257,0.429700,1.000000,-0.564076


In [56]:
# Smoke test: predict the rating of MovieLens user 1 for movie 161582.
predict(1.0,161582.0,user_matrix,user_similarity)

开始预测用户<1.0>对电影<161582.0>的评分...
预测出用户<1.0>对电影<161582.0>的评分：3.81


3.81

In [57]:
# Split positionally: the first 5450 rows, then everything after them.
new_reviews_1, new_reviews_2 = new_reviews.iloc[:5450], new_reviews.iloc[5450:]
print(new_reviews_1.shape, new_reviews_2.shape, sep='\n')

(5450, 4)
(18482, 4)


In [58]:

# Pair every MovieLens user id with an application user id, in the order both
# lists are returned.
# NOTE(review): User.query.all() has no ORDER BY, so the pairing presumably
# depends on the database's row order — confirm this is acceptable.
users = User.query.all()
users_u_id = [user.u_id for user in users]
userId = new_reviews['userId'].unique().tolist()

# Fail with an explicit message instead of a bare IndexError from positional
# indexing when there are fewer application users than MovieLens users.
if len(users_u_id) < len(userId):
    raise ValueError(
        'need %d application users to cover the MovieLens user ids, found %d'
        % (len(userId), len(users_u_id))
    )

user_dict = dict(zip(userId, users_u_id))

user_dict

{1: 'b1392eecf57e302db360584c016212b9',
 2: 'f2627c6a94923c689b682f5c44ab2556',
 3: 'c5c1541f2ee83dbbaa4564718db20154',
 4: '2b24e2f3a6ca385f891c0f5eab22bc2f',
 5: '66be133c88b230bdb2f5c3b643a5c6b2',
 6: 'c9ab8426562b31acbcae35091897450b',
 7: '27081a46ad5d3a6d9d858b331a3d88a0',
 8: '11adf0b0d2f93f2f96a4bf871d37bf99',
 9: '7c2c5911f6e835489af17fdb0d7dfe89',
 10: 'e10ebae6655730fb85125c0a255b924c',
 11: 'c17cc931bd64374facf7d8c60aea69a9',
 12: '4fe4249c19093696a0e1caf52f5e474d',
 13: 'b4b2f871f4ca3cc8b3ce307b858a9581',
 14: '853d689969fe3c218954fc5dd3528ce6',
 15: '34a9f61585fe3d939720c79610cb6ea9',
 16: '45b9dc58c5583e1aad611ecebe691329',
 17: 'f7988376a1d6359eacf2ebf68d7f4363',
 18: 'eb4813e8c6463ddd9835dd5ea1acadb5',
 19: '4f3712d6f5eb3ea8abc3dee3c2fc1cec',
 20: 'ba993ca7a7df3b5ab391833ff5d97f0e',
 21: '545a637af87a302a95f2a84b38d1620f',
 22: '0b3ed35ae7993b4d82214db47014d695',
 23: 'f1ed995ae967307cb8b674f99eee0b02',
 24: 'd2ab51da25f036149fd76b744dacab0f',
 25: 'bb40bc6e440e3b4896a

In [59]:
# Replace MovieLens user ids with application user ids.
# Not re-runnable: on a second run the already-mapped ids are no longer keys of
# user_dict, and Series.map turns unmatched values into NaN.
new_reviews['userId'] = new_reviews['userId'].map(user_dict)
new_reviews

Unnamed: 0,userId,movieId,rating,timestamp
0,b1392eecf57e302db360584c016212b9,1,4.0,2000-07-30 18:45:03
1,b1392eecf57e302db360584c016212b9,6,4.0,2000-07-30 18:37:04
2,b1392eecf57e302db360584c016212b9,110,4.0,2000-07-30 18:36:16
3,b1392eecf57e302db360584c016212b9,223,3.0,2000-07-30 18:16:25
4,b1392eecf57e302db360584c016212b9,235,4.0,2000-07-30 18:15:08
...,...,...,...,...
23927,c0d53dfed6923bb48b9f6ca55e39b161,161582,4.0,2017-05-03 21:42:39
23928,c0d53dfed6923bb48b9f6ca55e39b161,162350,3.5,2017-05-03 22:19:31
23929,c0d53dfed6923bb48b9f6ca55e39b161,164179,5.0,2017-05-03 21:07:11
23930,c0d53dfed6923bb48b9f6ca55e39b161,168250,5.0,2017-05-08 19:50:47


In [60]:
# Replace the MovieLens movie id with the application's film id, going through
# the cleaned title: the merge adds `title`, then the title is looked up in the
# title -> f_id mapping.
# NOTE(review): titles were matched after dropping the year, so distinct
# MovieLens movies that share a title presumably map to the same film — confirm.
new_reviews = new_reviews[['userId', 'movieId', 'rating', 'timestamp']]
new_reviews = new_reviews.merge(new_film, on='movieId', how='left')
films = Film.query.all()
films = {film.title: film.f_id for film in films}
new_reviews['movieId'] = new_reviews['title'].map(films)
new_reviews

Unnamed: 0,userId,movieId,rating,timestamp,title
0,b1392eecf57e302db360584c016212b9,1adcd4d261d139a488eef8bdf5f0046c,4.0,2000-07-30 18:45:03,Toy Story
1,b1392eecf57e302db360584c016212b9,f252741f1eb7331686250852d4e7c382,4.0,2000-07-30 18:37:04,Heat
2,b1392eecf57e302db360584c016212b9,c158939729143e459232fb8b2c229a91,4.0,2000-07-30 18:36:16,Braveheart
3,b1392eecf57e302db360584c016212b9,6945fbf202c3374c98f75fb5af1bac1b,3.0,2000-07-30 18:16:25,Clerks
4,b1392eecf57e302db360584c016212b9,d872996fa8b932bbbde5f502eb64e7de,4.0,2000-07-30 18:15:08,Ed Wood
...,...,...,...,...,...
23927,c0d53dfed6923bb48b9f6ca55e39b161,983eed6dd55d34a3acf265016c9f8df2,4.0,2017-05-03 21:42:39,Hell or High Water
23928,c0d53dfed6923bb48b9f6ca55e39b161,0a38bb8cf26a3589a8e4cd1ee7de8dfa,3.5,2017-05-03 22:19:31,The Magnificent Seven
23929,c0d53dfed6923bb48b9f6ca55e39b161,a32ec4dbc21e3ca58906eb1c6dbe7613,5.0,2017-05-03 21:07:11,Arrival
23930,c0d53dfed6923bb48b9f6ca55e39b161,02b7f1d860b33123b8e10eabfcc1ea73,5.0,2017-05-08 19:50:47,Get Out


In [61]:
# Cast ratings to integers. astype(int) truncates, so half-star ratings lose
# their fraction (3.5 becomes 3 in the displayed frame). Then drop the helper
# `title` column.
new_reviews['rating'] = new_reviews['rating'].astype(int)
new_reviews = new_reviews[['userId', 'movieId', 'rating', 'timestamp']]
new_reviews

Unnamed: 0,userId,movieId,rating,timestamp
0,b1392eecf57e302db360584c016212b9,1adcd4d261d139a488eef8bdf5f0046c,4,2000-07-30 18:45:03
1,b1392eecf57e302db360584c016212b9,f252741f1eb7331686250852d4e7c382,4,2000-07-30 18:37:04
2,b1392eecf57e302db360584c016212b9,c158939729143e459232fb8b2c229a91,4,2000-07-30 18:36:16
3,b1392eecf57e302db360584c016212b9,6945fbf202c3374c98f75fb5af1bac1b,3,2000-07-30 18:16:25
4,b1392eecf57e302db360584c016212b9,d872996fa8b932bbbde5f502eb64e7de,4,2000-07-30 18:15:08
...,...,...,...,...
23927,c0d53dfed6923bb48b9f6ca55e39b161,983eed6dd55d34a3acf265016c9f8df2,4,2017-05-03 21:42:39
23928,c0d53dfed6923bb48b9f6ca55e39b161,0a38bb8cf26a3589a8e4cd1ee7de8dfa,3,2017-05-03 22:19:31
23929,c0d53dfed6923bb48b9f6ca55e39b161,a32ec4dbc21e3ca58906eb1c6dbe7613,5,2017-05-03 21:07:11
23930,c0d53dfed6923bb48b9f6ca55e39b161,02b7f1d860b33123b8e10eabfcc1ea73,5,2017-05-08 19:50:47


In [62]:
# Split the id-mapped ratings positionally: the first 5450 rows overwrite
# existing reviews, and the remaining rows are inserted as new reviews.
new_reviews_1, new_reviews_2 = new_reviews.iloc[:5450], new_reviews.iloc[5450:]
print(new_reviews_1.shape, new_reviews_2.shape, sep='\n')

(5450, 4)
(18482, 4)


In [42]:
# Overwrite the author, film and rating of the existing reviews with the first
# slice of imported MovieLens ratings, pairing them positionally.
reviews = Review.query.all()

# Fail before touching anything when there are more stored reviews than
# imported rows; the original raised a lookup error part-way through the loop.
if len(reviews) > len(new_reviews_1):
    raise ValueError(
        'found %d stored reviews but only %d imported rows to assign'
        % (len(reviews), len(new_reviews_1))
    )

for index, (review, row) in enumerate(zip(reviews, new_reviews_1.itertuples(index=False))):
    # Report progress every 500 rows instead of printing one line per review.
    if index % 500 == 0:
        print("dealing with review %d/5450" % index)
    review.u_id = row.userId
    review.f_id = row.movieId
    # Store a plain Python int: the value read from the frame is a numpy
    # integer, which is not a Python int.
    # NOTE(review): this is presumably why a later cell rewrites every rating
    # through int() — confirm.
    review.rating = int(row.rating)
db.session.commit()

dealing with review 0/5450
dealing with review 1/5450
dealing with review 2/5450
dealing with review 3/5450
dealing with review 4/5450
dealing with review 5/5450
dealing with review 6/5450
dealing with review 7/5450
dealing with review 8/5450
dealing with review 9/5450
dealing with review 10/5450
dealing with review 11/5450
dealing with review 12/5450
dealing with review 13/5450
dealing with review 14/5450
dealing with review 15/5450
dealing with review 16/5450
dealing with review 17/5450
dealing with review 18/5450
dealing with review 19/5450
dealing with review 20/5450
dealing with review 21/5450
dealing with review 22/5450
dealing with review 23/5450
dealing with review 24/5450
dealing with review 25/5450
dealing with review 26/5450
dealing with review 27/5450
dealing with review 28/5450
dealing with review 29/5450
dealing with review 30/5450
dealing with review 31/5450
dealing with review 32/5450
dealing with review 33/5450
dealing with review 34/5450
dealing with review 35/5450
de

In [67]:
# Discard any pending or failed transaction, then collect the text of every
# stored review for reuse as filler content.
db.session.rollback()
reviews = Review.query.all()
contents = [review.content for review in reviews]

In [69]:
# Insert the remaining MovieLens ratings as new reviews. Each one gets the text
# of a randomly chosen existing review as filler content and the original
# MovieLens timestamp as its creation time.
# NOTE(review): no random seed is set, so the content assignment differs between
# runs; re-running the cell inserts the same ratings again.
import random
db.session.rollback()
for index, row in new_reviews_2.iterrows():
    print("dealing with review %d/23931" % index)
    review = Review(
        u_id=row['userId'],
        f_id=row['movieId'],
        content=random.choice(contents),
        rating=row['rating'],
        created_time=row['timestamp']
    )
    db.session.add(review)
db.session.commit()

dealing with review 5450/23931
dealing with review 5451/23931
dealing with review 5452/23931
dealing with review 5453/23931
dealing with review 5454/23931
dealing with review 5455/23931
dealing with review 5456/23931
dealing with review 5457/23931
dealing with review 5458/23931
dealing with review 5459/23931
dealing with review 5460/23931
dealing with review 5461/23931
dealing with review 5462/23931
dealing with review 5463/23931
dealing with review 5464/23931
dealing with review 5465/23931
dealing with review 5466/23931
dealing with review 5467/23931
dealing with review 5468/23931
dealing with review 5469/23931
dealing with review 5470/23931
dealing with review 5471/23931
dealing with review 5472/23931
dealing with review 5473/23931
dealing with review 5474/23931
dealing with review 5475/23931
dealing with review 5476/23931
dealing with review 5477/23931
dealing with review 5478/23931
dealing with review 5479/23931
dealing with review 5480/23931
dealing with review 5481/23931
dealing 

In [81]:
# Rewrite every stored rating as a plain Python int taken from new_reviews.
# NOTE(review): this pairs the i-th row returned by Review.query.all() with the
# i-th row of new_reviews; the query has no ORDER BY, so this presumably relies
# on insertion order — confirm.
reviews = Review.query.all()
for index, review in enumerate(reviews):
    review.rating = int(new_reviews['rating'][index])
db.session.commit()

In [82]:
# Store each film's current average review rating in rating_doubi.
# Film.rating loads the film's reviews, so this runs one reviews query per film.
films = Film.query.all()
for film in films:
    film.rating_doubi = film.rating
db.session.commit()