In [1]:
import pandas as pd

In [2]:
movies_df = pd.read_csv('data/movies.csv')
ratings_df = pd.read_csv('data/ratings.csv')
tags_df = pd.read_csv('data/tags.csv')
links_df = pd.read_csv('data/links.csv')

In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [6]:
links_df

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [7]:
ratings_df.describe()

Unnamed: 0,userId,movieId,rating,timestamp
count,100836.0,100836.0,100836.0,100836.0
mean,326.127564,19435.295718,3.501557,1205946000.0
std,182.618491,35530.987199,1.042529,216261000.0
min,1.0,1.0,0.5,828124600.0
25%,177.0,1199.0,3.0,1019124000.0
50%,325.0,2991.0,3.5,1186087000.0
75%,477.0,8122.0,4.0,1435994000.0
max,610.0,193609.0,5.0,1537799000.0


# Popularity Recommender

## "best" books : both high rated and popular

**consider average rating & review count**

### 1. without normalizing data

In [8]:
# 1. without normalizing data

rating_count_df = ratings_df.groupby('movieId')['rating'].agg(['mean', 'count'])

weight_rating_count = 0.25
weight_rating_average = 0.75

rating_count_df['popularity_score'] = (
    weight_rating_count * rating_count_df['mean'] +
    weight_rating_average * rating_count_df['count']
)

popular_books = rating_count_df.sort_values(by='popularity_score', ascending=False).reset_index()

# top 10
popular_books.merge(movies_df[['movieId','title']], how='left').head(10)

Unnamed: 0,movieId,mean,count,popularity_score,title
0,356,4.164134,329,247.791033,Forrest Gump (1994)
1,318,4.429022,317,238.857256,"Shawshank Redemption, The (1994)"
2,296,4.197068,307,231.299267,Pulp Fiction (1994)
3,593,4.16129,279,210.290323,"Silence of the Lambs, The (1991)"
4,2571,4.192446,278,209.548112,"Matrix, The (1999)"
5,260,4.231076,251,189.307769,Star Wars: Episode IV - A New Hope (1977)
6,480,3.75,238,179.4375,Jurassic Park (1993)
7,110,4.031646,237,178.757911,Braveheart (1995)
8,589,3.970982,224,168.992746,Terminator 2: Judgment Day (1991)
9,527,4.225,220,166.05625,Schindler's List (1993)


### 2. normalize observations & give weight to decide imapact of feature

In [9]:
# 2. normalize observations & use weight to decide imapact of feature

test_df = ratings_df.groupby('movieId').agg({'rating':'mean','userId':'count'})

# normalize rating
test_df['rating_normalized'] = (test_df['rating'] - test_df['rating'].min()) / (test_df['rating'].max() - test_df['rating'].min())

# normalize count
test_df['count_normalized'] = (test_df['userId'] - test_df['userId'].min()) / (test_df['userId'].max() - test_df['userId'].min())

# decide weight of feature
rating_weight = 0.75
count_weight = 0.25

# normalized & weighted rating & count
test_df['rating_weighted'] = test_df['rating_normalized'] * rating_weight
test_df['count_weighted'] = test_df['count_normalized'] * count_weight

# final rating - decides popularity
test_df['final_rating'] = test_df['rating_weighted'] + test_df['count_weighted']

# final test_df
test_df = test_df.sort_values('final_rating',ascending=False).reset_index()

# get movie title with highest final rating
top_n = movies_df[['movieId','title']].merge(test_df[['movieId','final_rating']], how='right')

top_n.head(10)

Unnamed: 0,movieId,title,final_rating
0,318,"Shawshank Redemption, The (1994)",0.895691
1,356,Forrest Gump (1994),0.860689
2,296,Pulp Fiction (1994),0.84941
3,2571,"Matrix, The (1999)",0.826536
4,593,"Silence of the Lambs, The (1991)",0.822105
5,260,Star Wars: Episode IV - A New Hope (1977),0.812395
6,2959,Fight Club (1999),0.794219
7,527,Schindler's List (1993),0.787754
8,1196,Star Wars: Episode V - The Empire Strikes Back...,0.779334
9,50,"Usual Suspects, The (1995)",0.777683


### 3. weighted rating

**Weighted Rating (WR) = (v/(v+m)R)+(m/(v+m)C)**

where,
- v is the number of votes for the movie
- m is the minimum votes required to be listed in the chart
- R is the average rating of the movie
- C is the mean vote across the whole

In [10]:
test_df['rating_normalized'].quantile(0.25)

0.5111111111111111

In [11]:
# weighted rating without normalizing data

test_df = ratings_df.groupby('movieId').agg({'rating':'mean','userId':'count'})

# Define the minimum votes required (m)
# Adjust this value based on your preference or dataset characteristics
m = 10

# the mean vote across the whole dataset
C = test_df['rating'].mean()

# Function to calculate weighted rating
def calculate_weighted_rating(row):
    v = row['userId']
    R = row['rating']
    if v >= m:
        weighted_rating = (v / (v + m)) * R + (m / (v + m)) * C
    else:
        weighted_rating = R
    return weighted_rating

# Apply the calculate_weighted_rating function to each row and add the result as a new column
test_df['weighted_rating'] = test_df.apply(calculate_weighted_rating, axis=1)

# Print the DataFrame with the new 'weighted_rating' column
test_df = test_df.sort_values('weighted_rating',ascending=False).reset_index()

# get movie title with highest weighted rating
movies_df[['movieId','title']].merge(test_df[['movieId','weighted_rating']], how='right').head(10)

Unnamed: 0,movieId,title,weighted_rating
0,8738,"Woman Is a Woman, A (femme est une femme, Une)...",5.0
1,3851,I'm the One That I Want (2000),5.0
2,3951,Two Family House (2000),5.0
3,128087,Trinity and Sartana Are Coming (1972),5.0
4,3942,Sorority House Massacre II (1990),5.0
5,3941,Sorority House Massacre (1986),5.0
6,3940,Slumber Party Massacre III (1990),5.0
7,467,Live Nude Girls (1995),5.0
8,3939,Slumber Party Massacre II (1987),5.0
9,27751,'Salem's Lot (2004),5.0


## Function : Top 10 movies popularity based

In [12]:
def top_n_pop_based(n):
    
    # top_n is : normalizes data and different weight(2)
    top_n_movies = top_n.head(n)
    
    return top_n_movies

In [13]:
top_n_pop_based(10)

Unnamed: 0,movieId,title,final_rating
0,318,"Shawshank Redemption, The (1994)",0.895691
1,356,Forrest Gump (1994),0.860689
2,296,Pulp Fiction (1994),0.84941
3,2571,"Matrix, The (1999)",0.826536
4,593,"Silence of the Lambs, The (1991)",0.822105
5,260,Star Wars: Episode IV - A New Hope (1977),0.812395
6,2959,Fight Club (1999),0.794219
7,527,Schindler's List (1993),0.787754
8,1196,Star Wars: Episode V - The Empire Strikes Back...,0.779334
9,50,"Usual Suspects, The (1995)",0.777683
