In [2]:
import pandas as pd
import numpy as np
from IPython.display import display

#import original data folder
data_folder = "C:/Users/Can/Desktop/dsai/recsys/groupp"

#preparing movies
movies = pd.read_csv(data_folder + "/movie.csv")
movies_df = movies.rename(columns={'movieId': 'movie_id', 'title': 'title', 'genres': 'genres'})
movies_df = movies_df[['movie_id', 'title', 'genres']]
display(movies_df)

#preparing ratings
ratings = pd.read_csv(data_folder + "/rating.csv")
ratings_df = ratings.rename(columns={'userId': 'user_id', 'movieId': 'movie_id', 'rating': 'rating', 'timestamp': 'timestamp'})
ratings_df = ratings_df[['user_id', 'movie_id', 'rating']]
display(ratings_df)

#preparing movies_tags
genome_tags = pd.read_csv(data_folder + "/genome_tags.csv")
genome_tags_df = genome_tags.rename(columns={'tagId': 'tag_id', 'tag': 'tag'})
movies_tags = pd.read_csv(data_folder + "/genome_scores.csv")
movies_tags_df = movies_tags.rename(columns={'movieId': 'movie_id', 'tagId': 'tag_id', 'relevance': 'relevance'})

movies_tags_df.insert(2, "tag", genome_tags_df["tag"])
display(movies_tags_df)


Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


Unnamed: 0,user_id,movie_id,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5
...,...,...,...
20000258,138493,68954,4.5
20000259,138493,69526,4.5
20000260,138493,69644,3.0
20000261,138493,70286,5.0


Unnamed: 0,movie_id,tag_id,tag,relevance
0,1,1,007,0.02500
1,1,2,007 (series),0.02500
2,1,3,18th century,0.05775
3,1,4,1920s,0.09675
4,1,5,1930s,0.14675
...,...,...,...,...
11709763,131170,1124,,0.58775
11709764,131170,1125,,0.01075
11709765,131170,1126,,0.01575
11709766,131170,1127,,0.11450


In [3]:
movies_df = movies_df.drop_duplicates(subset='title', keep='first')

#join ratings to genres
ratings_genres_df = pd.merge(movies_df, ratings_df,  how='inner', left_on=['movie_id'], right_on = ['movie_id'])
ratings_genres_df = ratings_genres_df[['movie_id', 'user_id', 'rating', 'title', 'genres']]

#keep only users and genres
users_genres = ratings_genres_df.loc[:, ['user_id', 'genres']]

#new users only dataframe
users_df = users_genres.drop_duplicates(subset=['user_id'], keep='first')
users_df = users_df['user_id']
users_df = pd.DataFrame(users_df)
users_df = users_df.set_index('user_id')
users_df = users_df.sort_index(ascending=True)

users_genres = users_genres.set_index('user_id')
users_genres = users_genres.sort_values(by="user_id", ascending=True)

display(users_genres)
display(users_df)


Unnamed: 0_level_0,genres
user_id,Unnamed: 1_level_1
1,Horror|Mystery|Thriller
1,Comedy|Fantasy|Romance
1,Horror|Thriller
1,Adventure|Children|Fantasy
1,Drama|Sci-Fi
...,...
138493,Adventure|Animation|Children|Fantasy|Musical
138493,Drama|Romance|War
138493,Action|Adventure
138493,Action|Comedy|Crime|Fantasy


1
2
3
4
5
...
138489
138490
138491
138492
138493


In [4]:
def count_rated_genres(user_id, df):

    users_genre = df.loc[user_id,:]
    rated_genres = set()

    for genre in users_genre.genres:
        rated_genres.update(genre.split('|'))

    return len(rated_genres)

users_rated_genres = []
for user in users_df.index:
     users_rated_genres.append(count_rated_genres(user, users_genres))


In [5]:
users_rated_genres_df = pd.DataFrame({
    'user_id' : users_df.index.tolist(),
    'rated_genres' : users_rated_genres
})

display(users_rated_genres_df)


mean_rated_genres = np.mean(users_rated_genres)
print(mean_rated_genres)
users_rated_genres_df = users_rated_genres_df[users_rated_genres_df['rated_genres'] > mean_rated_genres]

display(users_rated_genres_df)


Unnamed: 0,user_id,rated_genres
0,1,17
1,2,18
2,3,18
3,4,15
4,5,17
...,...,...
138488,138489,17
138489,138490,18
138490,138491,14
138491,138492,17


16.379679839414266


Unnamed: 0,user_id,rated_genres
0,1,17
1,2,18
2,3,18
4,5,17
6,7,18
...,...,...
138486,138487,17
138488,138489,17
138489,138490,18
138491,138492,17


In [6]:
count_ratings = ratings_df.groupby(['user_id']).count()
selected = count_ratings['rating'] > count_ratings['rating'].mean()
expert_users = count_ratings.loc[selected]
expert_users = pd.DataFrame(expert_users)
expert_users = expert_users.reset_index()
expert_users = expert_users.merge(users_rated_genres_df, how='inner', left_on='user_id', right_on='user_id')
expert_users = expert_users.drop(['movie_id'], axis=1)
expert_users.rename(columns={'rating' : 'num_ratings'}, inplace=True)

display(expert_users)


Unnamed: 0,user_id,num_ratings,rated_genres
0,1,175,17
1,3,187,18
2,7,276,18
3,11,504,19
4,14,243,17
...,...,...,...
35895,138483,276,19
35896,138484,148,18
35897,138486,193,18
35898,138490,151,18


In [7]:
#users amount
original_users_amount = 0
for i in range(0, ratings_df.shape[0]):
    if i != 0:
        if ratings_df.user_id[i] != ratings_df.user_id[i-1]:
            original_users_amount += 1
    else:
        original_users_amount += 1
print(original_users_amount)
reduced_users_amount = expert_users.shape[0]
print(reduced_users_amount)

#ratings amount
original_ratings_amount = ratings_df.shape[0]
print(original_ratings_amount)
reduced_ratings_amount = 0
for i in range(0, expert_users.shape[0]):
    reduced_ratings_amount += expert_users.num_ratings[i]
print(reduced_ratings_amount)


138493
35900
20000263
13977680


In [8]:
#given user and movie --> predict rate
#kNN with average grade for each genre given by user

from sklearn.neighbors import KNeighborsRegressor

kNN = KNeighborsRegressor(n_neighbors=5)

#Load Data
data_folder = "C:/Users/Can/Desktop/dsai/recsys/groupp"
average_ratings_per_user_for_each_genre_df = pd.read_csv(data_folder + "/data.csv")
data_df = average_ratings_per_user_for_each_genre_df.fillna(0)
data_df = data_df.drop(8041)
display(data_df)

X = data_df.to_numpy()
Y = ratings_df.to_numpy()

kNN.fit(X, Y)

print(kNN.predict())


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/roman/OneDrive/Bureau/UM/BA/Y3/P1/RS/Group Assignment/archive/data.csv'

In [None]:
##filling in data in group table using prediction:

for i in group_users.index.size:
    
    u_id = group_users.at[index[i],'user']
    u_id = u_id.astype(int)
    curr_user = group_users.loc[group_users['user'] == u_id]

    curr_seen_movies = pd.merge(curr_user, ratings_genres_df, how='left', left_on='user', right_on='user')
    curr_unseen_movies = list(pd.concat([movies_df.item, curr_seen_movies.item]).drop_duplicates(keep=False))

    for col in group_users.columns.size:

        mov_id = group_users.columns[col].astype(int)
        if (mov_id in curr_unseen_movies):
            #predict()
            print("ball")
        else:
            mov_index = curr_seen_movies.loc['item' == mov_id]
            group_users.at['user' == u_id, group_users.columns[col]] = mov_index.at['index' == 0, 'rating']
       

In [None]:
##aggregation strategy for a group(fairness):

agg_group = group_users.join(expert_users.set_index('user'), on = 'user')
agg_group = agg_group.sort_values(by = ['num_ratings'], ascending=False)

rec_list = []
for i in agg_group.index.size:
    temp_mov_id = agg_group.at[index[i],columns[i]]
    rec_list.append(movies_df.at[(temp_mov_id-1),'title'])

print(rec_list)