In [7]:
import requests
import random
import pandas as pd
import ast
from pandas_profiling import ProfileReport


In [9]:
def getUser(randID, timeout=30):
    '''
    int -> dict
    Fetch one user's record from the Movie API.

    Parameters
    ----------
    randID : int
        User id; it is converted with str() and appended to the /user/ URL.
    timeout : float or tuple, optional
        Passed to requests.get as its timeout (seconds). Without it a stalled
        connection would block forever, which matters because this function
        is called once per user in a long loop. Defaults to 30.

    Returns
    -------
    The decoded JSON body of the response (the result of response.json()).
    The HTTP status is not checked; whatever JSON the server sends is
    returned as-is.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within `timeout`.
    '''
    url = "http://fall2020-comp598.cs.mcgill.ca:8080/user/" + str(randID)
    response = requests.get(url, timeout=timeout)
    return response.json()

In [10]:
def getMovie(ID, timeout=30):
    '''
    int -> dict
    Fetch one movie's record from the Movie API.

    Parameters
    ----------
    ID : int
        Movie id; it is converted with str() and appended to the /movie/ URL.
    timeout : float or tuple, optional
        Passed to requests.get as its timeout (seconds). Without it a stalled
        connection would block forever, which matters because this function
        is called once per movie in a long loop. Defaults to 30.

    Returns
    -------
    The decoded JSON body of the response (the result of response.json()).
    The HTTP status is not checked; whatever JSON the server sends is
    returned as-is.

    Raises
    ------
    requests.exceptions.Timeout
        If the server does not respond within `timeout`.
    '''
    url = "http://fall2020-comp598.cs.mcgill.ca:8080/movie/" + str(ID)
    response = requests.get(url, timeout=timeout)
    return response.json()

## Get users who have rated a movie (from our ratings dataset)

In [13]:
# Load the Kafka Stream rating events from csv into a dataframe
ratings_df = pd.read_csv("ratings_data.csv")

# Tally how many ratings each userID performed.
# Keys are inserted in first-seen order, one key per unique userID.
user_dict = {}
for rater_id in ratings_df['userID']:
    user_dict[rater_id] = user_dict.get(rater_id, 0) + 1

In [None]:
# Look up the information of every userID seen in the ratings (one getUser call per key)
users_list = [getUser(uid) for uid in user_dict]

# Persist the collected user records as a csv file
users_frame = pd.DataFrame(users_list)
users_frame.to_csv('users_rated.csv')

In [44]:
# Perform dataset analysis on the rated users using Pandas Profiling

df_rated_users = pd.DataFrame(users_list)

# The title must be passed by keyword: ProfileReport's second positional
# parameter is `minimal`, so a positional string is interpreted as a truthy
# `minimal` flag (reduced report) and the title is never set.
profile = ProfileReport(df_rated_users, title="User Reporting")

# Show the report as interactive widgets inside the notebook
profile.to_widgets()

# Also write a standalone HTML copy of the report
profile.to_file("users_report.html")

## Get All Movies
After informal API tests using Postman, it was determined that there were 9019 movies in total.


In [None]:
# Get all movies from the Movie API

# Highest movie id to request (inclusive). Requests start at id 0 so that a
# record's position in `movie_rand` (and its row index in `df`) equals the id
# that was requested.
LAST_MOVIE_ID = 9019

movie_rand = []
for movie_id in range(LAST_MOVIE_ID + 1):
    if movie_id % 500 == 0:
        # lightweight progress indicator for the long-running download
        print(movie_id)
    movie_rand.append(getMovie(movie_id))

df = pd.DataFrame(movie_rand)

# Build a mapping of genre ID -> genre name from every movie's 'genres' entry
genre_dict = dict()
for movie in df['genres']:
    try:
        for genre in movie:
            # keep the first name seen for each genre ID
            genre_dict.setdefault(genre.get('id'), genre.get('name'))
    except TypeError:
        # raised when the 'genres' value of this record cannot be iterated
        print('This record is empty')


# Make a dictionary with every genre ID mapped to 0, to be copied per movie
# and used as a boolean indicator of that movie's genres
# i.e. {16: 1, 28: 0, ...} where the keys are genre IDs
movie_genres = dict.fromkeys(genre_dict, 0)
    
    

In [77]:
# For each movie, copy the all-zero genre dict and flag the genres it lists
# i.e. if df['genres'][i] --> [{'id': 16, 'name': 'animation'}] then the copy's entry 16 for Movie i becomes 1

total_list = []
for movie in df['genres']:
    flags = movie_genres.copy()
    # a float here is a null movie entry: every indicator stays 0
    if type(movie) is not float:
        for genre in movie:
            flags[genre.get('id')] = 1
    total_list.append(flags)


# Make the genre indicators their own dataframe (the movie indices are preserved)
# Retitle the columns by genre name instead of genre ID
# Concatenate it column-wise to the original movie dataframe

genres_df = pd.DataFrame(total_list).rename(columns=genre_dict)
total_movies = pd.concat([df, genres_df], axis=1, sort=False)
total_movies.to_csv('movies_all.csv')

In [78]:
# Load the saved movie table and keep only the 'id' column plus 'Unnamed: 0'
# NOTE(review): 'Unnamed: 0' is presumably the row index written when the csv
# was saved, i.e. the numerical movie ID - confirm against the file.
movie_df = pd.read_csv('movies.csv')
movie_df_temp = movie_df.loc[:, ['id', 'Unnamed: 0']]

## Update the ratings dataset with each movie's numerical ID (for accessing the API)

In [None]:
# Left join: every ratings row is kept and gains the columns of movie_df_temp
# where ratings 'movie' equals movie 'id'
merged_ratings = ratings_df.merge(
    movie_df_temp,
    how='left',
    left_on='movie',
    right_on='id',
)

In [5]:
# Load the user table stored in 'usersA.csv' into a dataframe
u_df=pd.read_csv('usersA.csv')

In [6]:
# Display the first 8958 rows (positional slice)
# NOTE(review): the cutoff 8958 is not explained anywhere in view - confirm its meaning
u_df.iloc[:8958]

Unnamed: 0.1,Unnamed: 0,message,user_id,age,occupation,gender
0,0,user not found,,,,
1,1,,1.0,34.0,sales/marketing,M
2,2,,2.0,33.0,college/grad student,M
3,3,,3.0,29.0,scientist,M
4,4,,4.0,30.0,other or not specified,M
5,5,,5.0,26.0,scientist,M
6,6,,6.0,27.0,college/grad student,F
7,7,,7.0,31.0,sales/marketing,M
8,8,,8.0,30.0,college/grad student,M
9,9,,9.0,24.0,college/grad student,M
