In [163]:
import re
from collections import defaultdict

import pandas as pd
from surprise import Dataset
from surprise import Reader

In [174]:
class _LazyClassAttribute:
    """Class attribute whose value is computed on first access and then cached.

    This is a non-data descriptor (it defines no ``__set__``), so:
      * ``Owner.attr`` and ``instance.attr`` both trigger the loader once;
      * assigning ``instance.attr = value`` simply shadows it on that instance.

    After the first access the descriptor replaces itself on the owning class
    with the loaded value, so the loader is never invoked a second time.
    """

    def __init__(self, loader):
        # loader: callable taking the owning class and returning the value.
        self._loader = loader
        self._owner = None
        self._name = None

    def __set_name__(self, owner, name):
        # Remember where we live so the loaded value can be cached there.
        self._owner = owner
        self._name = name

    def __get__(self, instance, owner=None):
        value = self._loader(self._owner)
        # Overwrite the descriptor with the real value: later lookups are plain
        # attribute reads and do not re-run the loader.
        setattr(self._owner, self._name, value)
        return value


class MovieLens:
    """Helper around the ratings and movies CSV files of a MovieLens dataset.

    Class attributes:
        ratingsPath / moviesPath: locations of the two CSV files.
        ratingsDF / moviesDF: pandas DataFrames read from those files.
        ratingsData / moviesData: ``.values`` arrays of the two frames. The
            methods below index ratings rows as (user, movie, rating, ...) and
            movies rows as (movie id, title, pipe-separated genres).
        movieIDtoName / nameToMovieID: lookup tables filled by
            ``loadMovieLensLatestSmall``; empty until it has been called.

    The four data attributes are loaded lazily on first access instead of while
    the class body executes, so defining (or re-running the cell that defines)
    the class performs no file I/O.
    """

    ratingsPath = 'Dataset/ratings.csv'
    moviesPath = 'Dataset/movies.csv'

    movieIDtoName = {}
    nameToMovieID = {}

    ratingsDF = _LazyClassAttribute(lambda cls: pd.read_csv(cls.ratingsPath))
    moviesDF = _LazyClassAttribute(lambda cls: pd.read_csv(cls.moviesPath))

    ratingsData = _LazyClassAttribute(lambda cls: cls.ratingsDF.values)
    moviesData = _LazyClassAttribute(lambda cls: cls.moviesDF.values)

    # Four digits immediately before a closing parenthesis at the end of the
    # title (trailing whitespace tolerated), e.g. "Toy Story (1995)".
    _yearSuffix = re.compile(r'(\d{4})\)\s*$')

    def loadMovieLensLatestSmall(self):
        """Load the ratings file with Surprise and rebuild the title lookups.

        Resets ``movieIDtoName`` and ``nameToMovieID`` on this instance, then
        fills them from ``moviesData``.

        Returns:
            The object returned by ``Dataset.load_from_file`` for the ratings
            CSV (comma separated, header line skipped).
        """
        self.movieIDtoName = {}
        self.nameToMovieID = {}

        reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
        ratingsDataset = Dataset.load_from_file(self.ratingsPath, reader=reader)

        for row in self.moviesData:
            movieID = row[0]
            movieName = row[1]
            self.movieIDtoName[movieID] = movieName
            self.nameToMovieID[movieName] = movieID
        return ratingsDataset

    def getUserRatings(self, user):
        """Return every ``(movieID, rating)`` pair rated by ``user``.

        Args:
            user: user id, compared with ``==`` against the first column of
                each ratings row.

        Returns:
            List of ``(movieID, rating)`` tuples in file order; empty when the
            user has no ratings.
        """
        # Scan every row rather than stopping after the first run of matches,
        # so the result does not depend on the file being grouped by user.
        return [(row[1], row[2]) for row in self.ratingsData if row[0] == user]

    def getPopularityRanks(self):
        """Rank movies by how many ratings they received (1 = most rated).

        Returns:
            ``defaultdict(int)`` mapping movieID to its rank. Movies with the
            same count keep the order in which they first appear in the data.
        """
        ratingCounts = defaultdict(int)
        for row in self.ratingsData:
            ratingCounts[row[1]] += 1  # number of ratings given to the movie

        byPopularity = sorted(ratingCounts.items(), key=lambda x: x[1], reverse=True)

        rankings = defaultdict(int)
        for rank, (movieID, _count) in enumerate(byPopularity, start=1):
            rankings[movieID] = rank
        return rankings

    def getGenres(self):
        """Encode each movie's genres as a 0/1 vector.

        Genre names are numbered in order of first appearance in
        ``moviesData``; position ``i`` of a movie's vector is 1 when the movie
        lists the genre numbered ``i``.

        Returns:
            ``defaultdict(list)`` mapping movieID to its list of 0/1 flags; all
            lists have one slot per distinct genre name.
        """
        genreIDs = {}
        encoded = {}

        # Pass 1: assign integer ids to genre names and record them per movie.
        for row in self.moviesData:
            movieID = row[0]
            idsForMovie = []
            for genre in row[2].split('|'):
                if genre not in genreIDs:
                    genreIDs[genre] = len(genreIDs)
                idsForMovie.append(genreIDs[genre])
            encoded[movieID] = idsForMovie

        # Pass 2: the total number of genres is now known, so the id lists can
        # be expanded into fixed-length bitfields usable as vectors.
        genres = defaultdict(list)
        genreCount = len(genreIDs)
        for movieID, idsForMovie in encoded.items():
            bitfield = [0] * genreCount
            for genreID in idsForMovie:
                bitfield[genreID] = 1
            genres[movieID] = bitfield
        return genres

    def getYears(self):
        """Extract the release year from the end of each movie title.

        A year is recognised as four digits directly before a closing
        parenthesis at the end of the title, e.g. ``"Jumanji (1995)"``.
        Titles without such a suffix (or non-string titles) are skipped rather
        than raising.

        Returns:
            ``defaultdict(int)`` mapping movieID to its year.
        """
        years = defaultdict(int)

        for row in self.moviesData:
            movieID = row[0]
            title = row[1]
            if not isinstance(title, str):
                continue  # e.g. a missing title read as NaN
            match = self._yearSuffix.search(title)
            if match:
                years[movieID] = int(match.group(1))
        return years

    def getMovieID(self, movieName):
        """Return the id stored for ``movieName``, or 0 when it is unknown."""
        return self.nameToMovieID.get(movieName, 0)

    def getMovieName(self, movieID):
        """Return the title stored for ``movieID``, or "" when it is unknown."""
        return self.movieIDtoName.get(movieID, "")

In [158]:
# Preview the first rows of the ratings frame.
# NOTE(review): `ml` is not defined in any cell visible here — presumably a
# MovieLens instance created elsewhere; confirm it exists on a fresh kernel.
ml.ratingsDF.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [159]:
# Preview the first rows of the movies frame.
# NOTE(review): `ml` is not defined in any cell visible here — presumably a
# MovieLens instance created elsewhere; confirm it exists on a fresh kernel.
ml.moviesDF.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
