# Date Night Movie

### Performing analysis on movie data, picking a date night movie depending on the user choices

In [1]:
import os
import pandas as pd

In [2]:
def get_movie_data():
    
    unames = ['user_id','gender','age','occupation','zip']
    users = pd.read_table(os.path.join('../data/movie','users.dat'), encoding="unicode_escape",
                          sep='::', header=None, names=unames)
    
    rnames = ['user_id', 'movie_id', 'rating', 'timestamp']
    ratings = pd.read_table(os.path.join('../data/movie', 'ratings.dat'), encoding="unicode_escape",
                            sep='::', header=None, names=rnames)
    
    mnames = ['movie_id', 'title','genres']
    movies = pd.read_table(os.path.join('../data/movie', 'movies.dat'), encoding="unicode_escape",
                           sep='::', header=None, names=mnames)

    return users, ratings, movies

In [3]:
users, ratings, movies = get_movie_data()

  return func(*args, **kwargs)


In [4]:
print (users.head())

   user_id gender  age  occupation    zip
0        1      F    1          10  48067
1        2      M   56          16  70072
2        3      M   25          15  55117
3        4      M   45           7  02460
4        5      M   25          20  55455


In [5]:
print (ratings.head())

   user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291


In [6]:
print (movies.head())

   movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy


##### Clean up the `movies`

- Get the `year`
- Shorten the `title`


In [7]:
tmp = movies.title.str.extract('(.*) \(([0-9]+)\)')
tmp.apply(lambda x:x[0] if len(x) > 0 else None)
tmp.apply(lambda x: x[0][:40] if len(x) > 0 else None)

0    Toy Story
1         1995
dtype: object

In [8]:
movies['year'] = tmp[1]
movies['short_title'] = tmp[0]

In [9]:
print (movies.head())

   movie_id                               title                        genres  \
0         1                    Toy Story (1995)   Animation|Children's|Comedy   
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2         3             Grumpier Old Men (1995)                Comedy|Romance   
3         4            Waiting to Exhale (1995)                  Comedy|Drama   
4         5  Father of the Bride Part II (1995)                        Comedy   

   year                  short_title  
0  1995                    Toy Story  
1  1995                      Jumanji  
2  1995             Grumpier Old Men  
3  1995            Waiting to Exhale  
4  1995  Father of the Bride Part II  


In [10]:
#merging all the three datasets (ratings, users and movies) into one dataframe.
r_u= pd.merge(ratings,users, on='user_id')
r_u_m=pd.merge(r_u,movies,on='movie_id')
r_u_m

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
1,2,1193,5,978298413,M,56,16,70072,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
4,17,1193,5,978158471,M,50,1,95350,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
...,...,...,...,...,...,...,...,...,...,...,...,...
1000204,5949,2198,5,958846401,M,18,17,47901,Modulations (1998),Documentary,1998,Modulations
1000205,5675,2703,3,976029116,M,35,14,30030,Broken Vessels (1998),Drama,1998,Broken Vessels
1000206,5780,2845,1,958153068,M,18,17,92886,White Boys (1999),Drama,1999,White Boys
1000207,5851,3607,5,957756608,F,18,20,55410,One Little Indian (1973),Comedy|Drama|Western,1973,One Little Indian


In [11]:
r_u_m['movie_id'].value_counts()

2858    3428
260     2991
1196    2990
1210    2883
480     2672
        ... 
3458       1
2226       1
1815       1
398        1
2909       1
Name: movie_id, Length: 3706, dtype: int64

In [12]:
#pd.set_option('max_rows', None)

In [13]:
avg_rating=r_u_m.groupby(['short_title','movie_id']).mean().sort_values(by='rating',ascending = False)
avg_rating

Unnamed: 0_level_0,Unnamed: 1_level_0,user_id,rating,timestamp,age,occupation
short_title,movie_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ulysses (Ulisse),3172,3485.0,5.0,967060437.0,25.0,0.0
Schlafes Bruder (Brother of Sleep),989,1915.0,5.0,974693867.0,50.0,13.0
Smashing Time,3233,3733.0,5.0,966424573.5,47.5,3.5
One Little Indian,3607,5851.0,5.0,957756608.0,18.0,20.0
"Baby, The",3280,46.0,5.0,977979487.0,18.0,19.0
...,...,...,...,...,...,...
"Fantastic Night, The (La Nuit Fantastique)",3376,3610.0,1.0,974825679.0,18.0,6.0
Cheetah,2039,2106.0,1.0,974756684.0,18.0,20.0
Torso (Corpi Presentano Tracce di Violenza Carnale),3493,3022.5,1.0,969641428.0,9.5,7.0
Mutters Courage,655,2315.0,1.0,974478452.0,56.0,7.0


In [14]:
r_u_m['rating'].value_counts()

4    348971
3    261197
5    226310
2    107557
1     56174
Name: rating, dtype: int64

In [15]:
import numpy as np

### With the help of numpy and aggregate function of groupby in pandas, calculating the mean rating and the size of the rating in the dataset.

In [16]:
movie_stats = r_u_m.groupby('title').agg({'rating': [np.size, np.mean]})
movie_stats

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"$1,000,000 Duck (1971)",37,3.027027
'Night Mother (1986),70,3.371429
'Til There Was You (1997),52,2.692308
"'burbs, The (1989)",303,2.910891
...And Justice for All (1979),199,3.713568
...,...,...
"Zed & Two Noughts, A (1985)",29,3.413793
Zero Effect (1998),301,3.750831
Zero Kelvin (Kjærlighetens kjøtere) (1995),2,3.500000
Zeus and Roxanne (1997),23,2.521739


In [17]:
data_sort=movie_stats.sort_values(by=('rating','mean'),ascending = False)
data_sort

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Ulysses (Ulisse) (1954),1,5.0
Lured (1947),1,5.0
Follow the Bitch (1998),1,5.0
Bittersweet Motel (2000),1,5.0
Song of Freedom (1936),1,5.0
...,...,...
"Fantastic Night, The (La Nuit Fantastique) (1949)",1,1.0
Cheetah (1989),1,1.0
Torso (Corpi Presentano Tracce di Violenza Carnale) (1973),2,1.0
Mutters Courage (1995),1,1.0


In [18]:
# cheching any null values are present or not
data_sort.isnull().values.any()

False

In [19]:
data_sort.isnull().sum()

rating  size    0
        mean    0
dtype: int64

### From the above data it is clear that there are 10 movies with highest rating for a movie i.e., 5 . But the size of rating for all these movies are very less (less than 4 ratings). So lets give a minimum number of ratings as 500 to the above list.

In [20]:
rat_500=data_sort.loc[data_sort[('rating','size')]>=500]
rat_500

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.560510
"Shawshank Redemption, The (1994)",2227,4.554558
"Godfather, The (1972)",2223,4.524966
"Close Shave, A (1995)",657,4.520548
"Usual Suspects, The (1995)",1783,4.517106
...,...,...
Superman III (1983),511,2.336595
Judge Dredd (1995),564,2.308511
Batman & Robin (1997),606,2.257426
Congo (1995),565,2.238938


In [21]:
rat_500.head(1)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954),628,4.56051


### Recommended movie with highest rating is **'Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)'**

In [22]:
#extracting the Unique genres in the data
r_u_m.genres.unique()

array(['Drama', "Animation|Children's|Musical", 'Musical|Romance',
       "Animation|Children's|Comedy", 'Action|Adventure|Comedy|Romance',
       'Action|Adventure|Drama', 'Comedy|Drama',
       "Adventure|Children's|Drama|Musical", 'Musical', 'Comedy',
       "Animation|Children's", 'Comedy|Fantasy', 'Animation',
       'Comedy|Sci-Fi', 'Drama|War', 'Romance',
       "Animation|Children's|Musical|Romance",
       "Children's|Drama|Fantasy|Sci-Fi", 'Drama|Romance',
       'Animation|Comedy|Thriller',
       "Adventure|Animation|Children's|Comedy|Musical",
       "Animation|Children's|Comedy|Musical", 'Thriller',
       'Action|Crime|Romance', 'Action|Adventure|Fantasy|Sci-Fi',
       "Children's|Comedy|Musical", 'Action|Drama|War',
       "Children's|Drama", 'Crime|Drama|Thriller', 'Action|Crime|Drama',
       'Action|Adventure|Mystery', 'Crime|Drama',
       'Action|Adventure|Sci-Fi|Thriller',
       'Action|Adventure|Romance|Sci-Fi|War', 'Action|Thriller',
       'Action|Drama', 'Co

## Filtering out the ratings with respect to age group (18-35)

In [23]:
r_u_m_1=r_u_m.loc[(r_u_m['age']>18) & (r_u_m['age']<35)]
r_u_m_1

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title
2,12,1193,4,978220179,M,25,12,32793,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
3,15,1193,4,978199279,M,25,7,22903,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
7,24,1193,5,978136709,F,25,7,10023,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
8,28,1193,3,978125194,F,25,1,14607,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
11,42,1193,3,978038981,M,25,8,24502,One Flew Over the Cuckoo's Nest (1975),Drama,1975,One Flew Over the Cuckoo's Nest
...,...,...,...,...,...,...,...,...,...,...,...,...
1000190,5543,404,3,960127592,M,25,17,97401,Brother Minister: The Assassination of Malcolm...,Documentary,1994,Brother Minister: The Assassination of Malcolm X
1000191,5220,2543,3,961546137,M,25,7,91436,Six Ways to Sunday (1997),Comedy,1997,Six Ways to Sunday
1000194,5795,591,1,958145253,M,25,1,92688,Tough and Deadly (1995),Action|Drama|Thriller,1995,Tough and Deadly
1000196,5328,2438,4,960838075,F,25,4,91740,Outside Ozona (1998),Drama|Thriller,1998,Outside Ozona


## Extracting the data for the given genre preferences

In [24]:
# choosing some differnt genres as the user choices for the movies
g = r_u_m_1.loc[r_u_m_1['genres'].str.contains('Comedy' and 'Drama' and 'Horror' and 'Sci-Fi' and 'Action')]
g

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip,title,genres,year,short_title
5905,3,1197,5,978297570,M,25,15,55117,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,1987,"Princess Bride, The"
5907,11,1197,5,978903297,F,25,1,04093,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,1987,"Princess Bride, The"
5912,24,1197,4,978132232,F,25,7,10023,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,1987,"Princess Bride, The"
5913,28,1197,5,978125233,F,25,1,14607,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,1987,"Princess Bride, The"
5917,36,1197,4,978210557,M,25,3,94123,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance,1987,"Princess Bride, The"
...,...,...,...,...,...,...,...,...,...,...,...,...
1000049,3314,3443,3,967947206,M,25,7,06516,Born American (1986),Action|Drama|Thriller,1986,Born American
1000052,6024,3443,4,956749779,M,25,12,53705,Born American (1986),Action|Drama|Thriller,1986,Born American
1000117,5717,2258,4,958509389,M,25,0,03766,Master Ninja I (1984),Action,1984,Master Ninja I
1000182,4874,624,4,962781918,F,25,4,70808,Condition Red (1995),Action|Drama|Thriller,1995,Condition Red


## Seperating male and female ratings from the data frame

In [25]:
male_choices = g.loc[g['gender'] == 'M']

In [26]:
female_choices = g.loc[g['gender'] == 'F']

### extracting the highest rated movie from the above dataset

In [27]:
g_max = g.groupby('title').agg({'rating': [np.size, np.mean]})
g_max_sort=g_max.sort_values(by=('rating','mean'),ascending = False)
g_max_sort

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Spiders, The (Die Spinnen, 1. Teil: Der Goldene See) (1919)",1,5.000000
Sanjuro (1962),26,4.615385
Raiders of the Lost Ark (1981),987,4.578521
"Godfather, The (1972)",817,4.574051
Star Wars: Episode IV - A New Hope (1977),1128,4.572695
...,...,...
3 Ninjas: High Noon On Mega Mountain (1998),11,1.363636
Time Tracers (1995),3,1.333333
Turbo: A Power Rangers Movie (1997),19,1.263158
In the Line of Duty 2 (1987),1,1.000000


### selecting minimum number of ratings as 1000

In [28]:
g_max_1000=g_max_sort.loc[g_max_sort[('rating','size')]>=1000]
g_max_1000

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Star Wars: Episode IV - A New Hope (1977),1128,4.572695
Star Wars: Episode V - The Empire Strikes Back (1980),1176,4.390306
"Matrix, The (1999)",1049,4.389895
Saving Private Ryan (1998),1017,4.291052
Star Wars: Episode VI - Return of the Jedi (1983),1134,4.058201
Terminator 2: Judgment Day (1991),1087,4.024839
Jurassic Park (1993),1000,3.702


In [29]:
g_max_1000.head(1)

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
Star Wars: Episode IV - A New Hope (1977),1128,4.572695


### Good rated movie for date night based on the user choices is 'Star Wars: Episode IV - A New Hope (1977)'.