In [100]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

%matplotlib inline

Importing Data

In [101]:
# Load the ratings file. It is tab-separated and the column names are supplied
# explicitly through `names`.
# The path uses a forward slash, which works on Windows, macOS and Linux alike
# (a backslash separator only resolves on Windows).
data = pd.read_csv(
    'ml-100k/u.data',
    names=["user id", "item id", "rating", "timestamp"],
    sep="\t",
)
data.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [102]:
# Load the movie catalogue. It is pipe-separated and read as latin-1, with the
# column names supplied explicitly: five descriptive columns followed by one
# 0/1 indicator column per genre.
# The path uses a forward slash, which works on Windows, macOS and Linux alike
# (a backslash separator only resolves on Windows).
item = pd.read_csv(
    'ml-100k/u.item',
    names=[
        "movie id", "movie title", "release date", "video release date", "IMDb URL",
        "unknown", "Action", "Adventure", "Animation", "Childrens", "Comedy",
        "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
        "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
    ],
    sep='|',
    encoding="latin-1",
)
item.head()

Unnamed: 0,movie id,movie title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Childrens,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [103]:
# Load the user demographics file. It is pipe-separated and the column names
# are supplied explicitly through `names`.
# The path uses a forward slash, which works on Windows, macOS and Linux alike
# (a backslash separator only resolves on Windows).
user = pd.read_csv(
    'ml-100k/u.user',
    names=["user id", "age", "gender", "occupation", "zip code"],
    sep='|',
)
user.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


Cleaning Data

        Need to learn data cleaning in more depth. For now:
            1. Dropping rows that contain NaN values (still need to learn how to clean NaN data properly).
            2. Converting columns to the dtypes they need.

In [104]:
# Tally how many rating entries are missing (True) versus present (False).
pd.isna(data['rating']).value_counts()

False    100000
Name: rating, dtype: int64

In [105]:
# Tally how many age entries are missing (True) versus present (False).
user['age'].isnull().value_counts()

False    943
Name: age, dtype: int64

In [106]:
# Tally how many gender entries are missing (True) versus present (False).
user.loc[:, 'gender'].isna().value_counts()

False    943
Name: gender, dtype: int64

In [107]:
# Tally how many occupation entries are missing (True) versus present (False).
pd.isnull(user['occupation']).value_counts()

False    943
Name: occupation, dtype: int64

In [108]:
# Tally how many release-date entries are missing (True) versus present (False).
item['release date'].isnull().value_counts()

False    1681
True        1
Name: release date, dtype: int64

In [109]:
# Count the missing values in every column of the ratings frame.
data.isnull().sum()

user id      0
item id      0
rating       0
timestamp    0
dtype: int64

In [110]:
# Count the missing values in every column of the user frame.
pd.isna(user).sum()

user id       0
age           0
gender        0
occupation    0
zip code      0
dtype: int64

In [111]:
# Count the missing values in every column of the movie frame.
item.isnull().sum(axis=0)

movie id                 0
movie title              0
release date             1
video release date    1682
IMDb URL                 3
unknown                  0
Action                   0
Adventure                0
Animation                0
Childrens                0
Comedy                   0
Crime                    0
Documentary              0
Drama                    0
Fantasy                  0
Film-Noir                0
Horror                   0
Musical                  0
Mystery                  0
Romance                  0
Sci-Fi                   0
Thriller                 0
War                      0
Western                  0
dtype: int64

In [112]:
# Remove the 'video release date' column, then drop every row that still
# contains a NaN in any remaining column.
# drop() is used without inplace=True and with errors='ignore', so re-running
# this cell is a harmless no-op instead of raising KeyError on the
# already-removed column.
item = item.drop(columns=['video release date'], errors='ignore').dropna()

In [113]:
# Inspect the column dtypes of the ratings frame.
data.dtypes

user id      int64
item id      int64
rating       int64
timestamp    int64
dtype: object

In [114]:
# Inspect the column dtypes of the user frame.
user.dtypes

user id        int64
age            int64
gender        object
occupation    object
zip code      object
dtype: object

In [115]:
# Inspect the column dtypes of the movie frame before any type conversion.
item.dtypes

movie id         int64
movie title     object
release date    object
IMDb URL        object
unknown          int64
Action           int64
Adventure        int64
Animation        int64
Childrens        int64
Comedy           int64
Crime            int64
Documentary      int64
Drama            int64
Fantasy          int64
Film-Noir        int64
Horror           int64
Musical          int64
Mystery          int64
Romance          int64
Sci-Fi           int64
Thriller         int64
War              int64
Western          int64
dtype: object

In [116]:
# Parse the day-month-year strings (e.g. "01-Jan-1995") into datetime values,
# then list the dtypes to confirm the column was converted.
item['release date'] = item['release date'].pipe(pd.to_datetime, format='%d-%b-%Y')
item.dtypes

movie id                 int64
movie title             object
release date    datetime64[ns]
IMDb URL                object
unknown                  int64
Action                   int64
Adventure                int64
Animation                int64
Childrens                int64
Comedy                   int64
Crime                    int64
Documentary              int64
Drama                    int64
Fantasy                  int64
Film-Noir                int64
Horror                   int64
Musical                  int64
Mystery                  int64
Romance                  int64
Sci-Fi                   int64
Thriller                 int64
War                      int64
Western                  int64
dtype: object

Visualisations

1. Genre change cycle over the years

In [131]:
# Extract the year part of each release date with the vectorised .dt accessor
# rather than a row-by-row Python loop.
# `year` is kept as a plain list so any later cell that reads it still works.
year = item['release date'].dt.year.tolist()

item['Year'] = year
# Order the catalogue with the most recent release years first.
item = item.sort_values(by='Year', ascending=False)

In [151]:
# Keep every column from 'Action' onwards (the genre indicators plus 'Year')
# and group by release year. The slice is label-based so it does not depend on
# how many descriptive columns precede the genre flags.
item_by_year = item.loc[:, 'Action':].groupby('Year')
# first() displays the first non-null entry of each column within each year.
# NOTE(review): this previews a single row per year; if per-year genre totals
# are what the "genre change" section needs, an aggregate such as .sum() would
# be required — confirm the intent.
item_by_year.first()

Unnamed: 0_level_0,Action,Adventure,Animation,Childrens,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1922,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
1926,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1930,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1931,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0
1932,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1994,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1995,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
