In [2]:
import pandas as pd


In [92]:
#import of review data
cols = ["user id","item id","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
df_data = pd.read_csv("ml-100k/u.data",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

In [93]:
#verifying the sucessful import of review data
print(df_data.head())

   user id  item id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596


In [94]:
#import of moviedata
cols = ["movie id",
        "movie title",
        "release date",
        "video release date",
        "IMDb URL","unknown",
        "Action",
        "Adventure",
        "Animation",
        "Children's",
        "Comedy",
        "Crime",
        "Documentary",
        "Drama",
        "Fantasy",
        "Film-Noir",
        "Horror",
        "Musical",
        "Mystery",
        "Romance",
        "Sci-Fi",
        "Thriller",
        "War",
        "Western"]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [95]:
#verifying the sucessful import of movie data
print(df_movie.head())

   movie id        movie title release date  video release date  \
0         1   Toy Story (1995)  01-Jan-1995                 NaN   
1         2   GoldenEye (1995)  01-Jan-1995                 NaN   
2         3  Four Rooms (1995)  01-Jan-1995                 NaN   
3         4  Get Shorty (1995)  01-Jan-1995                 NaN   
4         5     Copycat (1995)  01-Jan-1995                 NaN   

                                            IMDb URL  unknown  Action  \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...        0       0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...        0       1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...        0       0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...        0       1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)        0       0   

   Adventure  Animation  Children's  ...  Fantasy  Film-Noir  Horror  Musical  \
0          0          1           1  ...        0          0       0        0

In [96]:
#import of user data
cols = ["user id","age","gender","occupation","zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [97]:
#verifying the sucessful import of user data
print(df_user.head())

   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213


In [98]:
#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

#the bins are of unequal size due to repeating values in a bin
df_user['age_group'].value_counts()

(6.0, 20.0]     109
(23.0, 26.0]    105
(35.0, 40.0]    100
(31.0, 35.0]     98
(29.0, 31.0]     96
(40.0, 46.0]     94
(46.0, 51.0]     93
(20.0, 23.0]     92
(51.0, 73.0]     85
(26.0, 29.0]     71
Name: age_group, dtype: int64

In [99]:
df_movie.drop(["movie id",
               "movie title",
               "release date",
               "video release date",
               "IMDb URL",
               "unknown"],axis=1).sum(axis = 0, skipna = True)

Action         251
Adventure      135
Animation       42
Children's     122
Comedy         505
Crime          109
Documentary     50
Drama          725
Fantasy         22
Film-Noir       24
Horror          92
Musical         56
Mystery         61
Romance        247
Sci-Fi         101
Thriller       251
War             71
Western         27
dtype: int64

In [100]:
df = pd.merge(pd.merge(df_data,
                  df_user[["user id",
                           "age",
                           "gender",
                           "occupation"]],
                  on='user id',
                  how='left'),
              df_movie.drop(['IMDb URL'],axis=1),
              left_on = 'item id',
              right_on = 'movie id',
              how ='left')
              

In [101]:
df.head()

Unnamed: 0,user id,item id,rating,timestamp,age,gender,occupation,movie id,movie title,release date,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,242,Kolya (1996),24-Jan-1997,...,0,0,0,0,0,0,0,0,0,0
1,186,302,3,891717742,39,F,executive,302,L.A. Confidential (1997),01-Jan-1997,...,0,1,0,0,1,0,0,1,0,0
2,22,377,1,878887116,25,M,writer,377,Heavyweights (1994),01-Jan-1994,...,0,0,0,0,0,0,0,0,0,0
3,244,51,2,880606923,28,M,technician,51,Legends of the Fall (1994),01-Jan-1994,...,0,0,0,0,0,1,0,0,1,1
4,166,346,1,886397596,47,M,educator,346,Jackie Brown (1997),01-Jan-1997,...,0,0,0,0,0,0,0,0,0,0


In [102]:
def release_year(row):
    return str(row['release date'])[-4:]
    
def release_month(row):
    return str(row['release date'])[3:6]
def release_date(row):
    return str(row['release date'])[:2]

df['release_year'] =  df.apply(lambda row: release_year(row), axis=1)
df['release_month'] =  df.apply(lambda row: release_month(row), axis=1)
df['release_day'] =  df.apply(lambda row: release_date(row), axis=1)

df = df.drop(['release date'])

KeyError: "['release date'] not found in axis"

In [103]:
df[['release_year','release_month','release_day']]

Unnamed: 0,release_year,release_month,release_day
0,1997,Jan,24
1,1997,Jan,01
2,1994,Jan,01
3,1994,Jan,01
4,1997,Jan,01
...,...,...,...
99995,1996,Sep,14
99996,1985,Jan,01
99997,1993,Jan,01
99998,1996,Nov,27


In [120]:
df_job_genre = df[['occupation',
                   "Action",
                   "Adventure",
                   "Animation",
                   "Children's",
                   "Comedy",
                   "Crime",
                   "Documentary",
                   "Drama",
                   "Fantasy",
                   "Film-Noir",
                   "Horror",
                   "Musical",
                   "Mystery",
                   "Romance",
                   "Sci-Fi",
                   "Thriller",
                   "War",
                   "Western"]]


df_job_genre = df_job_genre.groupby(['occupation']).sum()

In [121]:
df_job_genre

Unnamed: 0_level_0,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
administrator,1858,1020,203,462,2203,565,48,3099,94,147,391,354,385,1539,942,1570,786,168
artist,528,311,121,202,617,201,34,957,24,64,102,157,164,462,308,476,225,29
doctor,110,57,22,28,168,41,13,238,7,9,18,27,27,133,56,117,44,5
educator,1962,1128,263,593,2708,710,107,4281,112,202,421,481,532,2006,986,1767,1005,189
engineer,2277,1299,363,629,2438,611,51,3153,125,145,414,441,363,1467,1221,1712,858,180
entertainment,499,249,62,104,574,207,23,804,25,47,192,68,132,342,289,554,174,27
executive,808,436,91,232,951,287,26,1407,50,58,155,131,203,592,391,811,312,49
healthcare,577,323,89,213,722,200,45,1297,38,50,120,126,140,499,266,563,230,47
homemaker,92,41,9,25,93,25,2,104,3,4,9,15,32,59,30,95,28,0
lawyer,283,142,35,83,460,107,10,543,14,36,60,72,78,272,141,247,142,35


In [117]:
df_job_genre = df_job_genre.idxmax(axis=1)


TypeError: reduction operation 'argmax' not allowed for this dtype

In [118]:
df_job_genre

occupation
administrator     Drama
artist            Drama
doctor            Drama
educator          Drama
engineer          Drama
entertainment     Drama
executive         Drama
healthcare        Drama
homemaker         Drama
lawyer            Drama
librarian         Drama
marketing         Drama
none             Action
other             Drama
programmer        Drama
retired           Drama
salesman          Drama
scientist         Drama
student           Drama
technician        Drama
writer            Drama
dtype: object