In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import numpy as np
#from sklearn import tree


In [2]:
#import of review data
cols = ["user id"," movie id ","rating","timestamp"]
#encoding using ISO-8859-1 is used because utf-8 does not support all the characters in movie names
df_data = pd.read_csv("ml-100k/u.data",sep="\t",names=cols,header=None,encoding="ISO-8859-1")

In [3]:
#verifying the sucessful import of review data
print(df_data.head())

   user id   movie id   rating  timestamp
0      196         242       3  881250949
1      186         302       3  891717742
2       22         377       1  878887116
3      244          51       2  880606923
4      166         346       1  886397596


In [4]:
#import of moviedata
cols = [" movie id "," movie title "," release date "," video release date ","IMDb URL "," unknown ",
        " Action "," Adventure "," Animation ","Children's "," Comedy "," Crime "," Documentary ",
        " Drama "," Fantasy ","Film-Noir "," Horror "," Musical "," Mystery "," Romance "," Sci-Fi ",
        "Thriller "," War "," Western "]

df_movie = pd.read_csv("ml-100k/u.item",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [5]:
#verifying the sucessful import of movie data
print(df_movie.head())

    movie id        movie title   release date    video release date   \
0           1   Toy Story (1995)    01-Jan-1995                   NaN   
1           2   GoldenEye (1995)    01-Jan-1995                   NaN   
2           3  Four Rooms (1995)    01-Jan-1995                   NaN   
3           4  Get Shorty (1995)    01-Jan-1995                   NaN   
4           5     Copycat (1995)    01-Jan-1995                   NaN   

                                           IMDb URL    unknown    Action   \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   
1  http://us.imdb.com/M/title-exact?GoldenEye%20(...          0         1   
2  http://us.imdb.com/M/title-exact?Four%20Rooms%...          0         0   
3  http://us.imdb.com/M/title-exact?Get%20Shorty%...          0         1   
4  http://us.imdb.com/M/title-exact?Copycat%20(1995)          0         0   

    Adventure    Animation   Children's   ...   Fantasy   Film-Noir   \
0            0            

In [6]:
#import of user data
cols = ["user id","age","gender","occupation","zip code"]
df_user = pd.read_csv("ml-100k/u.user",sep="|",names=cols,header=None,encoding="ISO-8859-1")

In [7]:
#verifying the sucessful import of user data
print(df_user.head())

   user id  age gender  occupation zip code
0        1   24      M  technician    85711
1        2   53      F       other    94043
2        3   23      M      writer    32067
3        4   24      M  technician    43537
4        5   33      F       other    15213


In [8]:
#frequency binning the ages into age groups as it will be easier for future analysis
df_user['age_group'] = pd.qcut(df_user['age'],q=10,precision=0)

#the bins are of unequal size due to repeating values in a bin
df_user['age_group'].value_counts()

(6.0, 20.0]     109
(23.0, 26.0]    105
(35.0, 40.0]    100
(31.0, 35.0]     98
(29.0, 31.0]     96
(40.0, 46.0]     94
(46.0, 51.0]     93
(20.0, 23.0]     92
(51.0, 73.0]     85
(26.0, 29.0]     71
Name: age_group, dtype: int64

In [9]:
df_movie=df_movie.drop([" video release date "],axis=1)

In [10]:
df_movie.head()

Unnamed: 0,movie id,movie title,release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [11]:
df_mdata=pd.merge(df_user,df_data)

In [12]:
df_mdata.head()

Unnamed: 0,user id,age,gender,occupation,zip code,age_group,movie id,rating,timestamp
0,1,24,M,technician,85711,"(23.0, 26.0]",61,4,878542420
1,1,24,M,technician,85711,"(23.0, 26.0]",189,3,888732928
2,1,24,M,technician,85711,"(23.0, 26.0]",33,4,878542699
3,1,24,M,technician,85711,"(23.0, 26.0]",160,4,875072547
4,1,24,M,technician,85711,"(23.0, 26.0]",20,4,887431883


In [13]:
df_edata=pd.merge(df_movie,df_mdata)

In [14]:
print(df_edata.head())

    movie id       movie title   release date   \
0           1  Toy Story (1995)    01-Jan-1995   
1           1  Toy Story (1995)    01-Jan-1995   
2           1  Toy Story (1995)    01-Jan-1995   
3           1  Toy Story (1995)    01-Jan-1995   
4           1  Toy Story (1995)    01-Jan-1995   

                                           IMDb URL    unknown    Action   \
0  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   
1  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   
2  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   
3  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   
4  http://us.imdb.com/M/title-exact?Toy%20Story%2...          0         0   

    Adventure    Animation   Children's    Comedy   ...   War    Western   \
0            0            1            1         1  ...      0          0   
1            0            1            1         1  ...      0          0   
2    

In [15]:
df_edata = df_edata.drop([" movie title "," release date ","IMDb URL ","age","zip code"],axis=1)

In [16]:
df_edata.head()

Unnamed: 0,movie id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Sci-Fi,Thriller,War,Western,user id,gender,occupation,age_group,rating,timestamp
0,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,1,M,technician,"(23.0, 26.0]",5,874965758
1,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,2,F,other,"(51.0, 73.0]",4,888550871
2,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,5,F,other,"(31.0, 35.0]",4,875635748
3,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,6,M,executive,"(40.0, 46.0]",4,883599478
4,1,0,0,0,1,1,1,0,0,0,...,0,0,0,0,10,M,lawyer,"(51.0, 73.0]",4,877888877


In [17]:
df_edata = pd.concat([df_edata,pd.get_dummies(df_edata['gender'], prefix='gender')],axis=1)
df_edata.drop(['gender'],axis=1, inplace=True)
df_edata = pd.concat([df_edata,pd.get_dummies(df_edata['occupation'], prefix='occupation')],axis=1)
df_edata.drop(['occupation'],axis=1, inplace=True)
df_edata = pd.concat([df_edata,pd.get_dummies(df_edata['age_group'], prefix='age_group')],axis=1)
df_edata.drop(['age_group'],axis=1, inplace=True)

In [18]:
print(df_edata)

        movie id    unknown    Action    Adventure    Animation   Children's   \
0               1          0         0            0            1            1   
1               1          0         0            0            1            1   
2               1          0         0            0            1            1   
3               1          0         0            0            1            1   
4               1          0         0            0            1            1   
...           ...        ...       ...          ...          ...          ...   
99995        1678          0         0            0            0            0   
99996        1679          0         0            0            0            0   
99997        1680          0         0            0            0            0   
99998        1681          0         0            0            0            0   
99999        1682          0         0            0            0            0   

        Comedy    Crime    

In [19]:
columns = [" unknown "," Action "," Adventure "," Animation ","Children's "," Comedy "," Crime "," Documentary ",
            " Drama "," Fantasy ","Film-Noir "," Horror "," Musical "," Mystery "," Romance "," Sci-Fi ", "Thriller ",
           " War "," Western ","gender_M","gender_F","occupation_administrator","occupation_artist","occupation_doctor",
           "occupation_educator","occupation_engineer","occupation_entertainment","occupation_executive","occupation_healthcare",
           "occupation_homemaker","occupation_lawyer","occupation_librarian","occupation_marketing","occupation_none",
          "occupation_other","occupation_programmer","occupation_retired","occupation_salesman","occupation_scientist",
           "occupation_student","occupation_technician","occupation_writer","age_group_(6.0, 20.0]","age_group_(20.0, 23.0]",
           "age_group_(23.0, 26.0]","age_group_(26.0, 29.0]","age_group_(29.0, 31.0]","age_group_(31.0, 35.0]","age_group_(35.0, 40.0]",
           "age_group_(40.0, 46.0]","age_group_(46.0, 51.0]","age_group_(51.0, 73.0]","rating"," movie id ","user id","timestamp"]

df_edata = pd.DataFrame(data=df_edata, columns=columns) 

In [20]:
print(df_edata.head())

    unknown    Action    Adventure    Animation   Children's    Comedy   \
0          0         0            0            1            1         1   
1          0         0            0            1            1         1   
2          0         0            0            1            1         1   
3          0         0            0            1            1         1   
4          0         0            0            1            1         1   

    Crime    Documentary    Drama    Fantasy   ...  age_group_(29.0, 31.0]  \
0        0              0        0          0  ...                       0   
1        0              0        0          0  ...                       0   
2        0              0        0          0  ...                       0   
3        0              0        0          0  ...                       0   
4        0              0        0          0  ...                       0   

   age_group_(31.0, 35.0]  age_group_(35.0, 40.0]  age_group_(40.0, 46.0]  \
0  

In [21]:
df_edata = df_edata.groupby('rating').head(3000)

In [22]:
X = df_edata.drop(['rating'],axis=1)
Y = df_edata['rating']

In [23]:
X.head()

Unnamed: 0,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,"age_group_(26.0, 29.0]","age_group_(29.0, 31.0]","age_group_(31.0, 35.0]","age_group_(35.0, 40.0]","age_group_(40.0, 46.0]","age_group_(46.0, 51.0]","age_group_(51.0, 73.0]",movie id,user id,timestamp
0,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,1,874965758
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,2,888550871
2,0,0,0,1,1,1,0,0,0,0,...,0,0,1,0,0,0,0,1,5,875635748
3,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,1,0,0,1,6,883599478
4,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,1,1,10,877888877


In [24]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.3, random_state = 100)


In [25]:
regr = LinearRegression()
regr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
y_pred = regr.predict(x_test)

In [27]:
r_score = r2_score(y_test,y_pred)

In [28]:
y_test.shape

(4500,)

In [29]:
y_test=y_test.values.reshape(1,4500)

In [30]:
df = pd.DataFrame({'Actual':y_test.flatten(),'Predicted':y_pred.flatten()})

In [31]:
df = df.round(decimals=0)

In [32]:
print(df)

      Actual  Predicted
0          2        3.0
1          1        2.0
2          2        2.0
3          1        2.0
4          5        3.0
...      ...        ...
4495       2        3.0
4496       3        3.0
4497       1        1.0
4498       1        1.0
4499       4        4.0

[4500 rows x 2 columns]


In [33]:
print(r_score)

0.4810877158555832


In [34]:
print(y_pred,y_test)

[3.0962686  1.97260833 2.29305323 ... 1.15519133 1.17319937 3.88202287] [[2 1 2 ... 1 1 4]]


In [35]:
count = df_edata['rating'].value_counts()

In [36]:
print(count)

5    3000
4    3000
3    3000
2    3000
1    3000
Name: rating, dtype: int64
