## CAB420 Final Assignment


# Importing Library and Read Data from CSV files


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Spliting data to trainning and testing set
from sklearn.model_selection import train_test_split
# Fitting Multiple Linear Regression to the trainning set
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

# Import dataset
rating_dataset = pd.read_csv("ratings.csv")
movies_dataset = pd.read_csv("movies.csv")

# Set the index by movieId, This line of code only able to execute once
movies_dataset.set_index('movieId', inplace = True)

# Convert genres to dummy variable dataset
genresDummy = movies_dataset['genres'].str.get_dummies(sep='|')

In [2]:
# Get all the genres values from the dataset
def get_all_genres():
    # a variable contains all the genre types
    genres = list()

    for row in movies_dataset.values:
        #  Sperating the genre by |
        Typestemp = row[1].split('|')
        #  Read all these type and put it to list
        for movietype in Typestemp:
            genres.append(movietype)
    return set(genres)

In [3]:
# Add the dummy data back to the dataset
for genre in get_all_genres():
    movies_dataset[genre] = genresDummy[genre]
    
# Cannot use year as parameter because in movie 3xxxx there is a movie does not have years
# movies_dataset["year"] = movies_dataset["title"].str.extract(r"\(([0-9]+)\)").astype(dtype=np.int)

    
# Filtering duplicate values in the MovieGenre list
movies_dataset = movies_dataset.drop(columns=['genres', 'title'], axis=1)

movies_dataset

Unnamed: 0_level_0,Drama,Adventure,Romance,Crime,Mystery,Musical,Horror,Fantasy,IMAX,Thriller,Action,(no genres listed),Western,Comedy,Children,Animation,Sci-Fi,War,Documentary,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,1,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0
3,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
6,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
10,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0


In [4]:
full_rating_dataset = pd.merge(rating_dataset[["userId","movieId","rating"]], movies_dataset, on='movieId', how="left")
full_rating_dataset

Unnamed: 0,userId,movieId,rating,Drama,Adventure,Romance,Crime,Mystery,Musical,Horror,...,Action,(no genres listed),Western,Comedy,Children,Animation,Sci-Fi,War,Documentary,Film-Noir
0,1,2,3.5,0,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,1,29,3.5,1,1,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,1,32,3.5,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,1,47,3.5,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,50,3.5,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,112,3.5,0,1,0,1,0,0,0,...,1,0,0,1,0,0,0,0,0,0
6,1,151,4.0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
7,1,223,4.0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,1,253,4.0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,1,260,4.0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0


In [5]:
X = full_rating_dataset.drop(columns=['rating'], axis=1).values
y = full_rating_dataset.iloc[:,2].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

In [6]:
y_pred

array([3.73573319, 3.42932539, 3.4347534 , ..., 3.35988794, 3.3774498 ,
       3.92814558])

In [7]:
y_test

array([4.5, 3. , 1. , ..., 2. , 3. , 4. ])

In [8]:
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 2.49194953e-08 -7.55886659e-07  1.92370621e-01  8.89942540e-02
  1.40887020e-02  1.88694316e-01  1.26251916e-01  5.22969912e-02
 -2.07403947e-01  6.09272120e-02  1.33929683e-01 -4.28586032e-02
 -1.30704407e-01 -3.26781412e-01  5.69659882e-02 -1.08933561e-01
 -3.01651539e-01  3.18688399e-01  1.03372341e-02  2.45365465e-01
  2.80855194e-01  2.75877699e-01]
Mean squared error: 1.07
Variance score: 0.04


In [9]:
def predict_Rating_For_One_User(rating_dataset):
    # Randomly Generate a user for doing linear regression to predict what will he / she giving the rating on a movie
    userID = random.randint(1, full_rating_dataset['userId'].max() + 1)
    
    # Set the index by movieId, This line of code only able to execute once
    rating_dataset.set_index('userId', inplace = True)
    
    user_rating_dataset = rating_dataset.loc[userID,:]
    
    # Rating
    X = user_rating_dataset.drop(columns=['rating'], axis=1).values
    y = user_rating_dataset.iloc[:,2].values
    
    X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)
    
    regressor = LinearRegression()
    regressor.fit(X_train,y_train)

    y_pred = regressor.predict(X_test)
    print("Prediction : ")
    print(y_pred)
    print("True Value is :")
    print(y_tes)
    

In [10]:
# Add 1 to Xtrain represent parameter0
X_train = np.append(arr = np.ones(( X_train.shape[0],1 )), values = X_train, axis = 1)

In [11]:
X_opt = X_train[:,:]

# Backward Elimination
import statsmodels.api as smf

regressor_OLS = smf.OLS(endog= y_train, exog= X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,27200.0
Date:,"Tue, 21 May 2019",Prob (F-statistic):,0.0
Time:,20:20:01,Log-Likelihood:,-23219000.0
No. Observations:,16000210,AIC:,46440000.0
Df Residuals:,16000187,BIC:,46440000.0
Df Model:,22,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.4595,0.001,3776.725,0.000,3.458,3.461
x1,2.492e-08,6.45e-09,3.864,0.000,1.23e-08,3.76e-08
x2,-7.559e-07,1.37e-08,-55.344,0.000,-7.83e-07,-7.29e-07
x3,0.1924,0.001,296.869,0.000,0.191,0.194
x4,0.0890,0.001,117.769,0.000,0.088,0.090
x5,0.0141,0.001,19.857,0.000,0.013,0.015
x6,0.1887,0.001,240.713,0.000,0.187,0.190
x7,0.1263,0.001,120.962,0.000,0.124,0.128
x8,0.0523,0.001,37.447,0.000,0.050,0.055

0,1,2,3
Omnibus:,919063.91,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1085247.549
Skew:,-0.632,Prob(JB):,0.0
Kurtosis:,3.181,Cond. No.,18700000.0
