## CAB420 Final Assignment


# Importing Libraries and Reading Data from CSV Files


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Splitting data into training and testing sets
from sklearn.model_selection import train_test_split
# Fitting multiple linear regression to the training set
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

## Data Preprocessing

In [2]:
# Load the raw user ratings
rating_dataset = pd.read_csv("ratings.csv")

# Build a one-hot genre table keyed by movieId: index the movie catalogue by
# movieId, split the pipe-separated `genres` string into indicator columns,
# then remove the genre columns that are not used as features.
# Release year is deliberately not extracted from `title`: per the original
# author's note, at least one title has no parenthesised year.
movies_dataset = (
    pd.read_csv("movies.csv")
    .set_index("movieId")["genres"]
    .str.get_dummies(sep="|")
    .drop(columns=["(no genres listed)", "Western", "IMAX", "Film-Noir", "Children"])
)

movies_dataset

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,0,1,1,1,0,0,0,1,0,0,0,0,0,0,0
2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0
5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0
7,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
8,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
10,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0


## Combine the Rating and Movie Datasets into One Dataset

In [3]:
# Attach each movie's genre indicator columns to every rating row.
# A left join keeps every rating row, whether or not its movieId has a genre row.
ratings_subset = rating_dataset[["userId", "movieId", "rating"]]
full_rating_dataset = ratings_subset.merge(movies_dataset, on="movieId", how="left")
full_rating_dataset

Unnamed: 0,userId,movieId,rating,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War
0,1,2,3.5,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,1,29,3.5,0,1,0,0,0,0,1,1,0,0,1,0,1,0,0
2,1,32,3.5,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0
3,1,47,3.5,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
4,1,50,3.5,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0
5,1,112,3.5,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0
6,1,151,4.0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1
7,1,223,4.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,1,253,4.0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0
9,1,260,4.0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0


## Linear Regression: predicting the rating a user will give a movie, based on his/her previous ratings of other movies and on how other users rated this movie


In [None]:
# Features are every column except the target; the target is the `rating` column.
# NOTE(review): this keeps userId and movieId as plain numeric predictors even
# though they look like identifiers rather than quantities - confirm this is intended.
feature_frame = full_rating_dataset.drop(columns=["rating"])
X = feature_frame.values
y = full_rating_dataset["rating"].values

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Fit ordinary least squares on the training rows, then score the held-out rows
regressor = LinearRegression().fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [None]:
# Show the predictions rounded to whole numbers for a quick visual comparison
np.round(y_pred)

array([4., 3., 3., ..., 3., 3., 4.])

In [None]:
# Actual held-out ratings, displayed for comparison with the rounded predictions
y_test

array([4.5, 3. , 1. , ..., 2. , 3. , 4. ])

In [None]:
# Fitted weight of each feature column
print('Coefficients: \n', regressor.coef_)

# Average squared difference between held-out ratings and predictions
test_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % test_mse)

# r2_score is the coefficient of determination (R^2): 1 is a perfect prediction
test_r2 = r2_score(y_test, y_pred)
print('Variance score: %.2f' % test_r2)

Coefficients: 
 [ 2.46376409e-08 -3.64357282e-07 -1.14819494e-01  8.01997884e-02
  1.31153519e-01 -1.19192325e-01  2.05258933e-01  2.93772763e-01
  1.96961436e-01  2.38875631e-02 -1.98096048e-01  1.57955178e-02
  1.44707401e-01  3.17527700e-02  1.84426538e-02 -3.85869977e-02
  2.51741640e-01]
Mean squared error: 1.07
Variance score: 0.03


In [None]:
# Prepend a column of ones to X_train so the OLS fit gets an intercept term
# (parameter 0). X_test still has the original feature count, so comparing the
# two widths shows whether the column was already added. This keeps the cell
# safe to re-run instead of stacking a second, collinear constant column.
if X_train.shape[1] == X_test.shape[1]:
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])

In [None]:
# statsmodels provides the OLS summary table used for backward elimination
import statsmodels.api as smf

# Starting design matrix: every column of X_train, no predictor removed yet
X_opt = X_train[:, :]

# Fit ordinary least squares and display the per-coefficient statistics
ols_model = smf.OLS(endog=y_train, exog=X_opt)
regressor_OLS = ols_model.fit()
regressor_OLS.summary()

## Linear Regression: predicting the rating a user will give a new movie, based only on the ratings he/she previously gave to other movies

In [None]:
# Pick one user at random from the IDs that actually occur in the data.
# random.randint is inclusive at both ends, so drawing from 1..max + 1 could
# select an ID with no ratings, leaving an empty frame that cannot be split or fit.
known_user_ids = full_rating_dataset['userId'].unique().tolist()
userID = random.choice(known_user_ids)

# All rating rows belonging to the selected user
rating_df_for_one_user = full_rating_dataset.loc[full_rating_dataset.userId == userID]

# userId is constant within one user's rows, so it is dropped along with the target
X = rating_df_for_one_user.drop(columns=['rating', 'userId']).values
y = rating_df_for_one_user['rating'].values

# Hold out 20% of this user's ratings; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

# Show the predictions rounded to whole numbers
y_pred.round()

In [None]:
# Actual held-out ratings for the selected user, for comparison with the predictions
y_test

In [None]:
# Fitted weight of each feature column for the selected user's model
print('Coefficients: \n', regressor.coef_)

# Average squared difference between held-out ratings and predictions
user_mse = mean_squared_error(y_test, y_pred)
print("Mean squared error: %.2f" % user_mse)

# r2_score is the coefficient of determination (R^2): 1 is a perfect prediction
user_r2 = r2_score(y_test, y_pred)
print('Variance score: %.2f' % user_r2)

In [None]:
# statsmodels provides the OLS summary table used for backward elimination
import statsmodels.api as smf

# Prepend a column of ones to X_train so the OLS fit gets an intercept term
# (parameter 0). X_test still has the original feature count, so comparing the
# two widths shows whether the column was already added. This keeps the cell
# safe to re-run instead of stacking a second, collinear constant column.
if X_train.shape[1] == X_test.shape[1]:
    X_train = np.hstack([np.ones((X_train.shape[0], 1)), X_train])

# Starting design matrix: every column of X_train, no predictor removed yet
X_opt = X_train[:, :]

# Fit ordinary least squares and display the per-coefficient statistics
regressor_OLS = smf.OLS(endog=y_train, exog=X_opt).fit()
regressor_OLS.summary()

In [None]:
# Preview the per-user feature table: everything except the target and the user id
rating_df_for_one_user.drop(columns=['rating', 'userId'])