## CAB420 Final Assignment


# Importing Library and Read Data from CSV files


In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Spliting data to trainning and testing set
from sklearn.model_selection import train_test_split
# Fitting Multiple Linear Regression to the trainning set
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score

# Import dataset
rating_dataset = pd.read_csv("ratings.csv")
movies_dataset = pd.read_csv("movies.csv")

# Set the index by movieId, This line of code only able to execute once
movies_dataset.set_index('movieId', inplace = True)

# Convert genres to dummy variable dataset
genresDummy = movies_dataset['genres'].str.get_dummies(sep='|')

In [2]:
# Get all the genres values from the dataset
def get_all_genres():
    # a variable contains all the genre types
    genres = list()

    for row in movies_dataset.values:
        #  Sperating the genre by |
        Typestemp = row[1].split('|')
        #  Read all these type and put it to list
        for movietype in Typestemp:
            genres.append(movietype)
    return set(genres)

In [3]:
# Add the dummy data back to the dataset
for genre in get_all_genres():
    movies_dataset[genre] = genresDummy[genre]
    
# Cannot use year as parameter because in movie 3xxxx there is a movie does not have years
# movies_dataset["year"] = movies_dataset["title"].str.extract(r"\(([0-9]+)\)").astype(dtype=np.int)

# Filtering duplicate values in the MovieGenre list
movies_dataset = movies_dataset.drop(columns=['genres', 'title',"(no genres listed)","Western","IMAX"], axis=1)

movies_dataset

Unnamed: 0_level_0,Crime,Romance,Sci-Fi,Fantasy,Action,Film-Noir,Thriller,Children,Animation,Mystery,War,Documentary,Adventure,Comedy,Musical,Drama,Horror
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,0,0,0,1,0,0,0,1,1,0,0,0,1,1,0,0,0
2,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
6,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0
9,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
10,0,0,0,0,1,0,1,0,0,0,0,0,1,0,0,0,0


In [4]:
full_rating_dataset = pd.merge(rating_dataset[["userId","movieId","rating"]], movies_dataset, on = 'movieId', how = "left")
full_rating_dataset

Unnamed: 0,userId,movieId,rating,Crime,Romance,Sci-Fi,Fantasy,Action,Film-Noir,Thriller,Children,Animation,Mystery,War,Documentary,Adventure,Comedy,Musical,Drama,Horror
0,1,2,3.5,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0
1,1,29,3.5,0,0,1,1,0,0,0,0,0,1,0,0,1,0,0,1,0
2,1,32,3.5,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0
3,1,47,3.5,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
4,1,50,3.5,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
5,1,112,3.5,1,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0
6,1,151,4.0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
7,1,223,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
8,1,253,4.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1
9,1,260,4.0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0


## Linear Regression for Predicting a user how many marks will he/she giving to a movie according to he/she previous rating to other movie and the others how they rate this movie


In [5]:
X = full_rating_dataset.drop(columns=['rating'], axis=1).values
y = full_rating_dataset.iloc[:,2].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)

In [6]:
y_pred

array([3.75590251, 3.4421341 , 3.44366566, ..., 3.36007019, 3.38503921,
       3.93625747])

In [7]:
y_test

array([4.5, 3. , 1. , ..., 2. , 3. , 4. ])

In [8]:
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 2.48258052e-08 -4.96395413e-07  1.88575535e-01  1.34746914e-02
  1.16889047e-02  6.18599329e-02 -1.28658380e-01  2.72802698e-01
 -4.66922691e-02 -3.01783293e-01  3.22256316e-01  1.26064865e-01
  2.44107968e-01  2.73332618e-01  9.41127665e-02 -1.14611929e-01
  5.99616614e-02  1.90356617e-01 -2.10549452e-01]
Mean squared error: 1.07
Variance score: 0.04


In [9]:
# Add 1 to Xtrain represent parameter0
X_train = np.append(arr = np.ones(( X_train.shape[0],1 )), values = X_train, axis = 1)

In [10]:
X_opt = X_train[:,:]

# Backward Elimination
import statsmodels.api as smf

regressor_OLS = smf.OLS(endog= y_train, exog= X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,31140.0
Date:,"Thu, 23 May 2019",Prob (F-statistic):,0.0
Time:,21:47:04,Log-Likelihood:,-23222000.0
No. Observations:,16000210,AIC:,46440000.0
Df Residuals:,16000190,BIC:,46440000.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.4638,0.001,3811.760,0.000,3.462,3.466
x1,2.483e-08,6.45e-09,3.849,0.000,1.22e-08,3.75e-08
x2,-4.964e-07,1.32e-08,-37.620,0.000,-5.22e-07,-4.71e-07
x3,0.1886,0.001,240.837,0.000,0.187,0.190
x4,0.0135,0.001,19.000,0.000,0.012,0.015
x5,0.0117,0.001,14.622,0.000,0.010,0.013
x6,0.0619,0.001,67.152,0.000,0.060,0.064
x7,-0.1287,0.001,-178.733,0.000,-0.130,-0.127
x8,0.2728,0.003,106.069,0.000,0.268,0.278

0,1,2,3
Omnibus:,920632.514,Durbin-Watson:,2.0
Prob(Omnibus):,0.0,Jarque-Bera (JB):,1087396.489
Skew:,-0.632,Prob(JB):,0.0
Kurtosis:,3.181,Cond. No.,806000.0


## Linear Regression for predicting a user will rate to a new movies according to the previous rating he gave to the other movies only

In [11]:
# Randomly Generate a user for doing linear regression to predict what will he / she giving the rating on a movie
userID = random.randint(1, full_rating_dataset['userId'].max() + 1)

rating_df_for_one_user = full_rating_dataset.loc[(full_rating_dataset.userId == userID)]

X = rating_df_for_one_user.drop(columns=['rating','userId'], axis=1).values
y = rating_df_for_one_user.iloc[:,2].values

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 0)

regressor = LinearRegression()
regressor.fit(X_train,y_train)

y_pred = regressor.predict(X_test)
y_pred

array([3.58752825, 4.85049549, 3.74183095, 4.78172354, 4.26829915,
       4.22644049, 4.4572498 , 3.63977433, 4.79523291])

In [12]:
y_test

array([3.5, 3.5, 4.5, 4.5, 4. , 4.5, 4.5, 3. , 4.5])

In [13]:
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# Explained variance score: 1 is perfect prediction
print('Variance score: %.2f' % r2_score(y_test, y_pred))

Coefficients: 
 [ 2.60370174e-06 -7.94225946e-02  1.02056425e+00  3.86220901e-01
  1.19839642e+00  1.34849353e+00 -2.22044605e-16  5.68857579e-01
  5.44957622e-01  4.82982228e-01 -4.38678571e-01  1.76573462e-01
  0.00000000e+00  5.36913049e-02 -3.08004990e-01  0.00000000e+00
 -3.00715560e-01 -2.57299535e-02]
Mean squared error: 0.35
Variance score: -0.15


In [14]:
# Add 1 to Xtrain represent parameter0
X_train = np.append(arr = np.ones(( X_train.shape[0],1 )), values = X_train, axis = 1)

X_opt = X_train[:,:]

# Backward Elimination
import statsmodels.api as smf

regressor_OLS = smf.OLS(endog= y_train, exog= X_opt).fit()
regressor_OLS.summary()

  return self.params / self.bse
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,y,R-squared:,0.734
Model:,OLS,Adj. R-squared:,0.485
Method:,Least Squares,F-statistic:,2.942
Date:,"Thu, 23 May 2019",Prob (F-statistic):,0.0197
Time:,21:47:07,Log-Likelihood:,-24.373
No. Observations:,32,AIC:,80.75
Df Residuals:,16,BIC:,104.2
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,2.4767,0.495,5.005,0.000,1.428,3.526
x1,2.604e-06,1.09e-05,0.238,0.815,-2.06e-05,2.58e-05
x2,-0.0794,0.392,-0.202,0.842,-0.911,0.752
x3,1.0206,0.662,1.541,0.143,-0.383,2.424
x4,0.3862,0.683,0.566,0.580,-1.061,1.834
x5,1.1984,0.519,2.311,0.034,0.099,2.298
x6,1.3485,0.418,3.224,0.005,0.462,2.235
x7,-2.156e-16,3.62e-16,-0.596,0.559,-9.82e-16,5.51e-16
x8,0.5689,0.444,1.280,0.219,-0.373,1.511

0,1,2,3
Omnibus:,1.548,Durbin-Watson:,2.033
Prob(Omnibus):,0.461,Jarque-Bera (JB):,0.615
Skew:,-0.261,Prob(JB):,0.735
Kurtosis:,3.434,Cond. No.,4.69e+36


In [15]:
rating_df_for_one_user.drop(columns=['rating','userId'], axis=1)

Unnamed: 0,movieId,Crime,Romance,Sci-Fi,Fantasy,Action,Film-Noir,Thriller,Children,Animation,Mystery,War,Documentary,Adventure,Comedy,Musical,Drama,Horror
14582909,110,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0
14582910,267,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
14582911,293,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0
14582912,296,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0
14582913,541,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
14582914,589,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
14582915,745,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0
14582916,934,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
14582917,1089,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0
14582918,1148,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0
