In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

# Spliting data to trainning and testing set
from sklearn.model_selection import train_test_split
# Fitting Multiple Linear Regression to the trainning set
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score

# Matplotlib setting 
%config InlineBackend.print_figure_kwargs = {'bbox_inches':None}
%config InlineBackend.rc = {'font.size': 30, 'figure.figsize': (30.0, 20.0), 'figure.facecolor': (1, 1, 1, 0), 'figure.subplot.bottom': 0.125, 'figure.edgecolor': (1, 1, 1, 0), 'figure.dpi': 500}

In [2]:
# Import dataset
rating_dataset = pd.read_csv("./data/ratings.csv")
movies_dataset = pd.read_csv("test.csv")

# Set the index by movieId, This line of code only able to execute once
movies_dataset.set_index('movieId', inplace = True)

full_rating_dataset = pd.merge(rating_dataset[["userId","movieId","rating"]], movies_dataset[["genre"]], on = 'movieId', how = "left")
full_rating_dataset['rating'] = full_rating_dataset['rating'] * 2
full_rating_dataset['rating'] = full_rating_dataset['rating'].astype(int)

full_rating_dataset

Unnamed: 0,userId,movieId,rating,genre
0,1,1,8,286
1,1,3,8,16400
2,1,6,8,65569
3,1,47,10,73728
4,1,50,10,73760
5,1,70,6,66577
6,1,101,10,16434
7,1,110,8,131201
8,1,151,10,147585
9,1,157,10,131088


In [3]:
X_df = full_rating_dataset.drop(columns=['rating'], axis=1)
y_df = full_rating_dataset.iloc[:,2]

X = X_df.values
y = y_df.values

from sklearn.linear_model import LogisticRegression

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3)

# Fit one-vs-rest logistic regression classifier
lr_ovr = LogisticRegression()
lr_ovr.fit(X_train, y_train)

print("OVR training accuracy:", lr_ovr.score(X_train, y_train))
print("OVR test accuracy    :", lr_ovr.score(X_test, y_test))

# Fit softmax classifier
lr_mn = LogisticRegression(multi_class="multinomial",solver="lbfgs")
lr_mn.fit(X_train, y_train)

print("Softmax training accuracy:", lr_mn.score(X_train, y_train))
print("Softmax test accuracy    :", lr_mn.score(X_test, y_test))



OVR training accuracy: 0.2663880427852943
OVR test accuracy    : 0.2590327592476282
Softmax training accuracy: 0.24370616986611887
Softmax test accuracy    : 0.24081848533932762




In [4]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_train, y_train)

clf.score(X_train,y_train)

1.0

In [5]:
clf.score(X_test,y_test)

0.2356946877789164

In [6]:
yhat = clf.predict(X_test)

yhat

array([10,  6, 10, ...,  8,  5,  9])

In [7]:
y_test

array([8, 8, 6, ..., 8, 6, 5])