In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training ratings from train.csv. The preview below shows one row
# per rating with userID, movieID and rating columns.
raw_data = pd.read_csv("train.csv")
raw_data.head()

Unnamed: 0,userID,movieID,rating
0,4490,2109,4
1,5839,3471,4
2,5382,150,3
3,1262,1237,5
4,6005,2273,4


In [3]:
# Reshape the long ratings table into a movie x user matrix: one row per
# movieID, one column per userID, NaN wherever a user has not rated a movie.
pivot_ii = raw_data.pivot(
    index="movieID",
    columns="userID",
    values="rating",
)
pivot_ii.head()

userID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.0,,,,,4.0,,,4.0,,...,5.0,5.0,4.0,,4.0,,,,,5.0
2,,,,,,,,,,,...,5.0,,,,,,,,,
3,,,,,,1.0,,,,,...,,,,,,,,,,
4,,,,,,2.0,,,,,...,,,3.0,,,,,,,
5,,,,,,1.0,,,,,...,,,,,,,,,,


In [4]:
# Single scalar baseline rating. Note that this is the mean of the per-user
# (column) means, so every user weighs equally regardless of how many movies
# they rated; it is not the mean over all individual ratings.
all_mean = pivot_ii.mean(axis=0).mean()

In [5]:
# Size of the rating matrix: (number of movies, number of users).
pivot_ii.shape

(3705, 6040)

In [6]:
# Centre every movie (row) on its own mean rating, then replace the missing
# entries with 0, i.e. treat an unrated movie as "exactly average" for that user.
scaled_df = pivot_ii.sub(pivot_ii.mean(axis=1), axis=0).fillna(0)
scaled_df.head()

userID,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movieID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-1.145287,0.0,0.0,0.0,0.0,-0.145287,0.0,0.0,-0.145287,0.0,...,0.854713,0.854713,-0.145287,0.0,-0.145287,0.0,0.0,0.0,0.0,0.854713
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,-2.021142,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,-0.733728,0.0,0.0,0.0,0.0,...,0.0,0.0,0.266272,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,-2.017007,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Load the unlabeled test set. The preview below shows a single IDs column
# whose values look like "<userID>_<movieID>".
test_no_labels = pd.read_csv("test_without_labels.csv")
test_no_labels.head()

Unnamed: 0,IDs
0,629_2683
1,601_904
2,5673_3717
3,5616_1721
4,1099_3697


In [8]:
# Split each "<userID>_<movieID>" string once and store the two halves as
# integer columns next to the original IDs column.
id_parts = test_no_labels["IDs"].str.split("_")
test_no_labels["userID"] = id_parts.str[0].astype(int)
test_no_labels["movieID"] = id_parts.str[1].astype(int)
test_no_labels.head()

Unnamed: 0,IDs,userID,movieID
0,629_2683,629,2683
1,601_904,601,904
2,5673_3717,5673,3717
3,5616_1721,5616,1721
4,1099_3697,1099,3697


In [9]:
# Working copy of the test set without the raw IDs string column; the
# prediction step only needs the parsed userID / movieID columns.
test_ii = test_no_labels.drop(columns="IDs")

In [10]:
K_NEIGHBORS = 50  # upper bound on the neighbouring movies averaged per prediction


def predict_item_item(pairs, ratings, centered, fallback, k=K_NEIGHBORS):
    """Predict a rating for every (userID, movieID) row of ``pairs``.

    A prediction is the target movie's mean rating plus the mean of the
    user's mean-centred ratings on the (at most ``k``) movies that the user
    has rated and that are nearest to the target movie in ``centered`` space.

    Parameters
    ----------
    pairs : pandas.DataFrame
        Must contain "userID" and "movieID" columns; other columns are ignored.
    ratings : pandas.DataFrame
        Movie x user rating matrix (index = movieID, columns = userID) with
        NaN where no rating exists.
    centered : pandas.DataFrame
        Same layout as ``ratings``, with each movie's mean subtracted and
        missing entries filled with 0.
    fallback : float
        Prediction used when the user or the movie does not occur in
        ``ratings`` or the user has no ratings at all (cold start).
    k : int, optional
        Maximum number of neighbouring movies to average over.

    Returns
    -------
    list of float
        Exactly one prediction per row of ``pairs``, in the same row order.
    """
    # Select the two columns by name so the result does not depend on column
    # order, and use positional row labels so results can be written back.
    pairs = pairs[["userID", "movieID"]].reset_index(drop=True)

    # Start from the fallback everywhere: every row is guaranteed a value,
    # so the output can never end up shorter than the test set.
    predictions = np.full(len(pairs), fallback, dtype=float)
    movie_means = ratings.mean(axis=1)

    # Movies never seen in training have no feature vector and no mean;
    # they keep the fallback value.
    scorable = pairs[pairs["movieID"].isin(ratings.index)]

    # Fit the neighbour model once per user instead of once per test row.
    for user_id, user_rows in scorable.groupby("userID", sort=False):
        if user_id not in ratings.columns:
            continue  # user never seen in training
        rated_movies = ratings.index[ratings[user_id].notna().to_numpy()]
        if len(rated_movies) == 0:
            continue  # user has no ratings to borrow from

        candidates = centered.loc[rated_movies]
        target_movies = user_rows["movieID"].to_numpy()

        nn = NearestNeighbors(
            n_neighbors=min(k, len(candidates)),
            algorithm="kd_tree",
        )
        nn.fit(candidates.to_numpy())
        # One row of neighbour positions (into ``candidates``) per target movie.
        neighbor_pos = nn.kneighbors(
            centered.loc[target_movies].to_numpy(),
            return_distance=False,
        )

        # The user's centred ratings on each target's neighbours, averaged.
        user_offsets = candidates[user_id].to_numpy()[neighbor_pos].mean(axis=1)
        predictions[user_rows.index.to_numpy()] = (
            movie_means.loc[target_movies].to_numpy() + user_offsets
        )

    return predictions.tolist()


pred_ratings_ii = predict_item_item(test_ii, pivot_ii, scaled_df, all_mean)

In [11]:
# Number of predictions produced; this must equal the number of test rows for
# the column assignment into test_no_labels further down to succeed.
len(pred_ratings_ii)

9999

In [12]:
# Clamp every prediction into the valid rating range [1, 5]. The slice
# assignment updates the existing list in place.
pred_ratings_ii[:] = [
    5 if rating > 5 else 1 if rating < 1 else rating
    for rating in pred_ratings_ii
]

In [13]:
# Attach the predictions, remove the helper columns parsed from IDs, and
# write the remaining columns (IDs, rating) as the submission file.
# The drop happens in place, so re-running this cell without first
# recreating userID / movieID raises a KeyError.
test_no_labels["rating"] = pred_ratings_ii
test_no_labels.drop(
    columns=["userID", "movieID"],
    inplace=True,
)
test_no_labels.to_csv("result_ii_2804.csv", index=False)