## 1.1 - Movielens full - EDA

---

In [2]:
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix

In [3]:
# Locations of the two CSV files, relative to the notebook's working directory.
movies_path = "../Data/ml-latest/movies.csv"
ratings_path = "../Data/ml-latest/ratings.csv"

# Read only the columns the analysis uses and pin compact dtypes up front so
# the large ratings table stays as small as possible in memory.
df_movies = pd.read_csv(
    movies_path,
    usecols=["movieId", "title"],
    dtype={"movieId": "int32", "title": "str"},
)
df_ratings = pd.read_csv(
    ratings_path,
    usecols=["userId", "movieId", "rating"],
    dtype={"userId": "int32", "movieId": "int32", "rating": "float32"},
)

### **a)** 

### Gör en EDA för att förstå datasetet. Inkludera olika slags plots. Begränsa dig inte till frågorna nedan, utan försök undersöka fler aspekter av datan.

---

In [4]:
# Schema check of the movies table: column dtypes, non-null counts, memory use.
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int32 
 1   title    58098 non-null  object
dtypes: int32(1), object(1)
memory usage: 681.0+ KB


In [5]:
# Schema check of the ratings table. The printed summary has no non-null count
# column; pandas leaves it out for very large frames unless
# info(show_counts=True) is requested.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27753444 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 317.6 MB


In [6]:
# Summary statistics of the numeric columns. For this table that is only
# movieId, an identifier, so the useful information is the id range (min/max).
df_movies.describe()

Unnamed: 0,movieId
count,58098.0
mean,111919.516197
std,59862.660956
min,1.0
25%,72437.75
50%,126549.0
75%,161449.5
max,193886.0


In [7]:
# Summary statistics of the ratings table. userId and movieId are identifiers,
# so the row of interest is `rating` (its range and its centre).
df_ratings.describe()

Unnamed: 0,userId,movieId,rating
count,27753440.0,27753440.0,27753440.0
mean,141942.0,18488.0,3.530446
std,81707.4,35102.63,1.066353
min,1.0,1.0,0.5
25%,71176.0,1097.0,3.0
50%,142022.0,2716.0,3.5
75%,212459.0,7150.0,4.0
max,283228.0,193886.0,5.0


### **b)** 
### Vilka är de 10 filmerna med flest ratings?

---

In [8]:
# Number of ratings per movie, as a Series indexed by movieId.
df_num_ratings = df_ratings.groupby(["movieId"], as_index=True).size()

# The ten most-rated movies. reset_index() on the unnamed Series yields the
# columns "movieId" and 0 (0 holds the rating count).
df_top10_num_ratings = (
    df_num_ratings.sort_values(axis=0, ascending=False).head(10).reset_index()
)

# Attach the titles with a single left join (which keeps the ranking order)
# instead of filtering df_movies once per printed row and calling int() on a
# one-element Series, which newer pandas versions deprecate.
top10_with_titles = df_top10_num_ratings.merge(
    df_movies[["movieId", "title"]], on="movieId", how="left"
)

for rank, (title, n_ratings) in enumerate(
    zip(top10_with_titles["title"], top10_with_titles[0]), start=1
):
    print(f"{rank}: {title}, Ratings: {n_ratings}")


1: Shawshank Redemption, The (1994), Ratings: 97999
2: Forrest Gump (1994), Ratings: 97040
3: Pulp Fiction (1994), Ratings: 92406
4: Silence of the Lambs, The (1991), Ratings: 87899
5: Matrix, The (1999), Ratings: 84545
6: Star Wars: Episode IV - A New Hope (1977), Ratings: 81815
7: Jurassic Park (1993), Ratings: 76451
8: Schindler's List (1993), Ratings: 71516
9: Braveheart (1995), Ratings: 68803
10: Toy Story (1995), Ratings: 68469


### **c)** 
### Beräkna den genomsnittliga ratingen för dessa 10 filmerna med flest ratings.

---

In [9]:
# Mean rating for every movie (not only the top ten, despite the name, which is
# kept because a later cell merges on this frame). Only `rating` is averaged;
# a mean of userId carries no meaning and would just cost time on this table.
df_top10_mean_ratings = df_ratings.groupby("movieId", as_index=False)["rating"].mean()

# Join titles and mean ratings onto the top-ten ranking. Left joins keep the
# ranking order, and the numeric value is formatted directly rather than being
# rendered to text and parsed back into a float.
top10_summary = (
    df_top10_num_ratings[["movieId"]]
    .merge(df_movies[["movieId", "title"]], on="movieId", how="left")
    .merge(df_top10_mean_ratings[["movieId", "rating"]], on="movieId", how="left")
)

for rank, (title, mean_rating) in enumerate(
    zip(top10_summary["title"], top10_summary["rating"]), start=1
):
    print(f"{rank}: {title}, Rating: {mean_rating:.1f}")

1: Shawshank Redemption, The (1994), Rating: 4.4
2: Forrest Gump (1994), Rating: 4.1
3: Pulp Fiction (1994), Rating: 4.2
4: Silence of the Lambs, The (1991), Rating: 4.2
5: Matrix, The (1999), Rating: 4.1
6: Star Wars: Episode IV - A New Hope (1977), Rating: 4.1
7: Jurassic Park (1993), Rating: 3.7
8: Schindler's List (1993), Rating: 4.3
9: Braveheart (1995), Rating: 4.0
10: Toy Story (1995), Rating: 3.9


### **d)**
### Gör en plot över årtal och antalet filmer representerade i datasetet.

---

In [10]:
# NOTE: this binds a second name to the same object, it does not copy, so the
# "year" column added below also appears on df_movies.
df_movies_year = df_movies

# First four-digit number in parentheses in the title, kept as text. Titles
# without such a group get NaN and are left out by the groupby below.
df_movies_year["year"] = df_movies_year["title"].str.extract(r'\((\d{4})\)')

# Number of movies for each extracted year.
df_movies_per_year = (
    df_movies_year
    .groupby("year")
    .size()
    .reset_index(name="count")
)

fig = px.bar(
    df_movies_per_year,
    x="year",
    y="count",
    title='Number of Movies per Year',
)
fig.show()

### **e)** 
### Gör en plot över antalet ratings mot movieId.
---

In [11]:
# df_num_ratings starts out as a Series (movieId -> number of ratings). Convert
# it to a frame with the columns "movieId" and "count" exactly once: the guard
# makes the cell safe to re-run, because DataFrame.reset_index() does not accept
# the `name` argument and a second unguarded run would raise TypeError.
if isinstance(df_num_ratings, pd.Series):
    df_num_ratings = df_num_ratings.reset_index(name="count")

In [12]:
# Histogram over movieId with the "count" column supplied as y, so each bar
# aggregates the rating counts of the movieIds that fall into its bin.
fig = px.histogram(
    df_num_ratings,
    x="movieId",
    y="count",
    title='Number of Ratings per Movie',
)
# Rendering is currently switched off; the figure is only constructed.
#fig.show()

### **f)** 
### Beräkna genomsnittliga ratings för de top 10 filmerna med flest ratings. Gör ett stapeldiagram över dessa.

---

In [13]:
# One row per movie with its rating count, mean rating and title, reduced to
# the ten movies with the most ratings. reset_index() keeps the pre-sort row
# labels as an extra "index" column.
df_mean_count_ratings = (
    df_num_ratings
    .merge(df_top10_mean_ratings, on="movieId", how="outer")
    .merge(df_movies, on="movieId", how="outer")
    .sort_values(by="count", axis=0, ascending=False)
    .head(10)
    .reset_index()
)

In [14]:
# Bar chart: one bar per top-ten movie, bar height = mean rating.
fig = px.bar(
    df_mean_count_ratings,
    x="title",
    y="rating",
    title="Mean Rating of top 10 most rated Movies",
)
fig.show()

## 1.2 Skapa gles matris

---

In [15]:
#movies_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0)
#mat_movies_users = csr_matrix(movies_users.values)
#movies_users

In [16]:
# Look at the long-format ratings table (userId, movieId, rating) before it is
# filtered; pandas truncates the display to the first and last rows.
df_ratings

Unnamed: 0,userId,movieId,rating
0,1,307,3.5
1,1,481,3.5
2,1,1091,1.5
3,1,1257,4.5
4,1,1449,4.5
...,...,...,...
27753439,283228,8542,4.5
27753440,283228,8712,4.5
27753441,283228,34405,4.5
27753442,283228,44761,4.5


In [17]:
# Rows to discard before building the movie x user matrix:
#   * movieId strictly between 10000 and 24000
#   * movieId strictly between 28000 and 30000
#   * movieId above 120000
#   * rating below 1
# NOTE(review): the reason for these particular cut-offs is not recorded in the
# notebook -- presumably they shrink the matrix; confirm and document.
movie_ids = df_ratings["movieId"]
rows_to_drop = (
    ((movie_ids > 10000) & (movie_ids < 24000))
    | ((movie_ids > 28000) & (movie_ids < 30000))
    | (movie_ids > 120000)
    | (df_ratings["rating"] < 1)
)

# One combined mask and a single drop remove exactly the rows that the four
# consecutive drops removed, with one pass over the table instead of four.
# The drop stays in place because later cells keep mutating df_ratings itself.
df_ratings.drop(index=df_ratings.index[rows_to_drop.to_numpy()], inplace=True)

In [18]:
# Check how many rows are left after the filtering and what the frame now
# costs in memory.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26608398 entries, 0 to 27753443
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 507.5 MB


In [19]:
# Sorted copy of the filtered ratings, highest movieId first, used below to
# inspect which ids are left at the upper end. df_ratings itself is unchanged.
d = df_ratings.sort_values(by="movieId", axis=0, ascending=False)

In [20]:
# Rows with the largest movieId that survived the filtering.
d.head()

Unnamed: 0,userId,movieId,rating
26354120,269098,119977,2.0
12013687,123100,119977,1.0
15333113,156633,119977,1.5
24322254,248694,119977,2.5
2275857,23415,119977,1.0


In [21]:
# Both ends of the sorted table (largest and smallest movieId); pandas
# truncates the middle of the display.
d

Unnamed: 0,userId,movieId,rating
26354120,269098,119977,2.0
12013687,123100,119977,1.0
15333113,156633,119977,1.5
24322254,248694,119977,2.5
2275857,23415,119977,1.0
...,...,...,...
24622758,251691,1,4.0
11162757,114703,1,4.0
7732729,79664,1,4.0
241569,2348,1,4.5


In [22]:
# Number of ratings each user has left in the filtered table, as a frame with
# the columns "userId" and "count".
rating_per_user = (
    df_ratings[["userId", "rating"]]
    .groupby("userId")
    .size()
    .reset_index(name="count")
)



In [23]:
# Reorder df_ratings by ascending movieId (in place, so df_ratings itself is
# changed), then reshape it to a wide table: one row per movieId, one column
# per userId, cell = rating, with 0 where a user has not rated a movie.
# NOTE(review): the result is a dense table. pandas reports about 7.3 billion
# cells for it, i.e. roughly 29 GB at 4 bytes per float32 cell, almost all of
# them zeros -- confirm the machine can hold it before running this cell.
df_ratings.sort_values(by="movieId", axis= 0, ascending= True, inplace= True)
movies_users = df_ratings.pivot(index="movieId", columns="userId", values="rating").fillna(0)


The following operation may generate 7326475804 cells in the resulting pandas object.



In [24]:
# Show the corners of the movie x user table (rows are movieId, columns are
# userId); pandas truncates both axes in the display.
movies_users

userId,1,2,3,4,5,6,7,8,9,10,...,283219,283220,283221,283222,283223,283224,283225,283226,283227,283228
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,5.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5
2,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119964,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119973,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Re-check df_ratings after the in-place sort: same rows and dtypes, but the
# row labels are now in movieId order rather than the original file order.
df_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26608398 entries, 24999026 to 19769050
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int32  
 1   movieId  int32  
 2   rating   float32
dtypes: float32(1), int32(2)
memory usage: 507.5 MB


In [26]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from fuzzywuzzy import process

# Build the sparse movie x user matrix straight from the long-format ratings
# instead of scanning the multi-billion-cell dense pivot for non-zeros.
# factorize(sort=True) numbers movies and users in ascending id order, which is
# the same row/column order the pivot on (movieId, userId) produces, so the
# resulting matrix has the same layout and values.
# csr_matrix adds up duplicate (row, column) entries, so this relies on every
# user having rated a given movie at most once (the pivot requires that too).
movie_codes, matrix_movie_ids = pd.factorize(df_ratings["movieId"], sort=True)
user_codes, matrix_user_ids = pd.factorize(df_ratings["userId"], sort=True)

mat_movies_users = csr_matrix(
    (df_ratings["rating"].to_numpy(), (movie_codes, user_codes)),
    shape=(len(matrix_movie_ids), len(matrix_user_ids)),
)

# Brute-force nearest neighbours on cosine distance between the movies'
# rating vectors (one matrix row per movie).
model_KNN = NearestNeighbors(metric="cosine", algorithm="brute", n_neighbors=20)

model_KNN.fit(mat_movies_users)





In [43]:
def recommender(movie_name, data, model, n_recommendations, movie_ids=None):
    """Print the movies whose rating vectors are nearest to a fuzzy-matched title.

    The title is matched only against movies that have a row in ``data``, and
    rows are translated to titles through their movieId. Using the row label
    of ``df_movies`` as a matrix row number is only correct while the two
    orderings happen to coincide: ``df_movies`` also lists movies that have no
    row in the matrix.

    Parameters
    ----------
    movie_name : str
        Free-text title to look for.
    data : scipy.sparse matrix
        Movie x user rating matrix, one row per movie.
    model : estimator exposing ``fit`` and ``kneighbors``
        Fitted on ``data`` on every call.
    n_recommendations : int
        Number of neighbours requested from the model. The selected movie's
        own row, when it is among them, is printed as NaN.
    movie_ids : sequence of int, optional
        The movieId of every row of ``data``, in row order. Defaults to the
        ascending unique movieIds of the global ``df_ratings``, which is the
        row order of a pivot of ``df_ratings`` on movieId.

    Raises
    ------
    ValueError
        If ``movie_ids`` does not have exactly one entry per row of ``data``,
        or if no movie with a known title has a row in ``data``.
    """
    if movie_ids is None:
        movie_ids = df_ratings["movieId"].drop_duplicates().sort_values()
    movie_ids = pd.Index(movie_ids)
    if len(movie_ids) != data.shape[0]:
        raise ValueError(
            f"movie_ids has {len(movie_ids)} entries but data has {data.shape[0]} rows"
        )

    # Title for every matrix row: label = row number in `data`, value = title.
    # Rows whose movieId has no title in df_movies are left out of the search.
    title_by_movie_id = df_movies.drop_duplicates("movieId").set_index("movieId")["title"]
    row_titles = title_by_movie_id.reindex(movie_ids).reset_index(drop=True).dropna()

    # For a Series, extractOne returns (title, score, label); the label is the
    # matrix row because of how row_titles is indexed.
    match = process.extractOne(movie_name, row_titles)
    if match is None:
        raise ValueError("no movie with a known title has a row in data")
    selected_title, _score, index = match

    print(f"Movie Selected: {selected_title}, Index: {index}")
    print("Serching for recommendations....")

    model.fit(data)
    indices = model.kneighbors(
        data[index], n_neighbors=n_recommendations, return_distance=False
    )

    # One query row was passed, so `indices` holds a single array of row numbers.
    for neighbour_rows in indices:
        print(row_titles.reindex(neighbour_rows).where(neighbour_rows != index))


recommender("Alien", mat_movies_users, model_KNN, 20)

Movie Selected: Aliens (1986), Index: 1175
Serching for recommendations....
1175                                                  NaN
1188                                         Alien (1979)
1212                               Terminator, The (1984)
537                                   Blade Runner (1982)
1171    Star Wars: Episode V - The Empire Strikes Back...
1017                                      Die Hard (1988)
583                     Terminator 2: Judgment Day (1991)
1263            Indiana Jones and the Last Crusade (1989)
1173    Raiders of the Lost Ark (Indiana Jones and the...
2831                                  Total Recall (1990)
1184    Star Wars: Episode VI - Return of the Jedi (1983)
1104                                    Abyss, The (1989)
3438                                      Predator (1987)
1242                            Back to the Future (1985)
1356                                          Jaws (1975)
257             Star Wars: Episode IV - A New Hope (19

In [35]:
# Look up a movie by its movieId value. The row label shown next to it (89) is
# not the same number as the movieId (90).
df_movies[df_movies["movieId"] == 90]

Unnamed: 0,movieId,title,year
89,90,The Journey of August King (1995),1995


In [37]:
# Look up a movie by row position instead. Position 90 holds movieId 92, so a
# row position in df_movies must not be used as if it were a movieId.
df_movies.iloc[90]

movieId                    92
title      Mary Reilly (1996)
year                     1996
Name: 90, dtype: object

In [38]:
# All ratings given to movieId 92 (the movie found at row position 90 above).
df_ratings[df_ratings["movieId"] == 92]

Unnamed: 0,userId,movieId,rating
8475456,87367,92,3.0
18427786,188017,92,4.0
15856515,161925,92,3.0
12074833,123636,92,2.0
22706831,232094,92,3.0
...,...,...,...
2138229,21946,92,1.0
4320473,44388,92,3.0
743162,7565,92,3.0
17729692,181293,92,2.0


In [32]:
# Spot check of the extracted "year" column. The comparison value is the string
# "2001" because the regex extraction stores the year as text, not a number.
df_movies_year[df_movies_year["year"] == "2001"]

Unnamed: 0,movieId,title,year
3959,4052,Antitrust (2001),2001
3960,4053,Double Take (2001),2001
3961,4054,Save the Last Dance (2001),2001
3963,4056,"Pledge, The (2001)",2001
3975,4068,Sugar & Spice (2001),2001
...,...,...,...
57670,192819,The Bare Wench Project 2: Scared Topless (2001),2001
57763,193061,Zu Warriors (2001),2001
57796,193179,Making the Connection: Untold Stories of 'The ...,2001
57806,193199,Shark Hunter (2001),2001
