In [71]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("grouplens/movielens-20m-dataset")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1


In [72]:
import os
import pandas as pd

path = '/root/.cache/kagglehub/datasets/grouplens/movielens-20m-dataset/versions/1'

# Load data
movies = pd.read_csv(f'{path}/movie.csv')
ratings = pd.read_csv(f'{path}/rating.csv')
genome_scores = pd.read_csv(f'{path}/genome_scores.csv')
genome_tags = pd.read_csv(f'{path}/genome_tags.csv')
tags = pd.read_csv(f'{path}/tag.csv')

##2. Grouping Movies Together!

###2.1 Feature Engineering

The dataset provided isn’t particularly clean or well-structured to represent the features of the movies. Therefore, the first step is to create a more suitable set of attributes (variables, features, covariates) to represent the movies based on the available information. Below are the variables or features that will be created for clustering:

1. `movieid` A unique identifier for each movie.  
2. `genres` A list of genres associated with the movie.
3. `ratings_avg` The average rating provided by users for the movie, calculated from the dataset.  
4. `relevant_genome_tag` The most relevant tag assigned to the movie based on the genome dataset.  
5. `common_user_tag` The most frequently assigned user tag for the movie.  
6. `year` The year of realese.
7. `ratings_count` The totale number of ratings for the movie, representing its popularity.
8. `tag_count` The total number of tag for the movie.



In [73]:
#created a one-hot encoded DataFrame for the genres column, splitting values by the | delimiter.
genres = movies['genres'].str.get_dummies('|')

In [None]:
genres

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27273,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27274,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27275,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
27276,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [74]:
# Calculate average rating, count of ratings, and rating std
movie_ratings = ratings.groupby('movieId').agg(
    ratings_avg=('rating', 'mean'),
    ratings_count=('rating', 'count')
).reset_index()


In [75]:
# genome tags
relevant_genome_tag = (
    #create a group for each movie with all the tags ID
    genome_scores.groupby('movieId')
    #Order the tags ID by relevance score for each movie and took the most relevant.
    .apply(lambda df: df.sort_values('relevance', ascending=False).iloc[0])
    .reset_index(drop=True)
)
#Add the corresponding tag name to each tag ID by merging with the genome_tags DataFrame
relevant_genome_tag = relevant_genome_tag.merge(genome_tags, on='tagId')


  .apply(lambda df: df.sort_values('relevance', ascending=False).iloc[0])


In [None]:
relevant_genome_tag

Unnamed: 0,movieId,tagId,relevance,tag
0,1.0,1036.0,0.99925,toys
1,2.0,29.0,0.98100,adventure
2,3.0,451.0,0.97450,good sequel
3,4.0,1116.0,0.97675,women
4,5.0,451.0,0.96575,good sequel
...,...,...,...,...
10376,130578.0,82.0,0.88325,assassination
10377,130840.0,863.0,0.96500,romance
10378,131013.0,230.0,0.98425,comedy
10379,131168.0,128.0,0.97300,betrayal


In [None]:
# User tags
#create a group for each movie with all the user tags, calculate the mode of the tags that returns the most common value
common_user_tag = tags.groupby('movieId')['tag'].agg(lambda x: x.mode()[0]).reset_index()


In [None]:
common_user_tag

Unnamed: 0,movieId,tag
0,1,Pixar
1,2,Robin Williams
2,3,moldy
3,4,characters
4,5,steve martin
...,...,...
19540,131054,dinosaurs
19541,131082,Yoshitomo Nara
19542,131164,Vietnam War
19543,131170,alternate reality


In [76]:
tag_count = tags.groupby('movieId').agg(
    tag_count=('tag', 'count')
).reset_index()

In [None]:
tag_count

Unnamed: 0,movieId,tag_count
0,1,436
1,2,123
2,3,18
3,4,6
4,5,22
...,...,...
19540,131054,1
19541,131082,2
19542,131164,1
19543,131170,1


In [77]:
# Extract the year from the movie title
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)').astype(float)

In [78]:
movies_features = movies.merge(movie_ratings, on='movieId', how='left') \
    .merge(relevant_genome_tag[['movieId', 'tag']], on='movieId', how='left') \
    .rename(columns={'tag': 'relevant_genome_tag'}) \
    .join(genres, how='left') \
    .merge(common_user_tag.rename(columns={'tag': 'common_user_tag'}), on='movieId', how='left') \
    .merge(tag_count, on='movieId', how='left')


In [None]:
print(movies_features.head())

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres    year  ratings_avg  \
0  Adventure|Animation|Children|Comedy|Fantasy  1995.0     3.921240   
1                   Adventure|Children|Fantasy  1995.0     3.211977   
2                               Comedy|Romance  1995.0     3.151040   
3                         Comedy|Drama|Romance  1995.0     2.861393   
4                                       Comedy  1995.0     3.064592   

   ratings_count relevant_genome_tag  (no genres listed)  Action  Adventure  \
0        49695.0                toys                   0       0          1   
1        22243.0           adventure                   0       0          1   
2        12735.0         good seque

In [None]:
movies_features.isna().sum()

Unnamed: 0,0
movieId,0
ratings_avg,0
ratings_count,0
common_user_tag,0
tag_count,0
year,0
(no genres listed),0
Action,0
Adventure,0
Animation,0


In [None]:
movies_features.shape

(27278, 29)

dato che i dati sono inconcludenti per il relevant tag preferisco non usarlo come feature.
per il rating quelli senza rating ho messo 0 nell'avg perche lo 0 non era un valore possibile da mettere non lo confondo con un altro valore. Nel count ho messo 0.
il common lo risolvo con un valore tipo 'no tag' che mi creerà un cluster artificiale

In [83]:
movies_features['ratings_count'] = movies_features['ratings_count'].fillna(0)
movies_features['ratings_avg'] = movies_features['ratings_avg'].fillna(0)
movies_features['common_user_tag'] = movies_features['common_user_tag'].fillna('ciccia')
movies_features['tag_count'] = movies_features['tag_count'].fillna(0)
movies_features['year'] = movies_features['year'].fillna(0)

In [84]:
# Get the names of the binary genre columns
genre_columns = genres.columns.tolist()

# Select the additional features you want to include in the final dataset
additional_features = ['movieId', 'ratings_avg', 'ratings_count', 'common_user_tag', 'tag_count', 'year']

# Combine the genre columns with the additional features into a new DataFrame
movies_features = movies_features[additional_features + genre_columns]

# Display the first few rows of the new DataFrame
print(movies_features.head())


   movieId  ratings_avg  ratings_count common_user_tag  tag_count    year  \
0        1     3.921240        49695.0           Pixar      436.0  1995.0   
1        2     3.211977        22243.0  Robin Williams      123.0  1995.0   
2        3     3.151040        12735.0           moldy       18.0  1995.0   
3        4     2.861393         2756.0      characters        6.0  1995.0   
4        5     3.064592        12161.0    steve martin       22.0  1995.0   

   (no genres listed)  Action  Adventure  Animation  ...  Film-Noir  Horror  \
0                   0       0          1          1  ...          0       0   
1                   0       0          1          0  ...          0       0   
2                   0       0          0          0  ...          0       0   
3                   0       0          0          0  ...          0       0   
4                   0       0          0          0  ...          0       0   

   IMAX  Musical  Mystery  Romance  Sci-Fi  Thriller  War  Wes

####**Question**:
 If you have accurately identified and applied the methods for representing the features, you should have more than eight features! How could this happen? Take a moment to think about it.
####**Answer**:

commento sul numero feature maggiore di 8 perchè per ogni genere c'è una colonna. alcune variabili sono categorici e non descrittivi quindi devo renderle descrittive es dummies

###2.2 Choose your features (variables)!

###**Questions**:
1. What is the importance of normalizing the data in your analysis, and how does it impact the effectiveness of the clustering algorithms you plan to use?
2. If you find that normalizing the values is beneficial, please proceed to normalize the data. To simplify this task, refer to the [scikit-learn](https://scikit-learn.org/stable/modules/preprocessing.html) package for tools and functions that facilitate data normalization.
3. Could you provide some insights on dimensionality reduction? What techniques would be effective for reducing the number of features in the dataset, and why might this be beneficial for the analysis?
4. If you believe dimensionality reduction would be advantageous, please select a method to reduce the dimensionality of the data.


###**Answer:**
1. Normalisation is very important when you have variables with different scale in your dataset. Distance metrics (e.g. Euclidean distance in K-means clustering) on which Clustering algorithms like K-means, depend are skewed by features whose range is larger than others allowing the former to dominate the latter in results. The objective of data normalization is to bring all the features to the same level and thus preventing some of the features to dominate others in clustering.
2. In the following code we will normalize the data:

In [85]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
import pandas as pd

# Define the columns to be normalized
standard_features = ['year']  # StandardScaler for 'year'
minmax_features = ['ratings_avg', 'ratings_count', 'tag_count']  # MinMaxScaler for ratings-related columns

# Create the ColumnTransformer to apply different scalers to specific columns
preprocessor = ColumnTransformer(
    transformers=[
        ('standard', StandardScaler(), standard_features),  # Apply StandardScaler to 'release_year'
        ('minmax', MinMaxScaler(), minmax_features)          # Apply MinMaxScaler to the other numerical columns
    ],
    remainder='passthrough'  # Keep other columns unchanged
)

# Apply transformations to the dataset
normalized_data = preprocessor.fit_transform(movies_features)

# Get the column names (after transformation, preserving original and new columns)
columns = standard_features + minmax_features + [col for col in movies_features.columns if col not in (standard_features + minmax_features)]

# Create a new DataFrame with the transformed data
movies_features_normalized = pd.DataFrame(normalized_data, columns=columns)

# Combine the genre columns with the additional features into a new DataFrame
movie_df = movies_features_normalized[additional_features + genre_columns]

# Display the first few rows of the new DataFrame
print(movie_df.head())



  movieId ratings_avg ratings_count common_user_tag tag_count      year  \
0       1    0.784248        0.7383           Pixar  0.218656  0.118376   
1       2    0.642395      0.330456  Robin Williams  0.061685  0.118376   
2       3    0.630208      0.189199           moldy  0.009027  0.118376   
3       4    0.572279      0.040945      characters  0.003009  0.118376   
4       5    0.612918      0.180672    steve martin  0.011033  0.118376   

  (no genres listed) Action Adventure Animation  ... Film-Noir Horror IMAX  \
0                  0      0         1         1  ...         0      0    0   
1                  0      0         1         0  ...         0      0    0   
2                  0      0         0         0  ...         0      0    0   
3                  0      0         0         0  ...         0      0    0   
4                  0      0         0         0  ...         0      0    0   

  Musical Mystery Romance Sci-Fi Thriller War Western  
0       0       0       

3. Dimensionality reduction simplifies the dataset by reducing the number of features while preserving as much variance as possible. This is crucial because:
  * Introduction of High-dimensional data could cause overfitting, leading to decreased clustering algorithm effectiveness.
  * Reducing the dimensions improves the interpretability of the results and speeds up the calculations.

  Effective techniques include:
 PCA (Principal Component Analysis)
 It projects data onto a smaller set of components, capturing more variance and it's good when features are related.

4. Here's how to apply PCA:



Embedding a column like common_user_tag (a categorical feature) into a numerical representation for PCA

In [100]:
from gensim.models import Word2Vec
import pandas as pd

# Handle missing values
movie_df['common_user_tag'] = movie_df['common_user_tag'].fillna('no_value')

# Tokenize tags
tag_sentences = movie_df['common_user_tag'].str.split().tolist()

# Train Word2Vec model
embedding_dim = 30
model = Word2Vec(tag_sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# Generate embeddings
movie_df['common_tag_embedding'] = movie_df['common_user_tag'].apply(
    lambda x: model.wv[x.split()[0]] if isinstance(x, str) and x.split()[0] in model.wv else [0] * embedding_dim
)

# Drop existing embedding columns if they already exist
embedding_columns = [f'common_tag_emb_{i}' for i in range(embedding_dim)]
movie_df.drop(columns=embedding_columns, errors='ignore', inplace=True)

# Expand embeddings into separate columns
embedding_df = pd.DataFrame(movie_df['common_tag_embedding'].tolist(), columns=embedding_columns)

# Merge embeddings into the main DataFrame (only once)
movie_df = pd.concat([movie_df.reset_index(drop=True), embedding_df.reset_index(drop=True)], axis=1)

# Validate
print(movie_df[embedding_columns].isnull().sum())  # Ensure no NaN values


common_tag_emb_0     0
common_tag_emb_1     0
common_tag_emb_2     0
common_tag_emb_3     0
common_tag_emb_4     0
common_tag_emb_5     0
common_tag_emb_6     0
common_tag_emb_7     0
common_tag_emb_8     0
common_tag_emb_9     0
common_tag_emb_10    0
common_tag_emb_11    0
common_tag_emb_12    0
common_tag_emb_13    0
common_tag_emb_14    0
common_tag_emb_15    0
common_tag_emb_16    0
common_tag_emb_17    0
common_tag_emb_18    0
common_tag_emb_19    0
common_tag_emb_20    0
common_tag_emb_21    0
common_tag_emb_22    0
common_tag_emb_23    0
common_tag_emb_24    0
common_tag_emb_25    0
common_tag_emb_26    0
common_tag_emb_27    0
common_tag_emb_28    0
common_tag_emb_29    0
dtype: int64


In [92]:
movie_df.isna().sum()

Unnamed: 0,0
movieId,0
ratings_avg,0
ratings_count,0
common_user_tag,0
tag_count,0
...,...
common_tag_emb_25,0
common_tag_emb_26,0
common_tag_emb_27,0
common_tag_emb_28,0


In [104]:
from sklearn.decomposition import PCA

# Define the features for PCA
pca_features = ['ratings_avg', 'ratings_count', 'tag_count', 'year'] + genre_columns + embedding_columns


# Apply PCA to reduce dimensionality
pca = PCA(n_components=2)
pca_data = pca.fit_transform(movie_df[pca_features])

# Create a DataFrame for the PCA-transformed data
pca_df = pd.DataFrame(pca_data, columns=[f'PC{i+1}' for i in range(2)])

# Add non-PCA columns back to the DataFrame for reference
pca_df['movieId'] = movie_df['movieId']

# Reorder the columns to make 'movieId' the first column
columns = ['movieId'] + [col for col in pca_df.columns if col != 'movieId']
pca_df = pca_df[columns]

# Display the explained variance ratio to evaluate PCA effectiveness
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
print("Cumulative Explained Variance:", sum(pca.explained_variance_ratio_))

# Display the first few rows of the reduced DataFrame
print(pca_df.head())


Explained Variance Ratio: [0.36645357 0.11059574]
Cumulative Explained Variance: 0.47704930290277375
  movieId       PC1       PC2
0       1  0.129599 -0.986710
1       2  0.119785 -0.406732
2       3  0.108251 -0.722260
3       4  0.106095  0.109536
4       5  0.118393 -0.759064


##2.3 Clustering

