In [None]:
import pandas as pd

# Read the two raw CSV exports into DataFrames.
movies = pd.read_csv('/content/movies (1).csv')
ratings = pd.read_csv('/content/ratings (1).csv')

# Preview each input (heading line followed by the first five rows).
print("Movies dataset:", movies.head(), sep="\n")
print("\nRatings dataset:", ratings.head(), sep="\n")

# Join ratings onto movies via the shared `movieId` key (pandas' default join type).
merged_data = movies.merge(ratings, on='movieId')

# Preview the joined result.
print("\nMerged dataset:", merged_data.head(), sep="\n")

# Persist the joined table (without the index column) for the later Spark cell.
merged_data.to_csv('merged_data.csv', index=False)


Movies dataset:
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   Adventure|Children|Fantasy  
2                               Comedy|Romance  
3                         Comedy|Drama|Romance  
4                                       Comedy  

Ratings dataset:
   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931

Merged dataset:
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation

In [None]:
!pip install pyspark



from pyspark.sql import SparkSession
from pyspark.ml.feature import CountVectorizer, VectorAssembler
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import split, size, udf
from pyspark.sql.types import DoubleType

# Create (or reuse) the Spark session used by the rest of this cell.
spark = SparkSession.builder.appName("CosineSimilarity").getOrCreate()

# Load the merged CSV. The file is presumably the one produced by pandas'
# `to_csv`, which escapes a double quote inside a quoted field by doubling it
# (""). Spark's CSV reader expects a backslash escape by default, so
# escape='"' is required for such fields to be parsed as a single value.
df = spark.read.csv(
    '/content/merged_data.csv',
    header=True,
    inferSchema=True,
    escape='"',
)

# Expected columns in `df`: movieId, title, genres, userId, rating, timestamp.
# Only the three columns used below are kept.
selected_columns = df.select("title", "genres", "userId")

# Tokenise 'title' on single spaces and 'genres' on the literal '|' separator
# (escaped because `split` takes a regular expression).
selected_columns = selected_columns.withColumn("title_array", split("title", " "))
selected_columns = selected_columns.withColumn("genres_array", split("genres", "\\|"))

# Define a function to calculate cosine similarity between two vectors
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two vectors.

    Args:
        v1: Vector-like object exposing ``dot(other)`` and ``norm(p)``.
        v2: Vector-like object exposing ``dot(other)`` and ``norm(p)``.

    Returns:
        ``dot(v1, v2) / (||v1||_2 * ||v2||_2)`` as a float; ``0.0`` when
        either vector has zero L2 norm (the similarity is undefined there);
        ``None`` when either input is ``None`` (a null value in the column).
    """
    # A null column value reaches a Python UDF as None; propagate it as null.
    if v1 is None or v2 is None:
        return None

    dot_product = float(v1.dot(v2))
    norm_v1 = float(v1.norm(2))
    norm_v2 = float(v2.norm(2))

    # Guard BEFORE dividing: evaluating the quotient first raises
    # ZeroDivisionError for a zero vector, so the fallback was never reached.
    if norm_v1 == 0 or norm_v2 == 0:
        return 0.0
    return dot_product / (norm_v1 * norm_v2)

# Wrap the Python helper as a Spark UDF whose result column is a double.
cosine_similarity_udf = udf(cosine_similarity, DoubleType())

# Fit a bag-of-words CountVectorizer on the title tokens and add "title_vector".
cv_title = CountVectorizer(inputCol="title_array", outputCol="title_vector")
cv_model_title = cv_title.fit(selected_columns)
selected_columns = cv_model_title.transform(selected_columns)

# Fit a separate CountVectorizer on the genre tokens and add "genres_vector".
cv_genres = CountVectorizer(inputCol="genres_array", outputCol="genres_vector")
cv_model_genres = cv_genres.fit(selected_columns)
selected_columns = cv_model_genres.transform(selected_columns)

# Cast userId to double. The column is referenced through the DataFrame
# because `col` is not among the names imported from pyspark.sql.functions,
# so calling `col("userId")` raised NameError.
selected_columns = selected_columns.withColumn(
    "userId_vector", selected_columns["userId"].cast(DoubleType())
)

# NOTE(review): despite its name, "userId_vector" holds a scalar double, while
# `cosine_similarity` calls `.dot(...)` and `.norm(2)` on both arguments; the
# two userId comparisons below are therefore expected to fail once an action
# (`show`) evaluates them -- TODO decide what vector userId should map to.
# NOTE(review): "title_vector" and "genres_vector" come from two independently
# fitted vocabularies, so their lengths and index meanings presumably differ;
# confirm that comparing them is intended and that `dot` accepts the sizes.

# Calculate cosine similarity between 'title_vector' and 'genres_vector', and 'userId_vector'
selected_columns = selected_columns.withColumn(
    "cosine_sim_title_genres",
    cosine_similarity_udf("title_vector", "genres_vector")
)

selected_columns = selected_columns.withColumn(
    "cosine_sim_title_userId",
    cosine_similarity_udf("title_vector", "userId_vector")
)

selected_columns = selected_columns.withColumn(
    "cosine_sim_genres_userId",
    cosine_similarity_udf("genres_vector", "userId_vector")
)

# Show the DataFrame with cosine similarity values
selected_columns.select(
    "title",
    "genres",
    "userId",
    "cosine_sim_title_genres",
    "cosine_sim_title_userId",
    "cosine_sim_genres_userId",
).show()
