# Prepare for data import
- Determine how the CSV file will be structured  
- Determine whether the data is normalized or denormalized  
- Ensure IDs to be used in the data are unique  
- Ensure data in CSV files is "clean"
- Execute Cypher code to inspect the data
- Determine if data needs to be transformed

# Load data

In [1]:
import pandas as pd

# Read the three MovieLens tables in one pass; the order of the paths matches
# the order of the names on the left-hand side.
movies, ratings, tags = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/ml-latest/movies.csv",
        "dataset/ml-latest/ratings.csv",
        "dataset/ml-latest/tags.csv",
    )
)

# Get info from data

In [2]:
# Print the shape followed by the column/dtype summary of every loaded table,
# in the order movies -> ratings -> tags.
for table_label, table in (("Movies", movies), ("Ratings", ratings), ("Tags", tags)):
    print(f"\n{table_label} size is {table.shape}")
    table.info()

# To do preprocessing


Movies size is (86537, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86537 entries, 0 to 86536
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  86537 non-null  int64 
 1   title    86537 non-null  object
 2   genres   86537 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.0+ MB

Ratings size is (33832162, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33832162 entries, 0 to 33832161
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 1.0 GB

Tags size is (2328315, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2328315 entries, 0 to 2328314
Data columns (total 4 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   userId     int64 
 1   movieId    int64 
 2   tag        object
 3   timestamp  int64 
dtypes: int64(

### Check Movies

TODO:
- Check if ids are unique
- Check whether genres need to be split and whether any movie has no genres
- Check title --> there is year information

In [3]:
# IDs are unique when the number of distinct movieId values equals the row count.
distinct_id_count = movies["movieId"].nunique()
row_count = movies.shape[0]
ids_are_unique = distinct_id_count == row_count

print("Movies")
print(f"Unique movieId? {ids_are_unique}")
# Peek at the raw genres column to see how multiple genres are encoded.
print(movies["genres"].head(10))

Movies
Unique movieId? True
0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
5                          Action|Crime|Thriller
6                                 Comedy|Romance
7                             Adventure|Children
8                                         Action
9                      Action|Adventure|Thriller
Name: genres, dtype: object


In [4]:
# Check ratings
# Use vectorised comparisons reduced with any()/all(): this avoids materialising
# a filtered copy of the ~33M-row frame just to read its size, and avoids one
# Python-level call per rating for the integer test.
rating_values = ratings["rating"]
print(f"Are there any negative ratings? {(rating_values < 0).any()}")
print(f"Are there any voting above 5? {(rating_values > 5).any()}")
# A float is integral exactly when its remainder modulo 1 is zero
# (NaN and infinities compare unequal to zero, so they count as non-integer).
print(f"Are ratings all integer? {(rating_values % 1 == 0).all()}")

Are there any negative ratings? False
Are there any voting above 5? False
Are ratings all integer? False


# Process data

## Create users list
We have no missing values.  
The data is normalized and the CSV structure is OK.  
The dataset contains no user table (for privacy reasons), so we build one from the ratings and tags after a few checks.  

In [5]:
# Count activity per user once and reuse the result for both the printed
# report and the users table (value_counts over the ratings is expensive).
rating_counts = ratings["userId"].value_counts()
tag_counts = tags["userId"].value_counts()

print(f"Users that do ratings: {rating_counts.shape}")
print(f"Users that do tags: {tag_counts.shape}")
# Negate element-wise first, then reduce: "is there any tagging user that never
# rated?". The parentheses matter - without them `~` is applied to the scalar
# result of the reduction, which is only correct for numpy booleans.
print(f"Users that did only tags: {(~tags['userId'].isin(ratings['userId'])).any()}")

# Name the key and count columns explicitly instead of renaming by position,
# so the result does not depend on how value_counts() labels its output.
users = (
    rating_counts.rename_axis("userId")
    .reset_index(name="ratings")
    .merge(
        tag_counts.rename_axis("userId").reset_index(name="tags"),
        on="userId",
        how="outer",
    )
    # A user missing from one side has made zero ratings / zero tags.
    .fillna(0)
    # The outer join turns a count column into float wherever it introduced
    # NaN; cast both back to int.
    .astype({"ratings": int, "tags": int})
)
print(f"So total users is {users['userId'].shape}")
users.head()

Users that do ratings: (330975,)
Users that do tags: (25280,)
Users that did only tags: False
So total users is (330975,)


Unnamed: 0,userId,ratings,tags
0,1,62,0
1,2,91,0
2,3,30,0
3,4,30,0
4,5,43,0


## Add year info

In [6]:
# Capture the first four-digit number wrapped in parentheses in each title,
# e.g. "Toy Story (1995)" -> "1995". Titles without such a group get NaN.
year_pattern = r"\((\d{4})\)"
movies["year"] = movies["title"].str.extract(year_pattern, expand=False)

## Add genre nodes

In [7]:
# Genre vocabulary, taken from the dataset readme file. The 1-based position
# of a name in this list is used as its genreId.
genre_names = [
    "Action",
    "Adventure",
    "Animation",
    "Children",
    "Comedy",
    "Crime",
    "Documentary",
    "Drama",
    "Fantasy",
    "Film-Noir",
    "Horror",
    "IMAX",
    "Musical",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Thriller",
    "War",
    "Western",
    "(no genres listed)",
]
genre_ids = {name: position for position, name in enumerate(genre_names, start=1)}

# Create links from movies to genres: split the pipe-separated string and
# produce one row per (movie, genre) pair.
movies_genres = (
    movies[["movieId", "genres"]]
    .assign(genres=lambda frame: frame["genres"].str.split("|"))
    .explode("genres")
)

# Fail loudly on a genre that is not in the readme list rather than silently
# writing a missing genreId.
unknown_genres = set(movies_genres["genres"].unique()) - set(genre_ids)
if unknown_genres:
    raise ValueError(
        f"Genres missing from the readme list: {sorted(map(str, unknown_genres))}"
    )

movies_genres["genreId"] = movies_genres["genres"].map(genre_ids)
movies_genres = movies_genres.drop(columns="genres")

# movies no longer needs the genres column: the links live in movies_genres
movies = movies.drop(columns=["genres"])

# create genres nodes
genres = pd.DataFrame(
    {"genreId": range(1, len(genre_names) + 1), "name": genre_names}
)

# Sanity check: every link must point at an existing genre node.
# (The column is "genreId"; the previous "genresId" lookup raised KeyError.)
print(
    f"Are genres been extracted correctly? {movies_genres['genreId'].isin(genres['genreId']).all()}"
)

Are genres been extracted correctly? True


In [8]:
# # Formatted datetime
# ratings["timestamp"] = pd.to_datetime(
#     ratings["timestamp"], unit="s", utc=True
# ).dt.strftime("%Y-%m-%dT%H:%M:%S%z")
# tags["timestamp"] = pd.to_datetime(tags["timestamp"], unit="s", utc=True).dt.strftime(
#     "%Y-%m-%dT%H:%M:%S%z"
# )

# Move data on volume container


In [25]:
# Display the movies table as it stands after the year column was added and
# the genres column was dropped in the cells above.
movies

Unnamed: 0,movieId,title,year
0,1,Toy Story (1995),1995
1,2,Jumanji (1995),1995
2,3,Grumpier Old Men (1995),1995
3,4,Waiting to Exhale (1995),1995
4,5,Father of the Bride Part II (1995),1995
...,...,...,...
86532,288967,State of Siege: Temple Attack (2021),2021
86533,288971,Ouija Japan (2021),2021
86534,288975,The Men Who Made the Movies: Howard Hawks (1973),1973
86535,288977,Skinford: Death Sentence (2023),2023


## old import
Prepare data for neo4j-admin-import ([info](https://neo4j.com/docs/operations-manual/current/tools/neo4j-admin/neo4j-admin-import))

In [9]:
# movies.columns = ["movieId:ID(Movie-ID)", "title", "year:int"]
# movies[":LABEL"] = "Movie"

# users.columns = ["userId:ID(User-ID)", "ratings:int", "tags:int"]
# users[":LABEL"] = "User"

# genres.columns = ["genreId:ID(Genre-ID)", "name:string"]
# genres[":LABEL"] = "Genre"

# ratings.columns = [
#     "userId:START_ID(User-ID)",
#     "movieId:END_ID(Movie-ID)",
#     "rating:float",
#     "timestamp:datetime",
# ]
# ratings[":TYPE"] = "RATED"


# tags.columns = [
#     "userId:START_ID(User-ID)",
#     "movieId:END_ID(Movie-ID)",
#     "tag",
#     "timestamp:datetime",
# ]
# tags[":TYPE"] = "TAGGED"

# movies_genres.columns = [
#     "movieId:START_ID(Movie-ID)",
#     "genreId:END_ID(Genre-ID)",
# ]
# movies_genres[":TYPE"] = "IN_GENRE"

# Save on import_to_docker dir

In [21]:
import os

path_to_save = "import_to_docker"

# exist_ok=True creates the directory when missing and is a no-op when it is
# already there, without a separate existence check that could race.
os.makedirs(path_to_save, exist_ok=True)

# Every frame is written as <name>.csv without the pandas index column.
# The first three are the node tables, the last three the relationship tables.
frames_to_export = {
    "movies": movies,
    "users": users,
    "genres": genres,
    "ratings": ratings,
    "tags": tags,
    "movies_genres": movies_genres,
}
for file_stem, frame in frames_to_export.items():
    frame.to_csv(f"{path_to_save}/{file_stem}.csv", index=False)

In [None]:
# TODO: maybe check if tags are unique (upper case and lower case are the same)