# Prepare for data import
- Determine how the CSV file will be structured  
- Determine if normalized or denormalized data  
- Ensure IDs to be used in the data are unique  
- Ensure data in CSV files is "clean"
- Execute Cypher code to inspect the data
- Determine if data needs to be transformed

#### Load data

In [9]:
import pandas as pd

movies = pd.read_csv("dataset/movies.csv")
ratings = pd.read_csv("dataset/ratings.csv")
tags = pd.read_csv("dataset/tags.csv")

### Some info from data

In [10]:
# Vertex of the graph
print(f"\nMovies size is {movies.shape}")
movies.info()

print(f"\nRatings size is {ratings.shape}")
ratings.info()

print(f"\nTags size is {tags.shape}")
tags.info()

# To do preprocessing


Movies size is (9742, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB

Ratings size is (100836, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB

Tags size is (3683, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3683 entries, 0 to 3682
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     -----------

We have no missing value.  
The structure of data is normalized and csv structure is OK.  
We don't have users for privacy and so we will create them after some check.  

In [11]:
print(f"Users that do ratings: {ratings['userId'].value_counts().shape}")
print(f"Users that do tags: {tags['userId'].value_counts().shape}")
print(f"Users that did only tags: {~tags['userId'].isin(ratings['userId']).all()}")
users = ratings["userId"].value_counts().reset_index()
print(f"So total users is {users['userId'].shape}")
users
# TODO: can add counts of ratings and tags to the users dataframe

Users that do ratings: (610,)
Users that do tags: (58,)
Users that did only tags: False
So total users is (610,)


Unnamed: 0,userId,count
0,414,2698
1,599,2478
2,474,2108
3,448,1864
4,274,1346
...,...,...
605,207,20
606,442,20
607,53,20
608,576,20


We have created users

### Check Movies

TODO:
- Check if ids are unique
- Check id genres is splitted and if there is some movie without genres
- Check title --> there is year information

In [12]:
print("Movies")
print(f"Unique movieId? {movies['movieId'].nunique() == movies.shape[0]}")
print(movies["genres"].head(10))

Movies
Unique movieId? True
0    Adventure|Animation|Children|Comedy|Fantasy
1                     Adventure|Children|Fantasy
2                                 Comedy|Romance
3                           Comedy|Drama|Romance
4                                         Comedy
5                          Action|Crime|Thriller
6                                 Comedy|Romance
7                             Adventure|Children
8                                         Action
9                      Action|Adventure|Thriller
Name: genres, dtype: object


In [13]:
# Check ratings
print(f"Are there any negative ratings? {ratings[ratings['rating']<0].size!=0}")
print(f"Are there any voting above 5? {ratings[ratings['rating']>5].size!=0}")
print(f"Are ratings all integer? {ratings['rating'].apply(float.is_integer).all()}")

Are there any negative ratings? False
Are there any voting above 5? False
Are ratings all integer? False


In [14]:
import re

a = "ciao (2019)"
if re.search(r"\(\d{4}-\d{4}\)", a):
    print(a)

# print title that doesn't have year
print(movies[~movies["title"].str.contains(r"\(\d{4}\)")])

      movieId                                              title  \
6059    40697                                          Babylon 5   
9031   140956                                   Ready Player One   
9091   143410                                         Hyena Road   
9138   147250  The Adventures of Sherlock Holmes and Doctor W...   
9179   149334                                  Nocturnal Animals   
9259   156605                                           Paterson   
9367   162414                                          Moonlight   
9448   167570                                             The OA   
9514   171495                                             Cosmos   
9515   171631                            Maria Bamford: Old Baby   
9518   171749                  Death Note: Desu nôto (2006–2007)   
9525   171891                                  Generation Iron 2   
9611   176601                                       Black Mirror   

                      genres  
6059            

In [15]:
import re

a = "ciao 1904"
a = re.search(".([0-9]{4})", a)
print(a)
# movies["title"].apply(lambda x: re.search(".([0-9])", x))
# Check if you can extract the year from the title
# movies["year"] = movies["title"].apply(lambda x: re.search(".([0-9]{4})", x))
# movies["year"]

<re.Match object; span=(4, 9), match=' 1904'>


## Move data on volume container

In [16]:
import os

path_to_save = "data"

if not os.path.exists(path_to_save):
    os.makedirs(path_to_save)

users.to_csv(f"{path_to_save}/users.csv", index=False)
movies.to_csv(f"{path_to_save}/movies.csv", index=False)
ratings.to_csv(f"{path_to_save}/ratings.csv", index=False)
tags.to_csv(f"{path_to_save}/tags.csv", index=False)