In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt


# movies dataset

## cargar el dataset y exploración básica

In [8]:
ruta_users = os.path.join("data", "ml-1m", "movies.dat")
df = pd.read_csv(ruta_users, sep='::', index_col=0, engine='python', encoding='latin-1', header= None, names= ['number', 'title', 'gender'])
df.head(10)

Unnamed: 0_level_0,title,gender
number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy
6,Heat (1995),Action|Crime|Thriller
7,Sabrina (1995),Comedy|Romance
8,Tom and Huck (1995),Adventure|Children's
9,Sudden Death (1995),Action
10,GoldenEye (1995),Action|Adventure|Thriller


In [11]:
df.sample(5)

Unnamed: 0_level_0,title,gender
number,Unnamed: 1_level_1,Unnamed: 2_level_1
26,Othello (1995),Drama
1942,All the King's Men (1949),Drama
81,Things to Do in Denver when You're Dead (1995),Crime|Drama|Romance
2033,"Black Cauldron, The (1985)",Animation|Children's
326,To Live (Huozhe) (1994),Drama


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3883 entries, 1 to 3952
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   3883 non-null   object
 1   gender  3883 non-null   object
dtypes: object(2)
memory usage: 91.0+ KB


In [10]:
df.describe(include='all')

Unnamed: 0,title,gender
count,3883,3883
unique,3883,301
top,Toy Story (1995),Drama
freq,1,843


## Arreglando y parseando algunos datos

In [23]:
df['year'] = df['title'].str[-5:-1]
df['title'] = df['title'].str[:-7]
df.head()

Unnamed: 0_level_0,title,gender,year
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Toy Story,Animation|Children's|Comedy,1995
2,Jumanji,Adventure|Children's|Fantasy,1995
3,Grumpier Old Men,Comedy|Romance,1995
4,Waiting to Exhale,Comedy|Drama,1995
5,Father of the Bride Part II,Comedy,1995


In [28]:
df['year'] = df['year'].astype('int')

In [29]:
df.dtypes

title     object
gender    object
year       int32
dtype: object

## jugando con el dataset

In [24]:
len(df[df['gender'] == 'Drama']) # cuantas peliculas de puro drama

843

In [25]:
mask = df['gender'].str.contains('Drama') # cuantas peliculas que sean drama o drama con otro género
# print(mask)
len(df[mask])

1603

In [39]:
df.groupby('year')['title'].value_counts()

year  title                                               
1919  Daddy Long Legs                                         1
      Male and Female                                         1
      Spiders, The (Die Spinnen, 1. Teil: Der Goldene See)    1
1920  Dog's Life, A                                           1
      Saphead, The                                            1
                                                             ..
2000  Adventures of Rocky and Bullwinkle, The                 1
      28 Days                                                 1
      3 Strikes                                               1
      About Adam                                              1
      X-Men                                                   1
Name: count, Length: 3883, dtype: int64

# Ratings dataset

In [33]:
ruta_users = os.path.join("data", "ml-1m", "ratings.dat")

ratings = pd.read_csv(ruta_users, sep= '::', engine= 'python', encoding= 'latin-1', header= None,
                      names= ['User_id','movie_id', 'rating', 'timestamp'], nrows= 1_000_209 )
ratings.head()

Unnamed: 0,User_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [34]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype
---  ------     --------------    -----
 0   User_id    1000209 non-null  int64
 1   movie_id   1000209 non-null  int64
 2   rating     1000209 non-null  int64
 3   timestamp  1000209 non-null  int64
dtypes: int64(4)
memory usage: 30.5 MB


## Analizando el dataset ratings.dat, ¿hay algún usuario que no tenga ninguna review? ¿Cuántos tienen menos de 30 reviews?

In [35]:
condition = ratings.groupby('User_id').count()== 0

ratings[condition]['movie_id'].isna().value_counts()

# todos los valores son Nan por lo que no hay ningun user con 0 ratings

movie_id
True    1000209
Name: count, dtype: int64

In [36]:
condition = ratings.groupby('User_id').count() < 30

ratings[condition]['movie_id'].notna().value_counts() # 751 users tienen menos de 30 reviews

movie_id
False    999458
True        751
Name: count, dtype: int64