In [37]:
import os
import warnings

import numpy as np
import pandas as pd

# Silence every warning for the rest of the kernel session so the cell outputs stay clean.
# NOTE(review): this is a blanket filter, so deprecation and dtype warnings from pandas
# are hidden too; consider narrowing it to a specific category once the notebook is stable.
warnings.filterwarnings('ignore')

## Lo primero que haremos es importar nuestro csv de 'film' para limpiar valores nulos y duplicados

In [38]:
# Location of the raw 'film' export. The default is the original author's local path;
# set the FILM_CSV_PATH environment variable to run the notebook on another machine
# without editing this cell.
FILM_CSV_PATH = os.environ.get(
    'FILM_CSV_PATH',
    '/Users/christelllameda/ironhack/Proyecto-Nro-2/data/film.csv',
)

# The file is decoded as latin1, exactly as in the original load.
data_ori = pd.read_csv(FILM_CSV_PATH, encoding='latin1')

# Work on a copy so the untouched raw frame (data_ori) remains available for comparison.
data = data_ori.copy()

In [39]:
# Preview the first rows (head() returns 5 by default) to confirm the file parsed as expected.
data.head()

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features,last_update
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes",2006-02-15 05:03:42
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,,3,4.99,48,12.99,G,"Trailers,Deleted Scenes",2006-02-15 05:03:42
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes",2006-02-15 05:03:42
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes",2006-02-15 05:03:42
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,,6,2.99,130,22.99,G,Deleted Scenes,2006-02-15 05:03:42


## Chequeamos los valores nulos

In [40]:
# Number of missing values in each column.
nan_cols = data.isnull().sum()

# Display only the columns that contain at least one missing value.
nan_cols.loc[nan_cols > 0]

original_language_id    1000
dtype: int64

## La única columna con valores nulos es 'original_language_id' (nula en las 1000 filas), por lo que la eliminamos

In [41]:
# Remove 'original_language_id': the missing-value count reports it as null in all 1000 rows.
data = data.drop('original_language_id', axis=1)

## La columna last_update posee los mismos valores para todas las filas, por lo que no es relevante y la eliminamos

In [42]:
# Remove 'last_update', which the notebook describes as holding the same value in every row.
# NOTE(review): that claim is not checked in code; confirm with
# data['last_update'].nunique() before running this cell.
data = data.drop('last_update', axis=1)

## Chequeamos si hay duplicados

In [43]:
# True if any row is an exact copy (across all remaining columns) of an earlier row.
# NOTE(review): the comparison includes film_id; if that is a unique key, this check can
# never flag a repeated film. Consider comparing a subset of columns instead.
data.duplicated(keep='first').any()

False

## Para optimizar la memoria, revisamos el uso actual y reducimos (downcast) los tipos numéricos al menor tamaño posible

In [44]:
# Baseline memory footprint before any dtype changes; 'deep' asks pandas to measure the
# real size of object (string) columns instead of estimating it.
data.info(memory_usage="deep")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int64  
 1   title             1000 non-null   object 
 2   description       1000 non-null   object 
 3   release_year      1000 non-null   int64  
 4   language_id       1000 non-null   int64  
 5   rental_duration   1000 non-null   int64  
 6   rental_rate       1000 non-null   float64
 7   length            1000 non-null   int64  
 8   replacement_cost  1000 non-null   float64
 9   rating            1000 non-null   object 
 10  special_features  1000 non-null   object 
dtypes: float64(2), int64(5), object(4)
memory usage: 413.3 KB


In [45]:
# Shrink every integer column to the smallest integer dtype that can hold its values.
int_cols = data.select_dtypes(include='integer').columns
data[int_cols] = data[int_cols].apply(pd.to_numeric, downcast='integer')

# Report the memory footprint after the integer downcast.
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int16  
 1   title             1000 non-null   object 
 2   description       1000 non-null   object 
 3   release_year      1000 non-null   int16  
 4   language_id       1000 non-null   int8   
 5   rental_duration   1000 non-null   int8   
 6   rental_rate       1000 non-null   float64
 7   length            1000 non-null   int16  
 8   replacement_cost  1000 non-null   float64
 9   rating            1000 non-null   object 
 10  special_features  1000 non-null   object 
dtypes: float64(2), int16(3), int8(2), object(4)
memory usage: 382.0 KB


In [46]:
# Shrink every float column to the smallest float dtype pandas will downcast to.
# NOTE(review): float32 carries fewer significant digits than float64; confirm that the
# reduced precision is acceptable for the monetary columns (rental_rate, replacement_cost).
float_cols = data.select_dtypes(include='float').columns
data[float_cols] = data[float_cols].apply(pd.to_numeric, downcast='float')

# Report the memory footprint after the float downcast.
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   film_id           1000 non-null   int16  
 1   title             1000 non-null   object 
 2   description       1000 non-null   object 
 3   release_year      1000 non-null   int16  
 4   language_id       1000 non-null   int8   
 5   rental_duration   1000 non-null   int8   
 6   rental_rate       1000 non-null   float32
 7   length            1000 non-null   int16  
 8   replacement_cost  1000 non-null   float32
 9   rating            1000 non-null   object 
 10  special_features  1000 non-null   object 
dtypes: float32(2), int16(3), int8(2), object(4)
memory usage: 374.2 KB


In [47]:
# Write the cleaned table to the current working directory, without the index column.
# The file is TAB-separated despite its .csv extension, so readers must pass sep='\t'.
# A text file does not store dtypes, so the downcasts above are not preserved on reload.
data.to_csv('film_clean.csv', sep='\t', index=False)

In [48]:
# Final look at the cleaned frame (head() returns 5 rows by default).
data.head()

Unnamed: 0,film_id,title,description,release_year,language_id,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,1,6,0.99,86,20.99,PG,"Deleted Scenes,Behind the Scenes"
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,1,3,4.99,48,12.99,G,"Trailers,Deleted Scenes"
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,1,7,2.99,50,18.99,NC-17,"Trailers,Deleted Scenes"
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,1,5,2.99,117,26.99,G,"Commentaries,Behind the Scenes"
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,1,6,2.99,130,22.99,G,Deleted Scenes
