# Data Cleaning

In [10]:
import pandas as pd

# Load the raw movie data, filter out unwanted rows, and write the cleaned
# dataset to '../data/movies_cleaned.csv'.
df = pd.read_csv('../data/movies.csv')

# Compute the IQR of popularity over the full, unfiltered dataset and derive
# the outlier fences from it: anything more than 1.5 * IQR below Q1 or above
# Q3 is treated as an outlier.
q1 = df['popularity'].quantile(0.25)
q3 = df['popularity'].quantile(0.75)
popularity_iqr = q3 - q1
lower_fence = q1 - 1.5 * popularity_iqr
upper_fence = q3 + 1.5 * popularity_iqr

# remove unnecessary columns
df = df.drop(columns=['adult', 'backdrop_path', 'poster_path', 'release_date', 'video', 'genre_ids'])

# Build one boolean mask that flags every row to remove:
#   - rows with 10 or fewer votes (the threshold is inclusive)
#   - rows whose popularity lies outside the outlier fences
#   - rows with original_language != 'en'
# A single vectorized mask avoids dropping rows one at a time from the frame
# that is being iterated, which is both fragile and quadratic in row count.
remove_mask = (
    (df['vote_count'] <= 10)
    | (df['original_language'] != 'en')
    | (df['popularity'] > upper_fence)
    | (df['popularity'] < lower_fence)
)

# Report each removed row in the same format as before.
removed_rows = df.loc[remove_mask, ['vote_count', 'popularity', 'original_language']]
for removed in removed_rows.itertuples():
    print(f"Removing row {removed.Index} with vote_count={removed.vote_count}, popularity={removed.popularity}, original_language={removed.original_language}")

# Keep only the rows that passed every filter, then
# remove columns only used for filtering
df = df.loc[~remove_mask].drop(columns=['vote_count', 'original_language'])

df.to_csv('../data/movies_cleaned.csv', index=False)
df.describe()

Removing row 0 with vote_count=201, popularity=5741.978, original_language=en
Removing row 1 with vote_count=451, popularity=4257.756, original_language=en
Removing row 2 with vote_count=950, popularity=3944.132, original_language=en
Removing row 3 with vote_count=875, popularity=2416.392, original_language=en
Removing row 4 with vote_count=132, popularity=1852.056, original_language=zh
Removing row 5 with vote_count=215, popularity=1653.85, original_language=ko
Removing row 6 with vote_count=1335, popularity=1442.875, original_language=en
Removing row 7 with vote_count=41, popularity=1256.122, original_language=id
Removing row 8 with vote_count=302, popularity=1281.197, original_language=pt
Removing row 9 with vote_count=4302, popularity=1177.316, original_language=en
Removing row 10 with vote_count=1041, popularity=1111.223, original_language=en
Removing row 11 with vote_count=24, popularity=1091.648, original_language=en
Removing row 12 with vote_count=453, popularity=922.92, origin

Unnamed: 0,id,popularity,vote_average
count,6507.0,6507.0,6507.0
mean,223087.7,25.875517,6.455688
std,301292.4,11.720481,0.794394
min,5.0,9.95,2.895
25%,10052.0,17.486,5.975
50%,32823.0,22.174,6.492
75%,408884.5,31.149,7.0
max,1237835.0,63.605,8.675
