# Data Cleaning

In [13]:
import pandas as pd

df = pd.read_csv('../data/movies.csv')

# remove unnecessary columns
df.drop(columns=['adult', 'backdrop_path', 'poster_path', 'release_date', 'video', 'genre_ids'], inplace=True)

# compute the IQRs
popularity = df['popularity']
q1 = popularity.quantile(0.25)
q3 = popularity.quantile(0.75)
popularity_iqr = q3 - q1

vote_average = df['vote_average']
q1 = vote_average.quantile(0.25)
q3 = vote_average.quantile(0.75)
vote_average_iqr = q3 - q1

# remove any rows with < 10 votes
# remove any outlier popularity values
# remove any rows with original_language != 'en'
for index, row in df.iterrows():
    vote_count = row['vote_count']
    vote_average = row['vote_average']
    popularity = row['popularity']
    original_language = row['original_language']
    if vote_count <= 10 or original_language != 'en' or popularity > q3 + 1.5 * popularity_iqr or popularity < q1 - 1.5 * popularity_iqr or vote_average > q3 + 1.5 * vote_average_iqr or vote_average < q1 - 1.5 * vote_average_iqr: 
        print(f"Removing row {index} with vote_count={vote_count}, vote_average={vote_average}, popularity={popularity}, original_language={original_language}")
        df.drop(index, inplace=True)

# remove columns only used for filtering
df.drop(columns=['vote_count', 'original_language'], inplace=True)

df.to_csv('../data/movies_cleaned.csv', index=False)
df.describe()

Removing row 0 with vote_count=201, vote_average=7.1, popularity=5741.978, original_language=en
Removing row 1 with vote_count=451, vote_average=6.87, popularity=4257.756, original_language=en
Removing row 2 with vote_count=950, vote_average=7.264, popularity=3944.132, original_language=en
Removing row 3 with vote_count=875, vote_average=5.645, popularity=2416.392, original_language=en
Removing row 4 with vote_count=132, vote_average=7.216, popularity=1852.056, original_language=zh
Removing row 5 with vote_count=215, vote_average=7.158, popularity=1653.85, original_language=ko
Removing row 6 with vote_count=1335, vote_average=7.2, popularity=1442.875, original_language=en
Removing row 7 with vote_count=41, vote_average=6.146, popularity=1256.122, original_language=id
Removing row 8 with vote_count=302, vote_average=6.222, popularity=1281.197, original_language=pt
Removing row 9 with vote_count=4302, vote_average=7.497, popularity=1177.316, original_language=en
Removing row 10 with vote

Unnamed: 0,id,popularity,vote_average
count,5172.0,5172.0,5172.0
mean,214382.9,20.98675,6.420594
std,294238.1,5.780344,0.753591
min,5.0,9.97,4.262
25%,10175.25,16.64175,5.928
50%,27366.5,20.2145,6.4235
75%,400262.5,25.23,6.96775
max,1231942.0,34.605,8.675
