In [1]:
import pandas as pd
import numpy as np

In [2]:
def process_data():
    """Load the biopics CSV and return a cleaned, reordered, sorted DataFrame.

    Steps (tasks a-i):
      a. drop fully duplicated rows (the first occurrence is kept);
      b. rename ``box_office`` to ``earnings``;
      c. drop rows whose ``earnings`` is NaN;
      d. keep rows with ``year_release >= 1990``;
      e. cast ``type_of_subject`` and ``country`` to ``category``;
      f. turn the ``'-'`` placeholder in ``lead_actor_actress`` into NaN and add
         the boolean column ``lead_actor_actress_known``;
      g. divide ``earnings`` by 1,000,000 (raw values are presumably in
         dollars -- confirm against the data source);
      h. keep only the seven output columns, in a fixed order;
      i. sort by ``earnings`` descending and renumber the index from 0.

    Returns:
        pandas.DataFrame: columns ``title``, ``year_release``, ``earnings``,
        ``country``, ``type_of_subject``, ``lead_actor_actress`` and
        ``lead_actor_actress_known``, with a fresh 0..n-1 index.
    """
    # Do not alter this line.
    biopics = pd.read_csv('./data/biopics.csv', encoding='latin-1')

    # Write your code here.

    # Task a: filter out duplicated rows.
    # Task b: rename the variable called box_office to earnings.
    biopics = biopics.drop_duplicates().rename(columns={'box_office': 'earnings'})

    # Task c: drop rows with missing earnings.
    # Task d: keep movies released in 1990 or later.
    # The explicit .copy() makes the filtered frame independent of the frame it
    # was sliced from, so the column assignments below do not write into a
    # slice (the pattern behind SettingWithCopyWarning).
    keep = biopics['earnings'].notna() & (biopics['year_release'] >= 1990)
    biopics = biopics[keep].copy()

    # Task e: convert type_of_subject and country to categorical. This runs
    # after filtering, so only categories still present in the data are kept.
    for col in ['type_of_subject', 'country']:
        biopics[col] = biopics[col].astype('category')

    # Task f: new variable lead_actor_actress_known.
    # First blank out the '-' placeholder so "unknown" is represented by NaN...
    lead = biopics['lead_actor_actress']
    biopics['lead_actor_actress'] = lead.mask(lead == '-')
    # ...then flag rows whose lead is known. notna() yields a real bool column:
    # True where a name is present, False where it is missing.
    biopics['lead_actor_actress_known'] = biopics['lead_actor_actress'].notna()

    # Task g: convert earnings to millions.
    biopics['earnings'] = biopics['earnings'] / 1000000

    # Task h: reorder columns (columns not listed here are dropped).
    cols_reordered = ['title', 'year_release', 'earnings', 'country', 'type_of_subject', 'lead_actor_actress', 'lead_actor_actress_known']
    biopics = biopics.reindex(columns=cols_reordered)

    # Task i: sort rows by earnings in descending order.
    biopics = biopics.sort_values(by=['earnings'], ascending=False)

    # Remember to return the right object.
    return biopics.reset_index(drop=True)


In [3]:
def check_descending(df, col):
    """Report whether the values of ``df[col]`` are in non-increasing order.

    Walks the column once. The check fails (returns False) when a value is
    greater than the first value, or when a value is smaller than the one that
    follows it; the offending value(s) are printed before returning so the
    caller can see where the ordering breaks.

    NOTE: a later cell defines ``check_descending`` again; after a top-to-bottom
    run that later definition is the one in effect.

    Args:
        df: object supporting ``df[col]`` that yields an iterable of mutually
            comparable values (e.g. a pandas DataFrame).
        col: key/column name to look up in ``df``.

    Returns:
        bool: True if no ordering violation is found (an empty column counts
        as sorted), False otherwise.
    """
    values = list(df[col])
    if not values:
        # Nothing to compare: an empty column is trivially in order.
        return True

    first = values[0]
    last_index = len(values) - 1
    for i, current in enumerate(values):
        # A value above the first element means the sequence rose somewhere.
        if current > first:
            print(current)
            return False
        # Compare with the next neighbour; the final element has none.
        if i < last_index and current < values[i + 1]:
            print(current, values[i + 1])
            return False
    return True

In [4]:
def check_descending(df, col):
    """Return True when every value in ``df[col]`` is >= the value after it.

    Columns with fewer than two values have no adjacent pairs and therefore
    return True.
    """
    values = list(df[col])
    # Pair each value with its successor and stop at the first pair that is
    # not in non-increasing order.
    for current, following in zip(values, values[1:]):
        if not current >= following:
            return False
    return True

### Testing output

In [5]:
def test_cases():
    """Run ``process_data()`` and print one Pass/Failed line per task (a-i).

    Each check is a zero-argument callable evaluated in task order, so the
    lines are printed in the same order as the tasks. The outcome is decided
    with a plain ``if`` rather than an ``assert`` statement, so the checks
    still run when Python is started with ``-O`` (which strips asserts).

    An exception raised while evaluating a check (for example a KeyError for a
    missing column) is not caught and propagates to the caller.

    Returns:
        None. Results are reported on stdout only.
    """
    biopics = process_data()

    cols_reordered = ['title', 'year_release', 'earnings', 'country', 'type_of_subject', 'lead_actor_actress', 'lead_actor_actress_known']
    # Upper bound used by the task g check: once divided by 1,000,000 no
    # earnings value is expected to exceed this (heuristic taken from the
    # original check -- confirm against the data).
    max_earnings_millions = 350

    # (check, message when it holds, message when it does not)
    checks = [
        # Task a: Filter out duplicated rows
        (lambda: len(biopics[biopics.duplicated()]) == 0,
         'Pass: duplicates removed',
         'Failed removing duplicates'),
        # Task b: rename the variable called box_office to earnings
        (lambda: 'earnings' in list(biopics.columns) and 'box_office' not in list(biopics.columns),
         'Pass: renamed "box_office" to "earnings"',
         'Failed renaming "box_office" to "earnings"'),
        # Task c: Filter out rows for which earnings are missing (i.e. they're NaN)
        (lambda: len(biopics[biopics['earnings'].isna()]) == 0,
         'Pass: NaNs removed from earnings',
         'Failed removing NaNs'),
        # Task d: movies released year >= 1990
        (lambda: len(biopics[biopics['year_release'] < 1990]) == 0,
         'Pass: movies released year >= 1990',
         'Failed movies released year >= 1990'),
        # Task e: type_of_subject and country conversion to categorical
        (lambda: biopics.dtypes['type_of_subject'] == 'category' and biopics.dtypes['country'] == 'category',
         'Pass: type_of_subject and country conversion to categorical',
         'Failed converting type_of_subject and country to categorical'),
        # Task f: new variable lead_actor_actress_known (existence only)
        (lambda: 'lead_actor_actress_known' in list(biopics.columns),
         'Pass: lead_actor_actress_known variable created',
         'Failed creating lead_actor_actress_known'),
        # Task g: convert earnings to millions of $$$
        (lambda: len(biopics[biopics['earnings'] > max_earnings_millions]) == 0,
         'Pass: converted earnings to millions of $$$',
         'Failed converting earnings to millions of $$$'),
        # Task h: reorder columns
        (lambda: list(biopics.columns) == cols_reordered,
         'Pass: columns reordered',
         'Failed reordering columns'),
        # Task i: row sorting by earnings in descending order
        (lambda: check_descending(biopics, 'earnings'),
         'Pass: earnings sorted',
         'Failed sorting earnings'),
    ]

    for check, passed_msg, failed_msg in checks:
        print(passed_msg if check() else failed_msg)
In [6]:
# Run every task check; prints one Pass/Failed line per task.
test_cases()

Pass: duplicates removed
Pass: renamed "box_office" to "earnings"
Pass: NaNs removed from earnings
Pass: movies released year >= 1990
Pass: type_of_subject and country conversion to categorical
Pass: lead_actor_actress_known variable created
Pass: converted earnings to millions of $$$
Pass: columns reordered
Pass: earnings sorted
