In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from re import sub
from decimal import Decimal


***


#### Cleaning up raw dataframe: erroneous 1st column name, missing year in Release Date, replace NaNs

In [8]:
##years = list(range(1977,2020))
years = list(range(1982,1983))
yr    = str(years[0])
df_raw = pd.read_csv(yr+'_movies_features_TEST.csv')

# rename erroneous 1st column:
df_raw.rename(columns={ df_raw.columns[0]: yr+' Rank' }, inplace = True)
df_new = pd.DataFrame()

# Add year for Release Date column
df_raw['Release Date'] = [i+' '+yr for i in df_raw['Release Date']]

# Replace NaNs
df_raw['Budget'] = df_raw['Budget'].fillna('0')
df_raw['MPAA']   = df_raw['MPAA'].fillna('0')

In [14]:
df_raw.head()

Unnamed: 0,1982 Rank,Release,Calendar Gross,Max Theaters,Total Gross,Release Date,Distributor,URL,Budget,Genres,MPAA
0,1,E.T. the Extra-Terrestrial,"$314,911,094",1778,"$359,197,037",Jun 11 1982,Universal Pictures\n\n,boxofficemojo.com/release/rl995132929/,"$10,500,000",Family Sci-Fi,PG
1,2,Raiders of the Lost Ark,"$190,360,115",1078,"$212,222,025",Jun 12 1982,Paramount Pictures\n\n,boxofficemojo.com/release/rl4083844609/,"$18,000,000",Action Adventure,PG
2,3,Rocky III,"$124,146,897",1317,"$124,146,897",May 28 1982,United Artists\n\n,boxofficemojo.com/release/rl3010233857/,0,Drama Sport,0
3,4,On Golden Pond,"$118,720,608",1015,"$119,285,432",Dec 4 1982,Universal Pictures\n\n,boxofficemojo.com/release/rl2320795137/,0,Drama,0
4,5,An Officer and a Gentleman,"$108,402,267",1050,"$129,795,554",Jul 30 1982,Paramount Pictures\n\n,boxofficemojo.com/release/rl423790081/,0,Drama Romance,0



***


#### Assign movies a unique numerical ID

In [3]:
movie_prefix = years[0]*10000

df_new['MovieID'] = [movie_prefix+int(i) for i in df_raw[yr+' Rank']]
#print(df_new)

      MovieID
0    19820001
1    19820002
2    19820003
3    19820004
4    19820005
5    19820006
6    19820007
7    19820008
8    19820009
9    19820010
10   19820011
11   19820012
12   19820013
13   19820014
14   19820015
15   19820016
16   19820017
17   19820018
18   19820019
19   19820020
20   19820021
21   19820022
22   19820023
23   19820024
24   19820025
25   19820026
26   19820027
27   19820028
28   19820029
29   19820030
..        ...
102  19820103
103  19820104
104  19820105
105  19820106
106  19820107
107  19820108
108  19820109
109  19820110
110  19820111
111  19820112
112  19820113
113  19820114
114  19820115
115  19820116
116  19820117
117  19820118
118  19820119
119  19820120
120  19820121
121  19820122
122  19820123
123  19820124
124  19820125
125  19820126
126  19820127
127  19820128
128  19820129
129  19820130
130  19820131
131  19820132

[132 rows x 1 columns]


In [4]:
df_raw.columns

Index(['1982 Rank', 'Release', 'Calendar Gross', 'Max Theaters', 'Total Gross',
       'Release Date', 'Distributor', 'URL', 'Budget', 'Genres', 'MPAA'],
      dtype='object')


***


#### Convert to numerical format: Gross box office values, # Theaters, Budget, and MPAA rating; fix datetime column

In [9]:
df_new['Calendar Gross'] = [Decimal(sub(r'[^\d.]', '', g)) for g in df_raw['Calendar Gross']]
df_new['Total Gross']    = [Decimal(sub(r'[^\d.]', '', g)) for g in df_raw['Total Gross']]
df_new['Max Theaters']   = [int(s.replace(',','')) for s in df_raw['Max Theaters']]

df_new['Budget'] = [int(s.replace('$','').replace(',','')) for s in df_raw['Budget']]

temp_R = [rating.replace('NC-17','5').replace('R','4').replace('PG-13','3').replace('PG','2').replace('G','1') for rating in df_raw['MPAA']]
df_new['MPAA_numeric'] = [int(i) for i in temp_R]

temp_DT = [datetime.strptime(date, '%b %d %Y') for date in df_raw['Release Date']]
df_new['Release Date'] = [item.strftime('%d-%m-%Y') for item in temp_DT]

# Append unchanged columns to new dataframe:
df_new[['Distributor','Genres']] = df_raw[['Distributor','Genres']]
display(df_new)

Unnamed: 0,Calendar Gross,Total Gross,Max Theaters,Budget,MPAA_numeric
0,314911094,359197037,1778,10500000,2
1,190360115,212222025,1078,18000000,2
2,124146897,124146897,1317,0,0
3,118720608,119285432,1015,0,0
4,108402267,129795554,1050,0,0
5,105492483,105492483,1605,0,0
6,91089162,95461682,701,0,0
7,78912963,78912963,1621,11200000,2
8,73555425,76606280,1060,0,0
9,69701637,69701637,1435,0,0



***


In [None]:
from datetime import datetime

In [None]:
##df_raw['Release Date'] = [i+' '+yr for i in df_raw['Release Date']]

a = [datetime.strptime(date, '%b %d %Y') for date in df_raw['Release Date']]
#print(date_obj.strftime('%d-%m-%Y'))

In [None]:
print(yr)

In [11]:

a = [datetime.strptime(date, '%b %d %Y') for date in df_raw['Release Date']]
b = [item.strftime('%d-%m-%Y') for item in a]
#print(a)
print(b)

['11-06-1982', '12-06-1982', '28-05-1982', '04-12-1982', '30-07-1982', '19-03-1982', '17-07-1982', '04-06-1982', '04-06-1982', '23-07-1982', '25-09-1982', '21-05-1982', '22-10-1982', '18-06-1982', '14-05-1982', '23-04-1982', '20-11-1982', '04-12-1982', '12-03-1982', '13-08-1982', '06-11-1982', '09-12-1982', '18-12-1982', '17-12-1982', '16-07-1982', '09-07-1982', '23-07-1982', '19-03-1982', '25-06-1982', '13-08-1982', '18-12-1982', '10-12-1982', '02-04-1982', '04-06-1982', '13-08-1982', '06-08-1982', '10-12-1982', '12-11-1982', '18-12-1982', '30-07-1982', '08-10-1982', '16-12-1982', '25-06-1982', '25-12-1982', '19-03-1982', '16-07-1982', '21-05-1982', '10-12-1982', '16-07-1982', '21-05-1982', '17-12-1982', '23-07-1982', '12-02-1982', '17-12-1982', '11-06-1982', '13-08-1982', '20-11-1982', '22-10-1982', '05-03-1982', '20-08-1982', '19-11-1982', '28-05-1982', '22-01-1982', '18-06-1982', '10-12-1982', '28-08-1982', '24-09-1982', '02-07-1982', '22-10-1982', '12-02-1982', '19-02-1982', '22-0

In [13]:
df_new.head()

Unnamed: 0,Calendar Gross,Total Gross,Max Theaters,Budget,MPAA_numeric,Release Date
0,314911094,359197037,1778,10500000,2,11-06-1982
1,190360115,212222025,1078,18000000,2,12-06-1982
2,124146897,124146897,1317,0,0,28-05-1982
3,118720608,119285432,1015,0,0,04-12-1982
4,108402267,129795554,1050,0,0,30-07-1982


In [15]:
# Append unchanged columns to new dataframe:
df_new[['Distributor','Genres']] = df_raw[['Distributor','Genres']]
display(df_new)

Unnamed: 0,Calendar Gross,Total Gross,Max Theaters,Budget,MPAA_numeric,Release Date,Distributor,Genres
0,314911094,359197037,1778,10500000,2,11-06-1982,Universal Pictures\n\n,Family Sci-Fi
1,190360115,212222025,1078,18000000,2,12-06-1982,Paramount Pictures\n\n,Action Adventure
2,124146897,124146897,1317,0,0,28-05-1982,United Artists\n\n,Drama Sport
3,118720608,119285432,1015,0,0,04-12-1982,Universal Pictures\n\n,Drama
4,108402267,129795554,1050,0,0,30-07-1982,Paramount Pictures\n\n,Drama Romance
5,105492483,105492483,1605,0,0,19-03-1982,Twentieth Century Fox\n\n,Comedy
6,91089162,95461682,701,0,0,17-07-1982,Warner Bros.\n\n,Comedy Romance
7,78912963,78912963,1621,11200000,2,04-06-1982,Paramount Pictures\n\n,Action Adventure Sci-Fi
8,73555425,76606280,1060,0,0,04-06-1982,Metro-Goldwyn-Mayer (MGM)\n\n,Horror Thriller
9,69701637,69701637,1435,0,0,23-07-1982,Universal Pictures\n\n,Comedy Musical
