In [1]:
# Import Dependencies
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import boto3
from sqlalchemy import create_engine
from io import StringIO
from config import aws_id, aws_secret, sql_pw

In [2]:
# Extracting Dataset from S3 Bucket
client = boto3.client('s3', aws_access_key_id=aws_id, aws_secret_access_key=aws_secret)
bucket_name = 'utbootcamp-bucket'
object_key = 'datasets_2745_4700_movies.csv'
csv_obj = client.get_object(Bucket=bucket_name, Key=object_key)
body = csv_obj['Body']
csv_string = body.read().decode('utf-8', errors='ignore')
movies_df = pd.read_csv(StringIO(csv_string))
movies_df.head()

Unnamed: 0,budget,company,country,director,genre,gross,name,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,Stand by Me,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,Ferris Bueller's Day Off,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,Top Gun,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,Aliens,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,Flight of the Navigator,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


# Performing inital cleaning of data

In [3]:
# Droping Identification column
movies_df_clean = movies_df.copy()
movies_df_clean = movies_df_clean.drop(["name"], axis=1)
movies_df_clean.head()

Unnamed: 0,budget,company,country,director,genre,gross,rating,released,runtime,score,star,votes,writer,year
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,52287414.0,R,1986-08-22,89,8.1,Wil Wheaton,299174,Stephen King,1986
1,6000000.0,Paramount Pictures,USA,John Hughes,Comedy,70136369.0,PG-13,1986-06-11,103,7.8,Matthew Broderick,264740,John Hughes,1986
2,15000000.0,Paramount Pictures,USA,Tony Scott,Action,179800601.0,PG,1986-05-16,110,6.9,Tom Cruise,236909,Jim Cash,1986
3,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,85160248.0,R,1986-07-18,137,8.4,Sigourney Weaver,540152,James Cameron,1986
4,9000000.0,Walt Disney Pictures,USA,Randal Kleiser,Adventure,18564613.0,PG,1986-08-01,90,6.9,Joey Cramer,36636,Mark H. Baker,1986


In [4]:
# Generate our categorical variable list
movies_cat = movies_df_clean.dtypes[movies_df_clean.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
movies_df_clean[movies_cat].nunique()

company     2179
country       57
director    2759
genre         17
rating        13
released    2403
star        2504
writer      4199
dtype: int64

In [5]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(movies_df_clean[movies_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(movies_cat)
encode_df.head()

Unnamed: 0,"company_""DIA"" Productions GmbH & Co. KG",company_1+2 Seisaku Iinkai,company_101st Street Films,company_10th Hole Productions,company_120 Films,company_13 Productions,company_1492 Pictures,company_1821 Pictures,company_19 Entertainment,company_1984 Private Defense Contractors,...,writer_Zo Lund,writer_Zoe Heller,writer_Zoe Kazan,writer_ke Sandgren,writer_lex Pastor,writer_lex de la Iglesia,writer_lvaro del Amo,writer_mile Gaudreault,writer_ric Rohmer,writer_va Grdos
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Merge one-hot encoded features and drop the originals
movies_df_merged = movies_df.merge(encode_df,left_index=True, right_index=True)
movies_df_merged = movies_df_merged.drop(movies_cat,1)
movies_df_merged.head()

Unnamed: 0,budget,gross,name,runtime,score,votes,year,"company_""DIA"" Productions GmbH & Co. KG",company_1+2 Seisaku Iinkai,company_101st Street Films,...,writer_Zo Lund,writer_Zoe Heller,writer_Zoe Kazan,writer_ke Sandgren,writer_lex Pastor,writer_lex de la Iglesia,writer_lvaro del Amo,writer_mile Gaudreault,writer_ric Rohmer,writer_va Grdos
0,8000000.0,52287414.0,Stand by Me,89,8.1,299174,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,6000000.0,70136369.0,Ferris Bueller's Day Off,103,7.8,264740,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15000000.0,179800601.0,Top Gun,110,6.9,236909,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,18500000.0,85160248.0,Aliens,137,8.4,540152,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9000000.0,18564613.0,Flight of the Navigator,90,6.9,36636,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Converting score to recomendation value
movies_df_merged.loc[movies_df_merged.score >= 7.0, "recomendation"] = 2
movies_df_merged.loc[(movies_df_merged.score < 7.0) & (movies_df_merged.score > 5.0), "recomendation"] = 1
movies_df_merged.loc[movies_df_merged.score <= 5.0, "recomendation"] = 0
movies_df_merged.head()

Unnamed: 0,budget,gross,name,runtime,score,votes,year,"company_""DIA"" Productions GmbH & Co. KG",company_1+2 Seisaku Iinkai,company_101st Street Films,...,writer_Zoe Heller,writer_Zoe Kazan,writer_ke Sandgren,writer_lex Pastor,writer_lex de la Iglesia,writer_lvaro del Amo,writer_mile Gaudreault,writer_ric Rohmer,writer_va Grdos,recomendation
0,8000000.0,52287414.0,Stand by Me,89,8.1,299174,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
1,6000000.0,70136369.0,Ferris Bueller's Day Off,103,7.8,264740,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,15000000.0,179800601.0,Top Gun,110,6.9,236909,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,18500000.0,85160248.0,Aliens,137,8.4,540152,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,9000000.0,18564613.0,Flight of the Navigator,90,6.9,36636,1986,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [8]:
# Creating smaller DataFrane for provitional Database and Machine Learning Model testing
simple_df = movies_df.drop(movies_cat,1)
simple_df = simple_df[:100]
simple_df.head()

Unnamed: 0,budget,gross,name,runtime,score,votes,year
0,8000000.0,52287414.0,Stand by Me,89,8.1,299174,1986
1,6000000.0,70136369.0,Ferris Bueller's Day Off,103,7.8,264740,1986
2,15000000.0,179800601.0,Top Gun,110,6.9,236909,1986
3,18500000.0,85160248.0,Aliens,137,8.4,540152,1986
4,9000000.0,18564613.0,Flight of the Navigator,90,6.9,36636,1986


In [9]:
# Converting score to recomendation value for smaller dataframe
simple_df.loc[simple_df.score >= 7.0, "recomendation"] = 2
simple_df.loc[(simple_df.score < 7.0) & (simple_df.score > 5.0), "recomendation"] = 1
simple_df.loc[simple_df.score <= 5.0, "recomendation"] = 0
simple_df.head()

Unnamed: 0,budget,gross,name,runtime,score,votes,year,recomendation
0,8000000.0,52287414.0,Stand by Me,89,8.1,299174,1986,2.0
1,6000000.0,70136369.0,Ferris Bueller's Day Off,103,7.8,264740,1986,2.0
2,15000000.0,179800601.0,Top Gun,110,6.9,236909,1986,1.0
3,18500000.0,85160248.0,Aliens,137,8.4,540152,1986,2.0
4,9000000.0,18564613.0,Flight of the Navigator,90,6.9,36636,1986,1.0


# Load Dataset into PostgreSQL Database

In [10]:
# Create Database Engine and Load Dataset into Database
connection_string = f"postgres://postgres:{sql_pw}@group-c-project-db.csna2pebfhlh.us-east-2.rds.amazonaws.com:5432/postgres"
engine = create_engine(connection_string, pool_recycle=3600)

postgreSQLConnection = engine.connect()
postgreSQLTable = "Movie_Data";

try:
    frame = simple_df.to_sql(postgreSQLTable, postgreSQLConnection, if_exists='fail');
except ValueError as vx:
    print(vx)
except Exception as ex:  
    print(ex)
else:
    print("PostgreSQL Table %s has been created successfully."%postgreSQLTable);
finally:
    postgreSQLConnection.close();

PostgreSQL Table Movie_Data has been created successfully.
