In [5]:
# Import dependencies
import pandas as pd
import plotly.express as px
import hvplot.pandas
from sklearn.cluster import KMeans
import boto3
from sqlalchemy import create_engine
from io import StringIO
from config import aws_id, aws_secret, sql_pwd

In [49]:
# Create Database Engine and extract merged_Final_allDatasets_left_df from Database
connection_string = f"postgres://postgres:{sql_pwd}@group-c-project-db.csna2pebfhlh.us-east-2.rds.amazonaws.com:5432/postgres"
engine = create_engine(connection_string)
movies_all_df = pd.read_sql(sql="Final_Movies_dataset", con=engine)
movies_all_df.head()

Unnamed: 0,index,budget_ds-movies,company,country,director,genre,rating,score,star,writer,...,popularity,revenue,title,vote_count,Ave_runtime,Netflix,Hulu,Prime Video,Disney+,keywords_name
0,0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,R,8.1,Wil Wheaton,Stephen King,...,51.109279,52287414,Stand by Me,1522,89.0,0.0,0.0,0.0,0.0,based on novel friendship coming of age railro...
1,1,15000000.0,Paramount Pictures,USA,Tony Scott,Action,PG,6.9,Tom Cruise,Jim Cash,...,58.900647,356830601,Top Gun,1698,110.0,0.0,0.0,0.0,0.0,lovesickness loss of lover fighter pilot self-...
2,2,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,R,8.4,Sigourney Weaver,James Cameron,...,67.66094,183316455,Aliens,3220,137.0,0.0,0.0,0.0,0.0,android extraterrestrial technology space mari...
3,3,6000000.0,Other,UK,Oliver Stone,Drama,R,8.1,Charlie Sheen,Oliver Stone,...,49.802914,138530565,Platoon,1205,120.0,0.0,0.0,0.0,0.0,famous score hero mine vietnam war village gor...
4,4,8800000.0,Other,Australia,Peter Faiman,Adventure,PG-13,6.5,Paul Hogan,Ken Shadie,...,16.072466,328203506,Crocodile Dundee,503,97.0,0.0,0.0,0.0,0.0,new york prostitute hotel journalist culture c...


In [68]:
# Import encoded
path = "Final_Datasets/Final_Movies_dataset_Encoded.csv"
movies_all_df = pd.read_csv(path)

In [69]:
movies_all_df.columns.tolist()

['budget_ds-movies',
 'company',
 'country',
 'director',
 'genre',
 'rating',
 'score',
 'star',
 'writer',
 'year',
 'keywords',
 'original_language',
 'overview',
 'popularity',
 'revenue',
 'title',
 'vote_count',
 'Ave_runtime',
 'Netflix',
 'Hulu',
 'Prime Video',
 'Disney+',
 'keywords_name',
 'company_40 Acres & A Mule Filmworks',
 'company_Alcon Entertainment',
 'company_Alliance Atlantis Communications',
 'company_Artisan Entertainment',
 'company_BBC Films',
 'company_Beacon Communications',
 'company_British Broadcasting Corporation (BBC)',
 'company_CBS Films',
 'company_Caravan Pictures',
 'company_Carolco Pictures',
 'company_Castle Rock Entertainment',
 'company_Columbia Pictures',
 'company_Columbia Pictures Corporation',
 'company_Constantin Film',
 'company_Davis Entertainment',
 'company_Dimension Films',
 'company_DreamWorks',
 'company_DreamWorks Animation',
 'company_Endgame Entertainment',
 'company_Eon Productions',
 'company_EuropaCorp',
 'company_FilmDistrict

In [70]:
# Drop cathegorical columns
movies_df = movies_all_df.drop(columns = ['company', 'country', 'director','genre','rating','star', 'writer', 'year', 'keywords', 'original_language','overview','title', 'keywords_name','budget_ds-movies','revenue'])

## KMeans Model

In [71]:
# Store Values of K to Plot
inertia = []
k = list(range(1, 11))

In [72]:
# Looking for the best K
for i in k:
    km = KMeans(n_clusters=i, random_state=0)
    km.fit(movies_df)
    inertia.append(km.inertia_)

In [73]:
# Define a DataFrame to plot the Elbow Curve using hvPlot
elbow_data = {"k": k, "inertia": inertia}
df_elbow = pd.DataFrame(elbow_data)
df_elbow.hvplot.line(x="k", y="inertia", title="Elbow Curve", xticks=k)

In [74]:
# Looking the elbow curve shows the model with 3 clusters
model = KMeans(n_clusters=3, random_state=5)
model

KMeans(n_clusters=3, random_state=5)

In [75]:
# Fitting model
model.fit(movies_df)

KMeans(n_clusters=3, random_state=5)

In [76]:
# Get predictions
predictions = model.predict(movies_df)
print(predictions)

[1 1 1 ... 0 0 0]


In [77]:
# Add a new class column to the df_iris
movies_df["class"] = model.labels_
class_df = pd.DataFrame(model.labels_, columns=["class"])
movies_df.head()
class_df

Unnamed: 0,class
0,1
1,1
2,1
3,0
4,0
...,...
3174,0
3175,0
3176,0
3177,0


In [78]:
movies_df.columns.tolist()

['score',
 'popularity',
 'vote_count',
 'Ave_runtime',
 'Netflix',
 'Hulu',
 'Prime Video',
 'Disney+',
 'company_40 Acres & A Mule Filmworks',
 'company_Alcon Entertainment',
 'company_Alliance Atlantis Communications',
 'company_Artisan Entertainment',
 'company_BBC Films',
 'company_Beacon Communications',
 'company_British Broadcasting Corporation (BBC)',
 'company_CBS Films',
 'company_Caravan Pictures',
 'company_Carolco Pictures',
 'company_Castle Rock Entertainment',
 'company_Columbia Pictures',
 'company_Columbia Pictures Corporation',
 'company_Constantin Film',
 'company_Davis Entertainment',
 'company_Dimension Films',
 'company_DreamWorks',
 'company_DreamWorks Animation',
 'company_Endgame Entertainment',
 'company_Eon Productions',
 'company_EuropaCorp',
 'company_FilmDistrict',
 'company_Focus Features',
 'company_Fox 2000 Pictures',
 'company_Fox Atomic',
 'company_Fox Searchlight Pictures',
 'company_Franchise Pictures',
 'company_Gary Sanchez Productions',
 'compan

## Plotting Score vs Revenue

In [20]:
# Plotting the clusters with two features
movies_df.hvplot.scatter(x="score", y="revenue", by="class")

In [22]:
# Plotting the clusters with two features
movies_df.hvplot.scatter(x="revenue", y="popularity", by="class")

In [79]:
# Plotting the clusters with two features
movies_df.hvplot.scatter(x="score", y="Ave_runtime", by="class")

In [80]:
# Plotting the clusters with two features
movies_df.hvplot.scatter(x="score", y="vote_count", by="class")

In [28]:
# Plotting the clusters with two features
movies_df.hvplot.scatter(x="vote_count", y="revenue", by="class")

In [81]:
# Plotting the clusters with three features
fig = px.scatter_3d(movies_df, x="score", y="vote_count", 
                    z="popularity", color="class", symbol="class", 
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [82]:
# Plotting the clusters with three features
fig = px.scatter_3d(movies_df, x="score", y="popularity", 
                    z="revenue", color="class", symbol="class", 
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

ValueError: Value of 'z' is not the name of a column in 'data_frame'. Expected one of ['score', 'popularity', 'vote_count', 'Ave_runtime', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'company_40 Acres & A Mule Filmworks', 'company_Alcon Entertainment', 'company_Alliance Atlantis Communications', 'company_Artisan Entertainment', 'company_BBC Films', 'company_Beacon Communications', 'company_British Broadcasting Corporation (BBC)', 'company_CBS Films', 'company_Caravan Pictures', 'company_Carolco Pictures', 'company_Castle Rock Entertainment', 'company_Columbia Pictures', 'company_Columbia Pictures Corporation', 'company_Constantin Film', 'company_Davis Entertainment', 'company_Dimension Films', 'company_DreamWorks', 'company_DreamWorks Animation', 'company_Endgame Entertainment', 'company_Eon Productions', 'company_EuropaCorp', 'company_FilmDistrict', 'company_Focus Features', 'company_Fox 2000 Pictures', 'company_Fox Atomic', 'company_Fox Searchlight Pictures', 'company_Franchise Pictures', 'company_Gary Sanchez Productions', 'company_Golden Harvest Company', 'company_Hollywood Pictures', 'company_Icon Entertainment International', 'company_Imagine Entertainment', 'company_Lakeshore Entertainment', 'company_Lions Gate Films', 'company_Lionsgate', 'company_Marvel Studios', 'company_Metro-Goldwyn-Mayer (MGM)', 'company_Millennium Films', 'company_Miramax', 'company_Morgan Creek Productions', 'company_New Line Cinema', 'company_Orion Pictures', 'company_Other', 'company_Paramount Pictures', 'company_Paramount Vantage', 'company_Path', 'company_Pixar Animation Studios', 'company_Polygram Filmed Entertainment', 'company_Recorded Picture Company (RPC)', 'company_Regency Enterprises', 'company_Relativity Media', 'company_Revolution Studios', 'company_Rogue Pictures', 'company_Screen Gems', 'company_Sony Pictures Classics', 'company_StudioCanal', 'company_Summit Entertainment', 'company_The Weinstein Company', 'company_Touchstone Pictures', 'company_TriStar Pictures', 'company_Twentieth Century Fox Animation', 'company_Twentieth Century Fox Film Corporation', 'company_Twisted Pictures', 'company_United Artists', 'company_Universal Pictures', 'company_Walt Disney Animation Studios', 'company_Walt Disney Pictures', 'company_Warner Bros.', 'company_Warner Independent Pictures (WIP)', 'company_Working Title Films', 'company_Worldview Entertainment', 'country_Argentina', 'country_Aruba', 'country_Australia', 'country_Austria', 'country_Belgium', 'country_Brazil', 'country_Canada', 'country_China', 'country_Colombia', 'country_Czech Republic', 'country_Denmark', 'country_Finland', 'country_France', 'country_Germany', 'country_Greece', 'country_Hong Kong', 'country_Hungary', 'country_India', 'country_Iran', 'country_Ireland', 'country_Israel', 'country_Italy', 'country_Japan', 'country_Mexico', 'country_Netherlands', 'country_New Zealand', 'country_Norway', 'country_Peru', 'country_Poland', 'country_Romania', 'country_Russia', 'country_South Africa', 'country_South Korea', 'country_Spain', 'country_Sweden', 'country_Taiwan', 'country_Thailand', 'country_UK', 'country_USA', 'genre_Action', 'genre_Adventure', 'genre_Animation', 'genre_Biography', 'genre_Comedy', 'genre_Crime', 'genre_Drama', 'genre_Family', 'genre_Fantasy', 'genre_Horror', 'genre_Mystery', 'genre_Romance', 'genre_Sci-Fi', 'genre_Thriller', 'genre_Western', 'rating_G', 'rating_NC-17', 'rating_NOT RATED', 'rating_Not specified', 'rating_PG', 'rating_PG-13', 'rating_R', 'rating_UNRATED', 'original_language_af', 'original_language_cn', 'original_language_da', 'original_language_de', 'original_language_el', 'original_language_en', 'original_language_es', 'original_language_fa', 'original_language_fr', 'original_language_he', 'original_language_hi', 'original_language_it', 'original_language_ja', 'original_language_ko', 'original_language_nl', 'original_language_no', 'original_language_pl', 'original_language_pt', 'original_language_ro', 'original_language_ru', 'original_language_sv', 'original_language_te', 'original_language_th', 'original_language_zh', 'class'] but received: revenue

## Merging class to movies_all_df

In [83]:
# Merge class with movies_all_df
movies_final_df = pd.merge(movies_all_df, class_df, how='outer', left_index=True, right_index=True)
movies_final_df.head()

Unnamed: 0,budget_ds-movies,company,country,director,genre,rating,score,star,writer,year,...,original_language_no,original_language_pl,original_language_pt,original_language_ro,original_language_ru,original_language_sv,original_language_te,original_language_th,original_language_zh,class
0,8000000.0,Columbia Pictures Corporation,USA,Rob Reiner,Adventure,R,8.1,Wil Wheaton,Stephen King,1986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,15000000.0,Paramount Pictures,USA,Tony Scott,Action,PG,6.9,Tom Cruise,Jim Cash,1986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,18500000.0,Twentieth Century Fox Film Corporation,USA,James Cameron,Action,R,8.4,Sigourney Weaver,James Cameron,1986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,6000000.0,Other,UK,Oliver Stone,Drama,R,8.1,Charlie Sheen,Oliver Stone,1986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,8800000.0,Other,Australia,Peter Faiman,Adventure,PG-13,6.5,Paul Hogan,Ken Shadie,1986,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


## Plotting Score vs Genre and Class

In [84]:
# Plotting the clusters with two features
movies_final_df.hvplot.scatter(x="original_language", y="score", by="class")

In [85]:
# Plotting the clusters with three features
fig = px.scatter_3d(movies_final_df, x="score", y="original_language", 
                    z="vote_count", color="class", symbol="class", 
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()

In [88]:
# Plotting the clusters with three features
fig = px.scatter_3d(movies_final_df, x="revenue", y="popularity", 
                    z="vote_count", color="class", symbol="class", 
                    width=800)
fig.update_layout(legend=dict(x=0,y=1))
fig.show()