
Step 1: Load the data set


In [1]:
from google.colab import files
uploaded = files.upload()

Saving Movies_Dataset.csv to Movies_Dataset.csv


In [2]:
import pandas as pd
df = pd.read_csv("Movies_Dataset.csv")

In [3]:
df.head()

Unnamed: 0,id,title,overview
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,Jumanji,When siblings Judy and Peter discover an encha...
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...


Step 2: Explore the data

In [4]:
# Step 3: Data preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [5]:
documents = df['overview'].values.astype("U")

In [6]:
vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)

In [7]:
k = 20
model = KMeans(n_clusters=k, init='k-means++', max_iter=100, n_init=1)
model.fit(features)

KMeans(max_iter=100, n_clusters=20, n_init=1)

In [8]:
df['cluster'] = model.labels_

In [9]:
df.head()

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",0
1,1,Jumanji,When siblings Judy and Peter discover an encha...,0
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,5
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",13
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,9


Step 4: Output the results

In [10]:
# output the result to a text file.

clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w') # create csv file
    data = clusters.get_group(cluster)[['title','overview']] # get title and overview columns
    f.write(data.to_csv(index_label='id')) # set index to id
    f.close()

Step 5: Evaluate the results

In [11]:
print("Cluster centroids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster centroids: 

Cluster 0:
 young
 love
 story
 world
 group
 girl
 time
 friends
 father
 people
------------
Cluster 1:
 overview
 available
 movie
 plot
 freaks
 freakishly
 freakish
 freaking
 freaked
 freak
------------
Cluster 2:
 team
 earth
 planet
 alien
 space
 aliens
 world
 crew
 mission
 human
------------
Cluster 3:
 agent
 king
 secret
 fbi
 cia
 evil
 queen
 prince
 service
 undercover
------------
Cluster 4:
 old
 year
 boy
 father
 mother
 girl
 life
 new
 years
 daughter
------------
Cluster 5:
 family
 father
 home
 life
 mother
 son
 young
 house
 new
 brother
------------
Cluster 6:
 money
 make
 father
 job
 life
 pay
 young
 man
 wants
 bank
------------
Cluster 7:
 documentary
 film
 feature
 interviews
 world
 footage
 filmmaker
 life
 american
 length
------------
Cluster 8:
 finds
 young
 life
 man
 love
 soon
 girl
 new
 father
 home
------------
Cluster 9:
 wife
 husband
 man
 son
 daughter
 life
 ex
 home
 young
 love
------------
Cluster 10:
 town
 

