In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Movies_Dataset.csv')
df

Unnamed: 0,id,title,overview
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,1,Jumanji,When siblings Judy and Peter discover an encha...
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom..."
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...
...,...,...,...
45461,45461,Subdue,Rising and falling between a man and woman.
45462,45462,Century of Birthing,An artist struggles to finish his work while a...
45463,45463,Betrayal,"When one of her hits goes wrong, a professiona..."
45464,45464,Satan Triumphant,"In a small town live two brothers, one a minis..."


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

documents = df['overview'].values.astype("U")
vectorizer = TfidfVectorizer( stop_words='english')
features = vectorizer.fit_transform(documents)

features 

<45466x74567 sparse matrix of type '<class 'numpy.float64'>'
	with 1212061 stored elements in Compressed Sparse Row format>

In [4]:
print(features)


  (0, 38029)	0.10739705953465474
  (0, 73094)	0.4809827114790238
  (0, 3554)	0.4117836571172595
  (0, 67545)	0.14878284660693247
  (0, 38963)	0.0871868917895906
  (0, 28809)	0.13311522181618415
  (0, 56644)	0.11124851086523602
  (0, 7838)	0.12380553184830105
  (0, 9430)	0.10635375129287979
  (0, 10204)	0.502803868613561
  (0, 38641)	0.2062792468281062
  (0, 58299)	0.11355918868736861
  (0, 2261)	0.140911774178889
  (0, 39390)	0.11907123344715954
  (0, 50721)	0.09190797940163037
  (0, 29316)	0.10093917370354447
  (0, 50915)	0.1343481728311918
  (0, 12774)	0.12544427954397822
  (0, 59243)	0.1300801610445509
  (0, 48382)	0.10339358185033236
  (0, 19887)	0.13281884272823927
  (0, 22086)	0.10438761058719499
  (0, 37970)	0.10142919482788752
  (0, 4762)	0.14748820342184052
  (0, 18010)	0.1348314953863925
  :	:
  (45464, 63541)	0.1233823724233346
  (45464, 20136)	0.13613037415656817
  (45464, 15665)	0.14870121148921367
  (45464, 20467)	0.16775127520702973
  (45465, 73642)	0.10753480162665091
 

In [5]:
k = 40
model = KMeans(n_clusters = k, init = 'k-means++',max_iter=100, n_init=1)
model.fit(features)

In [6]:
df['cluster'] = model.labels_ 
df.head()

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",8
1,1,Jumanji,When siblings Judy and Peter discover an encha...,8
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,39
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",12
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,26


In [7]:
df

Unnamed: 0,id,title,overview,cluster
0,0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",8
1,1,Jumanji,When siblings Judy and Peter discover an encha...,8
2,2,Grumpier Old Men,A family wedding reignites the ancient feud be...,39
3,3,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",12
4,4,Father of the Bride Part II,Just when George Banks has recovered from his ...,26
...,...,...,...,...
45461,45461,Subdue,Rising and falling between a man and woman.,11
45462,45462,Century of Birthing,An artist struggles to finish his work while a...,8
45463,45463,Betrayal,"When one of her hits goes wrong, a professiona...",8
45464,45464,Satan Triumphant,"In a small town live two brothers, one a minis...",3


In [8]:
clusters = df.groupby('cluster')    

for cluster in clusters.groups:
    f = open('cluster'+str(cluster)+ '.csv', 'w') # create csv file
    data = clusters.get_group(cluster)[['title','overview']] # get title and overview columns
    f.write(data.to_csv(index_label='id')) # set index to id
    f.close()

In [9]:
print("Cluster centroids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names_out()

Cluster centroids: 



In [10]:
for i in range(k):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :10]: #print out 10 feature terms of each cluster
        print (' %s' % terms[j])
    print('------------')

Cluster 0:
 year
 old
 boy
 father
 life
 girl
 family
 new
 son
 mother
------------
Cluster 1:
 men
 young
 women
 lives
 story
 life
 woman
 love
 world
 different
------------
Cluster 2:
 love
 falls
 story
 young
 fall
 woman
 life
 man
 meets
 true
------------
Cluster 3:
 town
 small
 sheriff
 local
 new
 life
 people
 young
 family
 home
------------
Cluster 4:
 just
 band
 rock
 music
 time
 concert
 new
 like
 life
 world
------------
Cluster 5:
 overview
 available
 movie
 plot
 freaks
 freakishly
 freakish
 freaking
 freaked
 freak
------------
Cluster 6:
 nan
 ݣ1890
 frazier
 fraw
 fray
 frayed
 fraying
 frayn
 frazer
 frazzled
------------
Cluster 7:
 united
 states
 president
 war
 american
 world
 film
 years
 life
 america
------------
Cluster 8:
 world
 young
 story
 wife
 time
 people
 new
 finds
 takes
 help
------------
Cluster 9:
 college
 student
 students
 professor
 friends
 life
 campus
 new
 group
 young
------------
Cluster 10:
 past
 life
 present
 future
 