**Importing Modules**

In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from scipy.cluster.hierarchy import linkage, dendrogram
from scipy.spatial.distance import squareform
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

%matplotlib inline

**Reading the dataset**

In [None]:
# Seed NumPy's global random number generator so that anything drawing from it
# produces the same results on every run.
np.random.seed(5)

# Read the IMDb and Wikipedia movie data
movies_df = pd.read_csv('movies.csv')
print("Number of movies loaded: %s " % (len(movies_df)))

# Combine wiki_plot and imdb_plot into a single column. Missing plots are
# replaced by empty strings first: astype(str) alone would turn NaN into the
# literal word "nan", which would then be tokenized as if it were plot text.
movies_df['plot'] = (
    movies_df['wiki_plot'].fillna('').astype(str)
    + "\n"
    + movies_df['imdb_plot'].fillna('').astype(str)
)
movies_df.head()

**Performing Tokenization and Stemming**

In [None]:
def tokenize_and_stem(text):
    """Tokenize ``text`` into words and return the stem of each word.

    The text is split into sentences and then into word tokens. Tokens that
    contain no ASCII letter (e.g. numbers and punctuation) are discarded, and
    the remaining tokens are stemmed with the English Snowball stemmer.

    Parameters
    ----------
    text : str
        Raw text to process.

    Returns
    -------
    list of str
        One stem per retained token, in order of appearance.
    """
    stemmer = SnowballStemmer("english")

    # Tokenize by sentence, then by word
    tokens = [
        word
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    ]

    # Filter out raw tokens to remove noise: keep only tokens with a letter
    has_letter = re.compile('[a-zA-Z]')
    filtered_tokens = [token for token in tokens if has_letter.search(token)]

    # No debug printing here: this function is called once per document when
    # used as a vectorizer tokenizer, so printing would flood the output.
    return [stemmer.stem(word) for word in filtered_tokens]

# Try the tokenizer on a sentence that mixes words, numbers and punctuation
sample_sentence = "Today (May 19, 2016) is his only daughter's wedding."
words_stemmed = tokenize_and_stem(sample_sentence)
print(words_stemmed)

**TF-IDF Vectorizer** 
1. Create a TfidfVectorizer, which gives more weight to words that are distinctive for a document and less weight to words that are common across all documents
2. Fit the vectorizer on the plot text and transform it into a numeric TF-IDF matrix

In [None]:
# Configure the vectorizer: drop terms found in more than 80% or fewer than 10%
# of the plots, remove English stop words, tokenize/stem with tokenize_and_stem
# and build unigrams, bigrams and trigrams.
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.8,
    max_features=200000,
    min_df=0.1,
    stop_words='english',
    use_idf=True,
    tokenizer=tokenize_and_stem,
    ngram_range=(1, 3),
)

# Create a vector representation of the plot summaries
plot_texts = movies_df["plot"].tolist()
tfidf_matrix = tfidf_vectorizer.fit_transform(plot_texts)
print(tfidf_matrix.shape)

**Creating clusters with KMeans**

In [None]:
# Create a KMeans object with 5 clusters and save as km.
# random_state is pinned so that re-running this cell on its own reproduces the
# same clusters instead of depending on the state of NumPy's global RNG, and
# n_init is set explicitly because its default differs between scikit-learn
# releases.
km = KMeans(n_clusters=5, n_init=10, random_state=5)
km.fit(tfidf_matrix)                 # Fit the k-means object with tfidf_matrix
clusters = km.labels_.tolist()
movies_df["cluster"] = clusters      # Create a column cluster to denote the generated cluster for each movie
movies_df['cluster'].value_counts()  # Display number of films per cluster (clusters from 0 to 4)

**Calculating the similarity distance, building the linkage matrix and plotting the dendrogram**

In [None]:
# Calculate the pairwise cosine distance between all plot vectors
similarity_distance = 1 - cosine_similarity(tfidf_matrix)

# Floating-point rounding can leave tiny negative values or a non-zero
# diagonal; clean those up so the matrix is a valid distance matrix.
similarity_distance = np.clip(similarity_distance, 0.0, None)
np.fill_diagonal(similarity_distance, 0.0)

# linkage() interprets a 2-D array as raw observations, not as distances, so
# the square distance matrix must be converted to condensed form first.
# checks=False skips the exact-symmetry test, which rounding noise could fail.
condensed_distance = squareform(similarity_distance, checks=False)
mergings = linkage(condensed_distance, method='complete')    # Create mergings matrix

# Plot the dendrogram on a wide figure, using title as label column
fig, ax = plt.subplots(figsize=(108, 21))
dendrogram_ = dendrogram(
    mergings,
    labels=movies_df["title"].tolist(),
    leaf_rotation=90,
    leaf_font_size=16,
    ax=ax,
)

# Color the movie titles red
for lbl in ax.get_xmajorticklabels():
    lbl.set_color('r')

plt.show()