In [None]:
pip install pdfplumber

In [None]:
import pandas as pd
import numpy as np
import os
import pdfplumber
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
import matplotlib.pyplot as plt
import difflib
import seaborn as sn

In [None]:
folder_path = "Enter path over here"

col_names=['name','page','data']

# Opening and extracting data from all the files.
# One record is produced per PDF page: [file name, page number, page text with
# all whitespace runs collapsed to single spaces]. Records are collected in a
# list and converted to a DataFrame once, instead of growing the frame row by row.
records = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore every downstream row index) reproducible between runs.
for filename in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, filename)

    if os.path.isfile(file_path) and filename.lower().endswith(".pdf"):
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                # Guard against a None result for pages with no extractable
                # text (e.g. scanned images); such pages contribute an empty string.
                text = page.extract_text() or ''
                text_final = ' '.join(text.split())
                records.append([filename, page.page_number, text_final])

df = pd.DataFrame(records, columns=col_names)
df

In [None]:
#Cleaning the data
# Collapse the per-page rows into a single row per file: the page texts of each
# file are joined (in row order) with a space. sort=False keeps the files in
# order of first appearance. The 'page' column is simply not carried over, so
# this cell can be re-run safely on an already-cleaned frame.
df = df.groupby('name', sort=False)['data'].agg(' '.join).reset_index()
df

In [None]:
# Number of rows in df; used as the side length of the similarity matrix
array_shape = df.shape[0]
array_shape

In [None]:
# Declaring a square, zero-filled float matrix (array_shape x array_shape)
# that will hold the pairwise correlation/similarity values
sim_index = np.zeros((array_shape, array_shape), dtype=float)
print(sim_index)

In [None]:
#Function to find correlation/similarity
def similar(input_string, reference_string):
    """Return the fraction of `input_string` that survives a diff against `reference_string`.

    The two strings are compared element by element (character by character)
    with difflib.ndiff. Every diff line starting with "-" marks an element of
    `input_string` that is absent from `reference_string`. The score is
    1 - (removed elements / len(input_string)), rounded to 3 decimals.

    The measure is normalised by the length of `input_string` only, so in
    general similar(a, b) != similar(b, a).

    Parameters
    ----------
    input_string : str
        Text being scored.
    reference_string : str
        Text it is compared against.

    Returns
    -------
    numpy.float64
        Score in [0, 1]. For an empty `input_string` the ratio is undefined;
        1.0 is returned when `reference_string` is empty too (identical
        inputs) and 0.0 otherwise.
    """
    # An empty input would otherwise divide by zero below.
    if len(input_string) == 0:
        return np.float64(1.0 if len(reference_string) == 0 else 0.0)

    diff = difflib.ndiff(input_string, reference_string)
    # Count the elements that exist only in input_string.
    diff_count = sum(1 for line in diff if line.startswith("-"))
    return np.round(1 - (diff_count / len(input_string)),3)

In [None]:
# Adding values to the matrix: the entry at [index1, index2] is the similarity
# of the text in row index1 scored against the text in row index2.
# The row labels of df are used directly as matrix positions.
for index1, text1 in df['data'].items():
    for index2, text2 in df['data'].items():
        sim_index[index1, index2] = similar(text1, text2)

sim_index

In [None]:
# Heatmap of the pairwise similarity matrix
hm = sn.heatmap(sim_index)
plt.show()

In [None]:
#Using Hierarchical Clustering
# Turn similarities into distances: similarity 1 -> 0, similarity 0 -> sqrt(2).
distance_matrix = np.sqrt(2 * (1 - sim_index))
# NOTE: linkage() receives a square 2-D array here. SciPy interprets a 2-D
# input as one observation vector per row (each document is described by its
# distances to every document), not as a precomputed distance matrix. The
# inertia computation below uses the same row-as-point interpretation.
#linkage_matrix shows the clusters getting formed
linkage_matrix = linkage(distance_matrix, method='ward')
print(linkage_matrix)
dendrogram(linkage_matrix)
plt.show()

# Try at most 10 cluster counts, but never more than there are documents:
# asking for more clusters than points only repeats the last partition and
# would add a meaningless flat tail to the elbow curve.
max_clusters = min(10, len(distance_matrix))
cluster_range = range(1, max_clusters + 1)

inertias = []

for n_clusters in cluster_range:
    cluster_labels = fcluster(linkage_matrix, t=n_clusters, criterion='maxclust')
    inertia = 0.0

    for cluster_num in range(1, n_clusters + 1):
        cluster_points = distance_matrix[cluster_labels == cluster_num]

        # Skip labels that ended up with no members.
        if len(cluster_points) > 0:
            #inertia: sum of squared distances of samples to their cluster center.
            # The center is the per-column mean of the cluster's rows (axis=0),
            # not a single scalar averaged over every entry.
            centroid = cluster_points.mean(axis=0)
            inertia += np.sum((cluster_points - centroid)**2)

    inertias.append(inertia)

print(inertias)
plt.plot(cluster_range, inertias, marker='o')
plt.xlabel('Number of Clusters')
plt.ylabel('Inertia')
plt.title('Elbow Method for Hierarchical Clustering')
plt.show()

In [None]:
#Finding percent change in inertias
# For every consecutive pair of inertia values, the relative change of the
# later value with respect to the earlier one, expressed in percent.
percent_change = [
    ((current_value - previous_value) / previous_value) * 100
    for previous_value, current_value in zip(inertias, inertias[1:])
]

print(percent_change)

In [None]:
#Find min value of percent change
# Work with magnitudes only.
percent_change = list(map(abs, percent_change))

# Candidates for the minimum: every entry whose value differs from the first
# entry (the first entry itself, and any value equal to it, is left out).
percent = []
for change in percent_change:
    if change != percent_change[0]:
        percent.append(abs(change))

min_value = min(percent)
min_value

In [None]:
#Defining number of clusters (from elbow method)
# Position of the smallest percent change in the list, plus one.
desired_num_clusters = percent_change.index(min_value)+1

# Cut the hierarchy into at most desired_num_clusters flat clusters
clusters = fcluster(linkage_matrix, desired_num_clusters, criterion='maxclust')

# Map each cluster id to the list of row positions assigned to it
cluster_elements = {}
for idx, cluster_id in enumerate(clusters):
    cluster_elements.setdefault(cluster_id, []).append(idx)

# Print elements in each cluster
for cluster_id in cluster_elements:
    print(f"Cluster {cluster_id}: {cluster_elements[cluster_id]}")

In [None]:
#Declaring column names for the clustered df: "Cluster 1" ... "Cluster N"
col_names = [f"Cluster {number}" for number in range(1, desired_num_clusters + 1)]
col_names

In [None]:
# One column per cluster; rows are created on demand, keyed by the document's row index
cluster_df = pd.DataFrame(columns=col_names)

#Adding names to their respective clusters
for cluster_id in cluster_elements:
    target_column = f'Cluster {cluster_id}'
    for element_index in cluster_elements[cluster_id]:
        cluster_df.loc[element_index, target_column] = df.loc[element_index, 'name']

# Cells that received no name are shown as empty strings
cluster_df = cluster_df.fillna('')
cluster_df