In [59]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [149]:
data = pd.read_csv("data_with_clusters_bert_08.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,title,abstract,processed_abstract,clusters
0,0,Investigating the Energy Production through Su...,Artificial Intelligence (AI) has the potential...,potential predict generation solar wind source...,5
1,1,Sustainability Is Not Enough: Towards AI Suppo...,"Sustainability, to describe it in simpler word...",describe simpler words bad limits destruction ...,4
2,2,AI Legitimacy for Sustainability,Sustainability is a vast subject involving var...,vast subject involving research possibilities ...,7
3,3,Keynote: AI for Scientific Discovery and a Sus...,Artificial Intelligence (AI) is a rapidly adva...,advancing reasoning reach new milestones world...,1
4,4,Explainable AI for predicting daily household ...,"In the recent era, for most sustainable smart ...",point consideration urbanization carried expon...,1


In [151]:
data.loc[data['clusters'] == 0].shape

(33, 5)

In [93]:
# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Create a dictionary to store important words for each cluster
important_words_per_cluster = {}

# Iterate through each cluster
for cluster_idx in data['clusters'].unique():
    cluster_data = data[data['clusters'] == cluster_idx]['processed_abstract']
    
    # Fit and transform the TF-IDF matrix for the current cluster
    tfidf_matrix = tfidf_vectorizer.fit_transform(cluster_data)
    
    # Get feature names (words)
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    # Calculate the sum TF-IDF scores for each word in the cluster
    sum_tfidf_scores = tfidf_matrix.sum(axis=0)
    
    # Convert the sum TF-IDF scores to a flat list
    sum_tfidf_scores_list = sum_tfidf_scores.A1
    
    # Sort indices by sum TF-IDF scores (higher scores first)
    sorted_indices = sorted(range(len(sum_tfidf_scores_list)), key=lambda i: sum_tfidf_scores_list[i], reverse=True)
    
    # Extract the top N important words (e.g., top 10 words)
    top_n_words = [feature_names[i] for i in sorted_indices[:4]]
    
    important_words_per_cluster[cluster_idx] = top_n_words

# Print the important words for each cluster
for cluster_idx, words in important_words_per_cluster.items():
    print(f"Cluster {cluster_idx}: {', '.join(words)}")


Cluster 4: power, smart, solar, consumption
Cluster 0: data, learning, development, load
Cluster 1: systems, system, smart, learning
Cluster 3: smart, power, data, system
Cluster 7: iot, data, water, system
Cluster 5: smart, data, grid, consumption
Cluster -1: control, smart, wind, new
Cluster 6: power, data, smart, learning
Cluster 8: iot, consumption, learning, design
Cluster 2: solar, system, power, smart


In [127]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
# Group by cluster
grouped = data.groupby('clusters')

# Initialize CountVectorizer
vectorizer = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)

for cluster, group in grouped:
    texts = group['processed_abstract']
    # Fit and transform the texts using CountVectorizer
    X = vectorizer.fit_transform(texts)
    
    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    
    # Sum the occurrences of each word
    word_counts = X.sum(axis=0)
    
    # Create a DataFrame to hold word frequencies
    word_freq_df = pd.DataFrame({'word': feature_names, 'frequency': word_counts.tolist()[0]})
    
    # Sort by frequency in descending order and get top 10 words
    top_words = word_freq_df.sort_values(by='frequency', ascending=False).head(30)
    
    print(f"Cluster {cluster}:")
    print(top_words)
    print("\n")

Cluster -1:
             word  frequency
71           data         25
45         cities         13
276         smart         13
319          wind         12
62        control          8
34            big          8
316        values          7
154           iot          7
20   architecture          7
221    prediction          6
86    electricity          6
219         power          6
27          axies          6
228          prim          5
133         human          5
99        extreme          5
204        panels          5
41     challenges          4
134        impact          4
225       present          4
32       benefits          4
166         local          4
297    techniques          4
172   measurement          4
88            end          4
56     congestion          4
108   forecasting          4
25      automated          4
183        models          4
84           edge          4


Cluster 0:
             word  frequency
358      learning         21
346           iot 

In [162]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
corpus = data.loc[data['clusters']==]['processed_abstract']

def get_tfidf_top_features(documents,n_top=10):
      tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  stop_words='english')
      tfidf = tfidf_vectorizer.fit_transform(documents)
      importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
      tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names())
      return tfidf_feature_names[importance[:n_top]]
    
get_tfidf_top_features(corpus)   

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [130]:
data.loc[data['clusters']==1].head(30)


Unnamed: 0.1,Unnamed: 0,title,abstract,processed_abstract,clusters
18,18,Intelligent analysis and design of heat pump b...,Addressing concerns about security of energy s...,addressing concerns climate drives shift green...,1
27,27,Renewable Energy Systems Energy Modeling using...,Communities using Sustainable Energy Systems (...,using grid locals data precise learning used p...,1
29,29,IoT based Framework for Smart Campus: COVID-19...,Internet of Things (IoT) plays an important ro...,internet iot connecting everything together in...,1
30,30,A Detailed Investigation on Potential Impact o...,Quantum computing is an emerging technology (Q...,quantum computing emerging technology qc quant...,1
44,44,TinyML Smart Sensor for Energy Saving in Inter...,Smart agriculture researchers bring numerous t...,agriculture bring prospects farm productivity ...,1
59,59,Optimizing Computational Resources for Edge In...,As the number of interconnected devices increa...,devices iot start flourish cost computational ...,1
70,70,Flexibility platform for community energy systems,Integrating technological changes and sustaina...,integrating considerations poses multidiscipli...,1
74,74,Energy-efficient and Sustainable Construction ...,As a key part of sustainable urban development...,key urban technologies materials key minimize ...,1
76,76,Digital revolution in efficient self-organizat...,Artificial intelligence highlights in several ...,remarkable sublimate building sector push cons...,1
88,88,Machine Learning Based Problem Solving Approac...,The issues related to conventional generation ...,issues conventional generation arethe matter d...,1


In [152]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame(data)

# Initialize TF-IDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the text data
X = vectorizer.fit_transform(df['processed_abstract'])

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame to hold TF-IDF values
tfidf_df = pd.DataFrame(X.toarray(), columns=feature_names)

# Add the 'clusters' column to the TF-IDF DataFrame
tfidf_df['clusters'] = df['clusters']

# Group by cluster
grouped = tfidf_df.groupby('clusters')

# Calculate the mean TF-IDF values for each cluster
cluster_tfidf_means = grouped.mean()

# Calculate the difference in TF-IDF means between the current cluster and other clusters
cluster_tfidf_diffs = cluster_tfidf_means.sub(cluster_tfidf_means.mean(axis=0), axis=1)

# Get the top words for each cluster based on the absolute differences in TF-IDF means
num_top_words = 10
top_words_per_cluster = {}
for cluster in cluster_tfidf_diffs.index:
    top_indices = cluster_tfidf_diffs.loc[cluster].abs().values.argsort()[-num_top_words:][::-1]
    top_words_per_cluster[cluster] = [feature_names[i] for i in top_indices]

# Print the top words for each cluster
for cluster, top_words in top_words_per_cluster.items():
    print(f"Cluster {cluster}:")
    print(top_words)
    print("\n")


Cluster -1:
['potentials', 'watchplant', 'fixing', 'mexico', 'soilless', 'environmentally', 'resourceful', 'usable', 'mfc', 'words']


Cluster 0:
['potentials', 'soilless', 'nhs', 'thing', 'computers', 'factories', 'learners', 'tiny', 'consumes', 'digit']


Cluster 1:
['dashboards', 'watchplant', 'small', 'grey', 'neutrality', 'potentials', 'technique', 'deduce', 'hifs', 'wildlife']


Cluster 2:
['potentials', 'wastage', 'distribute', 'propose', 'ecuadorian', 'economy', 'gbr', 'biology', 'multiparametric', 'fixing']


Cluster 3:
['producing', 'soilless', 'schedules', 'delm', 'manufacturers', 'small', 'watchplant', 'sensitizers', 'consumes', 'regulatory']


Cluster 4:
['iomt', 'detecting', 'propose', 'managed', 'quantization', 'neumann', 'uses', 'counterparts', 'fixing', 'watchplant']


Cluster 5:
['learners', 'small', 'consumes', 'neumann', 'watchplant', 'lln', 'dqn', 'forecast', 'pakistan', 'tilt']


Cluster 6:
['technologies', 'logistic', 'watchplant', 'theoretically', 'user', 'kinds