In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Assuming you have a DataFrame data with columns 'Extracted_keyword' and 'field_of_study'
# Example: data = pd.read_csv('your_file.csv')
input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)
# Prepare the TfidfVectorizer with ngram_range to capture bigrams or trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 2))  # (1, 2) captures unigrams and bigrams

# Apply KMeans clustering for each field of study
field_of_study_list = data["subjectArea"].unique()  # List of unique fields of study

# Create a dictionary to store the clustering results by field of study
field_clustering_results = {}

for field in field_of_study_list:
    # Filter the data for the current field of study
    field_data = data[data["subjectArea"] == field]
    
    # Prepare the text data for TF-IDF
    sentences = field_data["extracted_keywords"]
    
    # Compute the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Apply KMeans clustering (choose the number of clusters)
    num_clusters = 5  # You can modify this depending on your data
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    
    # Add the cluster labels to the DataFrame for the current field of study
    field_data['cluster'] = kmeans.labels_
    
    # Identify the largest cluster by size
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))
    
    # Get the indices of the largest cluster
    largest_cluster_indices = np.where(kmeans.labels_ == largest_cluster)[0]
    
    # Find the centroid of the largest cluster
    centroid = kmeans.cluster_centers_[largest_cluster]
    
    # Get the top keywords of the centroid (sorted by their importance)
    top_keywords_indices = centroid.argsort()[-10:][::-1]  # Top 10 keywords
    top_keywords = [vectorizer.get_feature_names_out()[i] for i in top_keywords_indices]
    
    # Store the results in the dictionary
    field_clustering_results[field] = {
        'top_keywords': top_keywords,
        'field_data': field_data,
        'largest_cluster': largest_cluster,
        'cluster_centroids': kmeans.cluster_centers_
    }

# Print the top keywords for each field of study
for field, results in field_clustering_results.items():
    print(f"Field of Study: {field}")
    print(f"Top Keywords: {results['top_keywords']}")
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

Field of Study: MATE
Top Keywords: ['ni', 'tio2', 'pla', 'graphene', 'cellulose', 'zno', 'films', 'geopolymer', 'nr', 'concrete']

Field of Study: BUSI
Top Keywords: ['innovation', 'board', 'takeover', 'brand', 'csr', 'corporate', 'smes', 'commerce', 'financial', 'earnings']

Field of Study: HEAL
Top Keywords: ['pharmacists', 'balance', 'training', 'foot', 'exercise', 'foot diabetic', 'diabetic', 'turnover', 'srp', 'arch']

Field of Study: CHEM
Top Keywords: ['nr', 'co2', 'films', 'cu2', 'pla', 'tio2', 'bc', 'cmc', 'ni', 'cd']

Field of Study: MEDI
Top Keywords: ['hiv', 'aki', 'liver', 'covid', 'pd', 'cancer', 'kidney', 'sleep', 'health', 'hpv']

Field of Study: MULT
Top Keywords: ['covid', 'ethanol', 'pdl', '19', 'covid 19', 'tlc', 'flour', 'slag', 'banana', 'geopolymer']

Field of Study: PHYS
Top Keywords: ['proton', 'galaxies', 'jet', 'tev', 'theories', 'jets', 'dark', 'alma', 'squark', 'neutrino']

Field of Study: NEUR
Top Keywords: ['schizophrenia', 'mdd', 'pd', 'stroke', 'igm', '

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np

# Load data
input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

# Prepare the TfidfVectorizer with ngram_range to capture only unigrams (single words)
vectorizer = TfidfVectorizer(ngram_range=(1, 1))  # Ensure only single words are captured

# Get the list of unique fields of study
field_of_study_list = data["subjectArea"].unique()

# List to store the results
results = []

for field in field_of_study_list:
    # Filter data for the current field of study
    field_data = data[data["subjectArea"] == field]
    
    # Extract the text for TF-IDF
    sentences = field_data["extracted_keywords"]
    
    # Compute the TF-IDF matrix
    tfidf_matrix = vectorizer.fit_transform(sentences)
    
    # Apply KMeans clustering
    num_clusters = 5  # Define the number of clusters
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)
    
    # Assign cluster labels to the DataFrame
    field_data['cluster'] = kmeans.labels_
    
    # Identify the largest cluster by size
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))
    
    # Find the centroid of the largest cluster
    centroid = kmeans.cluster_centers_[largest_cluster]
    
    # Identify the single top keyword from the centroid
    top_keyword_index = centroid.argmax()  # Index of the highest value in the centroid
    top_keyword = vectorizer.get_feature_names_out()[top_keyword_index]  # Get the keyword
    
    # Append the field and the top keyword to the results
    results.append({"field_of_study": field, "top_keyword": top_keyword})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Save the DataFrame to a CSV file
output_file = "top_keywords_by_field.csv"
results_df.to_csv(output_file, index=False)

print(f"Results saved to {output_file}")


Results saved to top_keywords_by_field.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  field_data['cluster'] = kmeans.labels_
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_in

In [8]:
results_df

Unnamed: 0,field_of_study,top_keyword
0,MATE,ni
1,BUSI,innovation
2,HEAL,pharmacists
3,CHEM,cu2
4,MEDI,hiv
5,MULT,covid
6,PHYS,proton
7,NEUR,mdd
8,CENG,pt
9,ENGI,steel


In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
corpus = [
    'donation , organ',
]
X = vectorizer.fit_transform(corpus)
for i in X:
  print(i)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2 stored elements and shape (1, 2)>
  Coords	Values
  (0, 0)	0.7071067811865475
  (0, 1)	0.7071067811865475


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

input_file = "data_chula.csv"  # Replace with your CSV file path
data = pd.read_csv(input_file)

field_data = data[data["subjectArea"] == 'MEDI']
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    
# Prepare the text data for TF-IDF
sentences = field_data["extracted_keywords"]

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(sentences)

# Apply KMeans clustering (choose the number of clusters)
num_clusters = 5  # You can modify this depending on your data
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(tfidf_matrix)

x = tfidf_matrix.toarray()
for i in x[0]:
  print(i)





0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [None]:
data.loc[data['subjectArea'] == 'MEDI']

Unnamed: 0.1,Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,extracted_keywords
4,5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,natco rights reserved introduction population ...,Choice Consumer wellness Decision-making Neigh...,MEDI,01/03/2018,The influence of neighbor effect and urbanizat...,"donation , organ"
6,8,Lowered quality of life in mood disorders is a...,Journal of Evaluation in Clinical Practice,john wiley sons ltd. rationale aims major af...,bipolar disorder child abuse depressive disord...,MEDI,01/08/2018,Lowered quality of life in mood disorders is a...,"hrqol , tsh"
8,10,Current practice of diagnosis and management o...,Journal of Critical Care,elsevier inc. purpose resource limited setti...,Acute kidney injury ICU Practice Resource limi...,MEDI,01/08/2018,Current practice of diagnosis and management o...,"aki , settings"
11,15,Falls among physically active elderly in senio...,Clinical Interventions in Aging,maneeprom et al purpose mixed method study a...,Elderly Fall Fall prevention Perception Senior...,MEDI,01/01/2018,Falls among physically active elderly in senio...,"fall , falls"
12,16,Clif-sofa and urine neutrophil gelatinase-asso...,Journal of the Medical Association of Thailand,medical association thailand rights reserved o...,Acute-on-chronic liver failure Cirrhosis Morta...,MEDI,01/11/2018,Clif-sofa and urine neutrophil gelatinase-asso...,"clif , aclf"
...,...,...,...,...,...,...,...,...,...
16289,20185,Comparison of full-endoscopic and tubular-base...,European Spine Journal,author(s exclusive licence springer verlag gmb...,Decompression Full-endoscopic spine surgery Lu...,MEDI,01/08/2023,Comparison of full-endoscopic and tubular-base...,"decompression , surgery"
16291,20187,Effect of Smilax spp. and Phellinus linteus co...,BMC Complementary Medicine and Therapies,author(s).background prevalence breast cancer ...,Adjuvant drug Breast cancer Herbal medicine Ph...,MEDI,01/12/2023,Effect of Smilax spp. and Phellinus linteus co...,"pss , pl"
16299,20195,Clinical outcomes of low-dose pharmacokinetic-...,Haemophilia,john wiley sons ltd. introduction despite re...,coagulation factor VIII haemophilia A pharmaco...,MEDI,01/01/2023,Clinical outcomes of low-dose pharmacokinetic-...,"fviii , ehl"
16310,20206,Inhibition of histone deacetylase 6 destabiliz...,Journal of Biomedical Science,author(s).background leading cause cancer rela...,Extracellular signal-regulated kinase (ERK) Gl...,MEDI,01/12/2023,Inhibition of histone deacetylase 6 destabiliz...,"hdac6 , erk"


In [3]:
from gensim.models import Word2Vec
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')




In [54]:
import pandas as pd
import numpy as np

def toVector(word):
    try:
        l = list(wv[word])
        float_list = [float(num) for num in l]
        return float_list
    except:
        return None

df = pd.read_csv('merged_data_withkeywords.csv')
df['vector'] = df['One_keyword'].apply(lambda x: toVector(x))

df


Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,processed,Top_Three_Keywords,One_keyword,vector
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018,Effects of iron content on the microstructure ...,effects iron content microstructure corrosion ...,"alloys, corrosion, exhibit",alloys,"[-0.05322265625, 0.5625, 0.33984375, 0.0639648..."
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018,The critical factors of research and innovatio...,critical factors research innovation creation ...,"universities, creation, innovation",universities,"[-0.078125, -0.0003509521484375, 0.2412109375,..."
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"Cobb angle,Dowager's hump,Round back,Spine",HEAL,01/12/2018,Is the occiput-wall distance valid and reliabl...,occiput-wall distance valid reliable determine...,"hyperkyphosis, owd, thoracic",hyperkyphosis,
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","PHAR,CHEM",01/01/2018,Comparison of soil composition between farmlan...,comparison soil composition farmlands conserve...,"farmlands, soils, conserved",farmlands,"[0.37109375, 0.33984375, 0.1455078125, -0.0898..."
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI,01/06/2018,The impact of wire caliber on ERCP outcomes: a...,impact wire caliber ercp outcomes multicenter ...,"inch, cannulation, wire",inch,"[-0.30078125, 0.07373046875, 0.09716796875, -0..."
...,...,...,...,...,...,...,...,...,...,...,...
22601,A SOM-Based Trajectory Planning Analysis Metho...,SAE Technical Papers,,,ENGI,31/12/2023,A SOM-Based Trajectory Planning Analysis Metho...,som-based trajectory planning analysis method ...,"som, trajectory, intelligent",som,"[-0.10107421875, -0.279296875, 0.0206298828125..."
22602,Overview and Research on Airworthiness and Saf...,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023,Overview and Research on Airworthiness and Saf...,overview research airworthiness safety electri...,"airworthiness, propulsion, evtol",airworthiness,"[-0.66796875, -0.248046875, 0.158203125, -0.09..."
22603,Aeroengine Gas Path Parameter Trend Prediction...,SAE Technical Papers,,,ENGI,31/12/2023,Aeroengine Gas Path Parameter Trend Prediction...,aeroengine gas path parameter trend prediction...,"aeroengine, lstm, path",aeroengine,"[-0.1962890625, 0.06982421875, -0.022338867187..."
22604,A Wind Tunnel Investigation on the Aerodynamic...,SAE Technical Papers,,,ENGI,31/12/2023,A Wind Tunnel Investigation on the Aerodynamic...,wind tunnel investigation aerodynamics propuls...,"aerodynamics, propulsion, evtol",aerodynamics,"[0.330078125, 0.283203125, 0.248046875, -0.118..."


In [55]:
df.dropna(subset=['vector'], inplace=True)

df

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,processed,Top_Three_Keywords,One_keyword,vector
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018,Effects of iron content on the microstructure ...,effects iron content microstructure corrosion ...,"alloys, corrosion, exhibit",alloys,"[-0.05322265625, 0.5625, 0.33984375, 0.0639648..."
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018,The critical factors of research and innovatio...,critical factors research innovation creation ...,"universities, creation, innovation",universities,"[-0.078125, -0.0003509521484375, 0.2412109375,..."
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","PHAR,CHEM",01/01/2018,Comparison of soil composition between farmlan...,comparison soil composition farmlands conserve...,"farmlands, soils, conserved",farmlands,"[0.37109375, 0.33984375, 0.1455078125, -0.0898..."
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI,01/06/2018,The impact of wire caliber on ERCP outcomes: a...,impact wire caliber ercp outcomes multicenter ...,"inch, cannulation, wire",inch,"[-0.30078125, 0.07373046875, 0.09716796875, -0..."
5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,"© 2017, NATCO. All rights reserved.Introductio...","Choice,Consumer wellness,Decision-making,Neigh...",MEDI,01/03/2018,The influence of neighbor effect and urbanizat...,influence neighbor effect urbanization toward ...,"donation, organ, neighbor",donation,"[0.028564453125, -0.146484375, -0.126953125, 0..."
...,...,...,...,...,...,...,...,...,...,...,...
22599,Research on Switchable Energy-Regenerative Sus...,SAE Technical Papers,,,"ENGI,ENER",31/12/2023,Research on Switchable Energy-Regenerative Sus...,research switchable energy-regenerative suspen...,"switchable, regenerative, suspension",switchable,"[0.0791015625, -0.029296875, -0.1728515625, -0..."
22601,A SOM-Based Trajectory Planning Analysis Metho...,SAE Technical Papers,,,ENGI,31/12/2023,A SOM-Based Trajectory Planning Analysis Metho...,som-based trajectory planning analysis method ...,"som, trajectory, intelligent",som,"[-0.10107421875, -0.279296875, 0.0206298828125..."
22602,Overview and Research on Airworthiness and Saf...,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023,Overview and Research on Airworthiness and Saf...,overview research airworthiness safety electri...,"airworthiness, propulsion, evtol",airworthiness,"[-0.66796875, -0.248046875, 0.158203125, -0.09..."
22603,Aeroengine Gas Path Parameter Trend Prediction...,SAE Technical Papers,,,ENGI,31/12/2023,Aeroengine Gas Path Parameter Trend Prediction...,aeroengine gas path parameter trend prediction...,"aeroengine, lstm, path",aeroengine,"[-0.1962890625, 0.06982421875, -0.022338867187..."


In [62]:
vector_expanded = pd.DataFrame(df['vector'].tolist(), index=df.index)
vector_expanded.columns = [f'vector{i}' for i in range(vector_expanded.shape[1])]
df_expanded = pd.concat([df.drop(columns=['vector']), vector_expanded], axis=1)
df_expanded

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,processed,Top_Three_Keywords,One_keyword,...,vector290,vector291,vector292,vector293,vector294,vector295,vector296,vector297,vector298,vector299
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018,Effects of iron content on the microstructure ...,effects iron content microstructure corrosion ...,"alloys, corrosion, exhibit",alloys,...,0.316406,0.064941,-0.190430,0.365234,-0.061768,-0.382812,-0.086426,0.267578,0.220703,-0.484375
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018,The critical factors of research and innovatio...,critical factors research innovation creation ...,"universities, creation, innovation",universities,...,-0.304688,-0.112305,-0.468750,0.300781,0.176758,0.010437,-0.073730,0.077637,-0.000713,0.150391
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","PHAR,CHEM",01/01/2018,Comparison of soil composition between farmlan...,comparison soil composition farmlands conserve...,"farmlands, soils, conserved",farmlands,...,-0.511719,-0.202148,-0.257812,0.044678,0.053711,-0.053223,-0.376953,-0.176758,0.378906,-0.238281
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI,01/06/2018,The impact of wire caliber on ERCP outcomes: a...,impact wire caliber ercp outcomes multicenter ...,"inch, cannulation, wire",inch,...,-0.056396,-0.164062,-0.232422,-0.119629,0.193359,-0.221680,-0.320312,0.037598,-0.098633,-0.184570
5,The influence of neighbor effect and urbanizat...,Progress in Transplantation,"© 2017, NATCO. All rights reserved.Introductio...","Choice,Consumer wellness,Decision-making,Neigh...",MEDI,01/03/2018,The influence of neighbor effect and urbanizat...,influence neighbor effect urbanization toward ...,"donation, organ, neighbor",donation,...,-0.208984,0.021362,-0.296875,-0.133789,0.353516,-0.164062,0.103027,0.021606,-0.172852,-0.224609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22599,Research on Switchable Energy-Regenerative Sus...,SAE Technical Papers,,,"ENGI,ENER",31/12/2023,Research on Switchable Energy-Regenerative Sus...,research switchable energy-regenerative suspen...,"switchable, regenerative, suspension",switchable,...,0.211914,0.396484,0.203125,0.199219,-0.062988,-0.015747,-0.139648,-0.077148,-0.106445,-0.115234
22601,A SOM-Based Trajectory Planning Analysis Metho...,SAE Technical Papers,,,ENGI,31/12/2023,A SOM-Based Trajectory Planning Analysis Metho...,som-based trajectory planning analysis method ...,"som, trajectory, intelligent",som,...,-0.011108,0.308594,0.163086,0.175781,0.108398,-0.055420,0.002991,0.105957,-0.131836,0.259766
22602,Overview and Research on Airworthiness and Saf...,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023,Overview and Research on Airworthiness and Saf...,overview research airworthiness safety electri...,"airworthiness, propulsion, evtol",airworthiness,...,0.166016,0.040283,0.134766,0.091309,0.194336,0.093750,-0.326172,0.128906,0.154297,-0.441406
22603,Aeroengine Gas Path Parameter Trend Prediction...,SAE Technical Papers,,,ENGI,31/12/2023,Aeroengine Gas Path Parameter Trend Prediction...,aeroengine gas path parameter trend prediction...,"aeroengine, lstm, path",aeroengine,...,-0.107422,-0.045898,-0.104492,0.092285,0.163086,-0.052979,0.066895,0.160156,0.087402,-0.163086


In [63]:
df.to_csv("merged_data_withkeywords&vectors_dropNone", index=False)

In [52]:
df = pd.read_csv('merged_data_withkeywords&vectors_dropNone2.csv')
df

0        [-5.32226562e-02  5.62500000e-01  3.39843750e-...
1        [-7.81250000e-02 -3.50952148e-04  2.41210938e-...
2        [ 3.71093750e-01  3.39843750e-01  1.45507812e-...
3        [-0.30078125  0.07373047  0.09716797 -0.131835...
4        [ 0.02856445 -0.14648438 -0.12695312  0.205078...
                               ...                        
15136    [ 7.91015625e-02 -2.92968750e-02 -1.72851562e-...
15137    [-0.10107422 -0.27929688  0.02062988  0.097167...
15138    [-0.66796875 -0.24804688  0.15820312 -0.094238...
15139    [-1.96289062e-01  6.98242188e-02 -2.23388672e-...
15140    [ 0.33007812  0.28320312  0.24804688 -0.118652...
Name: vector, Length: 15141, dtype: object

In [61]:
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np


#input_file = "merged_data_withkeywords&vectors_dropNone3.csv" 
#data = pd.read_csv(input_file)
data = df_expanded.copy()
# Apply KMeans clustering for each field of study
field_of_study_list = data["subjectArea"].unique()  # List of unique fields of study

# Create a dictionary to store the clustering results by field of study
field_clustering_results = {}




for field in field_of_study_list:
    # Filter the data for the current field of study
    field_data = data[data["subjectArea"] == field]
    
    # Prepare the text data for TF-IDF
    l = []
    for i in range(300):
        l.append(f"vector{i}")
    vector = data[l]
    
    # Apply KMeans clustering (choose the number of clusters)
    num_clusters = 5  # You can modify this depending on your data
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(vector)
    
    # Add the cluster labels to the DataFrame for the current field of study
    #field_data['cluster'] = kmeans.labels_
    print(kmeans.labels_)
    
    # Identify the largest cluster by size
    largest_cluster = np.argmax(np.bincount(kmeans.labels_))
    
    # Get the indices of the largest cluster
    largest_cluster_indices = np.where(kmeans.labels_ == largest_cluster)[0]
    
    # Find the centroid of the largest cluster
    centroid = kmeans.cluster_centers_[largest_cluster]
    
    # Get the top keywords of the centroid (sorted by their importance)
    top_keywords_indices = centroid.argsort()[-10:][::-1]  # Top 10 keywords
    print(top_keywords_indices)
    #top_keywords = [vectorizer.get_feature_names_out()[i] for i in top_keywords_indices]
    
    # Store the results in the dictionary
    field_clustering_results[field] = {
        'top_keywords': top_keywords,
        'field_data': field_data,
        'largest_cluster': largest_cluster,
        'cluster_centroids': kmeans.cluster_centers_
    }

# Print the top keywords for each field of study
for field, results in field_clustering_results.items():
    print(f"Field of Study: {field}")
    print(f"Top Keywords: {results['top_keywords']}")
    print()

ValueError: Length of values (15141) does not match length of index (42)

In [8]:
import pandas as pd
df = pd.read_csv('merged_data_withkeywords.csv')
df

Unnamed: 0,title,publicationName,abstract,keywords,subjectArea,publication_date,combined,processed,Top_Three_Keywords,One_keyword
0,Effects of iron content on the microstructure ...,Materials Chemistry and Physics,© 2018The microstructure and corrosion behavio...,"EIS,Microstructure,Pitting corrosion,Polarizat...","MATE,PHYS",01/10/2018,Effects of iron content on the microstructure ...,effects iron content microstructure corrosion ...,"alloys, corrosion, exhibit",alloys
1,The critical factors of research and innovatio...,International Journal of Trade and Global Markets,Copyright © 2018 Inderscience Enterprises Ltd....,"Critical factors,Innovation creation,Public un...","BUSI,ECON",01/01/2018,The critical factors of research and innovatio...,critical factors research innovation creation ...,"universities, creation, innovation",universities
2,Is the occiput-wall distance valid and reliabl...,Musculoskeletal Science and Practice,© 2018Background: Hyperkyphosis may be frequen...,"Cobb angle,Dowager's hump,Round back,Spine",HEAL,01/12/2018,Is the occiput-wall distance valid and reliabl...,occiput-wall distance valid reliable determine...,"hyperkyphosis, owd, thoracic",hyperkyphosis
3,Comparison of soil composition between farmlan...,Eurasian Journal of Analytical Chemistry,© 2018 Society for Innovative Research. All ri...,"Agriculture land management,Conserved area,Soi...","PHAR,CHEM",01/01/2018,Comparison of soil composition between farmlan...,comparison soil composition farmlands conserve...,"farmlands, soils, conserved",farmlands
4,The impact of wire caliber on ERCP outcomes: a...,Gastrointestinal Endoscopy,© 2018Background and Aims: Wire-guided biliary...,,MEDI,01/06/2018,The impact of wire caliber on ERCP outcomes: a...,impact wire caliber ercp outcomes multicenter ...,"inch, cannulation, wire",inch
...,...,...,...,...,...,...,...,...,...,...
22601,A SOM-Based Trajectory Planning Analysis Metho...,SAE Technical Papers,,,ENGI,31/12/2023,A SOM-Based Trajectory Planning Analysis Metho...,som-based trajectory planning analysis method ...,"som, trajectory, intelligent",som
22602,Overview and Research on Airworthiness and Saf...,SAE Technical Papers,,,"ENGI,MEDI",31/12/2023,Overview and Research on Airworthiness and Saf...,overview research airworthiness safety electri...,"airworthiness, propulsion, evtol",airworthiness
22603,Aeroengine Gas Path Parameter Trend Prediction...,SAE Technical Papers,,,ENGI,31/12/2023,Aeroengine Gas Path Parameter Trend Prediction...,aeroengine gas path parameter trend prediction...,"aeroengine, lstm, path",aeroengine
22604,A Wind Tunnel Investigation on the Aerodynamic...,SAE Technical Papers,,,ENGI,31/12/2023,A Wind Tunnel Investigation on the Aerodynamic...,wind tunnel investigation aerodynamics propuls...,"aerodynamics, propulsion, evtol",aerodynamics
