In [13]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load spaCy's "en_core_web_sm" pipeline once at module level;
# extract_keyphrases_spacy reads this shared `nlp` object for every text.
nlp = spacy.load("en_core_web_sm")

# Read the spreadsheet from the hard-coded /kaggle/input path and print the
# first rows as a quick sanity check of what was loaded.
dataset = pd.read_excel('/kaggle/input/dataset2/nlpdataset.xlsx')  
print(dataset.head())  

def extract_keywords_tfidf(paragraphs, top_n=8):
    """Return the corpus-level keywords with the highest summed TF-IDF weight.

    Args:
        paragraphs: Iterable of text documents (strings). It is materialised
            into a list, so generators are accepted.
        top_n: Maximum number of keywords to return. ``None`` returns every
            term in the vocabulary, ranked.

    Returns:
        A list of at most ``top_n`` terms, highest aggregate TF-IDF first.
        An empty list when there are no documents or ``top_n`` is not positive.

    Raises:
        ValueError: Propagated from scikit-learn when no term survives
            tokenisation and English stop-word removal.
    """
    documents = list(paragraphs)
    if not documents or (top_n is not None and top_n <= 0):
        return []

    # Deliberately no ``max_features``: that option prunes the vocabulary by
    # raw term frequency across the corpus *before* any TF-IDF weighting, so
    # the result would be the most frequent terms merely re-ordered, not the
    # terms with the highest TF-IDF mass. Rank the full vocabulary instead.
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()

    # Column sums collapse the (documents x terms) matrix to one score per term.
    tfidf_scores = np.asarray(tfidf_matrix.sum(axis=0)).flatten()

    ranked = sorted(
        zip(feature_names, tfidf_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [term for term, _ in ranked[:top_n]]

def extract_keyphrases_spacy(paragraphs, top_n=8):
    """Collect the leading multi-word noun chunks of each paragraph.

    Every text is run through the module-level spaCy pipeline ``nlp``. Noun
    chunks whose text splits into more than one whitespace-separated piece
    are kept in document order, and only the first ``top_n`` are retained.

    Args:
        paragraphs: Iterable of strings to analyse.
        top_n: Cap on the number of phrases kept per paragraph.

    Returns:
        A list with one entry per paragraph, in input order; each entry is a
        list of phrase strings (possibly empty).
    """
    def multiword_chunks(text):
        # Single-word chunks are dropped; only phrases of two or more words count.
        return [
            span.text
            for span in nlp(text).noun_chunks
            if len(span.text.split()) > 1
        ]

    return [multiword_chunks(text)[:top_n] for text in paragraphs]

# Cast every entry of the 'answers' column to str and collect them in a plain list.
# NOTE(review): astype(str) may turn missing cells into the literal text "nan"
# instead of skipping them — confirm the sheet has no blank answers.
text_data = dataset['answers'].astype(str).tolist()

# One list of multi-word noun chunks per answer, in the same order as
# text_data; the default top_n=8 caps the length of each list.
keyphrases = extract_keyphrases_spacy(text_data)

# Attach the per-row phrase lists as a new column (mutates `dataset` in place).
dataset['keyphrases'] = keyphrases

# Preview the source text next to the phrases extracted from it.
print(dataset[['answers', 'keyphrases']].head())  

                                           questions  \
0              What is supervised machine learning?    
1  What is regression? Which models can you use t...   
2     What is linear regression? When do we use it?    
3  What are the main assumptions of linear regres...   
4  What is the normal distribution? Why do we car...   

                                             answers  
0  Supervised machine learning is a type of machi...  
1  Regression is a type of supervised machine lea...  
2  Linear regression is a statistical method used...  
3  The main assumptions of linear regression are:...  
4  The normal distribution is a symmetric, bell-s...  
                                             answers  \
0  Supervised machine learning is a type of machi...   
1  Regression is a type of supervised machine lea...   
2  Linear regression is a statistical method used...   
3  The main assumptions of linear regression are:...   
4  The normal distribution is a symmetric, bell-s... 

In [14]:
# Write the frame, including the added 'keyphrases' column, to /kaggle/working
# without the index column.
# NOTE(review): 'keyphrases' holds Python lists, which CSV stores in string
# form; a later read_csv would presumably yield strings rather than lists —
# confirm downstream consumers expect that.
dataset.to_csv('/kaggle/working/updated_dataset.csv', index=False)