##Rake

In [1]:
pip install rake-nltk gensim stopwords



In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt_tab')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [3]:
# Importing libraries
from rake_nltk import Rake
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
# Initializing the Rake instance
rake = Rake()

# Input text
input_text = '''
NLP stands for Natural Language Processing.
It is the branch of Artificial Intelligence that gives the ability to machine understand
and process human languages. Human languages can be in the form of text or audio format.
Natural Language Processing started in 1950 When Alan Mathison Turing published
an article in the name Computing Machinery and Intelligence.
It is based on Artificial intelligence. It talks about automatic interpretation and
generation of natural language.
As the technology evolved, different approaches have come to deal with NLP tasks.
'''

# Extracting keywords and phrases
rake.extract_keywords_from_text(input_text)
keywords = rake.get_ranked_phrases_with_scores()

df = pd.DataFrame(keywords, columns=['Score', 'Keyword'])

# Displaying the table
print(df)

        Score                              Keyword
0   16.000000       alan mathison turing published
1   13.500000  natural language processing started
2    9.500000          natural language processing
3    9.000000             name computing machinery
4    8.000000              process human languages
5    6.000000                     natural language
6    5.000000                      human languages
7    4.000000                   technology evolved
8    4.000000                            nlp tasks
9    4.000000                           nlp stands
10   4.000000                   machine understand
11   4.000000                 different approaches
12   4.000000             automatic interpretation
13   4.000000                         audio format
14   3.666667              artificial intelligence
15   3.666667              artificial intelligence
16   1.666667                         intelligence
17   1.000000                                 text
18   1.000000                  

In [35]:
import pandas as pd
from rake_nltk import Rake
from nltk.tokenize import word_tokenize
import nltk
from collections import defaultdict

nltk.download('punkt')

# Function to extract individual word scores from RAKE phrases
def get_rake_word_scores(text):
    """
    Extracts RAKE phrases, splits them into words, assigns each word the phrase score,
    and aggregates scores for words appearing in multiple phrases.

    Returns: dict {word: aggregated_score}
    """
    if not isinstance(text, str):
        return {}

    rake = Rake()
    rake.extract_keywords_from_text(text)
    phrases = rake.get_ranked_phrases_with_scores()  # list of (score, phrase)

    word_scores = defaultdict(float)

    for score, phrase in phrases:
        words = word_tokenize(phrase.lower())
        for word in words:
            if word.isalnum():  # skip punctuation
                word_scores[word] += score  # accumulate score if word appears multiple times
    word_scores = {word: round(score, 3) for word, score in word_scores.items()}
    return dict(word_scores)

# Step 1: Load preprocessed text
df = pd.read_csv("preprocessed_text.csv")

# Step 2: Apply function to each review
df['RAKE_Word_Scores'] = df['Preprocessed_Text'].apply(get_rake_word_scores)

# Step 3: Save output
df[['Id', 'RAKE_Word_Scores']].to_csv("rake_word_scores.csv", index=False)
print("✅ RAKE word-level scores saved to rake_word_scores.csv")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


✅ RAKE word-level scores saved to rake_word_scores.csv


##Text Rank

In [4]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Word2Vec
from scipy import spatial
import networkx as nx

In [5]:
import re
import numpy as np
import networkx as nx
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from itertools import combinations

# Input text
text = '''NLP stands for Natural Language Processing.
It is the branch of Artificial Intelligence that gives the ability to machine understand
and process human languages. Human languages can be in the form of text or audio format.
Natural Language Processing started in 1950 When Alan Mathison Turing published
an article in the name Computing Machinery and Intelligence.
It is based on Artificial intelligence. It talks about automatic interpretation and
generation of natural language.
As the technology evolved, different approaches have come to deal with NLP tasks.'''

# Preprocessing
stop_words = set(stopwords.words('english'))
words_clean = [word.lower() for word in word_tokenize(text) if word.isalnum() and word.lower() not in stop_words]

# Build co-occurrence graph
window_size = 3  # Words are connected if they appear within a window of 3 words
edges = []
for i, word in enumerate(words_clean):
    for j in range(i + 1, min(i + window_size, len(words_clean))):
        edges.append((word, words_clean[j]))

# Create graph
graph = nx.Graph()
graph.add_edges_from(edges)

# Apply TextRank (PageRank) to rank words
word_ranks = nx.pagerank(graph)

# Rank words by their TextRank score
ranked_words = sorted(word_ranks.items(), key=lambda x: x[1], reverse=True)

# Display top 10 words
top_words = ranked_words[:10]
print("Top ranked words using TextRank:")
for word, score in top_words:
    print(f"{word}: {score:.4f}")

Top ranked words using TextRank:
intelligence: 0.0453
natural: 0.0448
language: 0.0445
artificial: 0.0307
processing: 0.0302
nlp: 0.0287
human: 0.0272
languages: 0.0271
deal: 0.0244
come: 0.0236


In [45]:
import networkx as nx
from collections import defaultdict

def get_textrank_scores_from_tokens(tokens, window_size=3):
    """
    Applies TextRank (PageRank) to a list of preprocessed tokens.

    Parameters:
        tokens (list): List of preprocessed word tokens.
        window_size (int): Window size for co-occurrence.

    Returns:
        dict: {word: TextRank score (rounded to 4 decimals)}
    """
    if not isinstance(tokens, list) or len(tokens) < 2:
        return {}

    edges = []
    for i, word in enumerate(tokens):
        for j in range(i + 1, min(i + window_size, len(tokens))):
            edges.append((word, tokens[j]))

    graph = nx.Graph()
    graph.add_edges_from(edges)

    if graph.number_of_nodes() == 0:
        return {}

    word_ranks = nx.pagerank(graph)
    word_ranks = {word: round(score, 3) for word, score in word_ranks.items()}
    return word_ranks


In [8]:
import pandas as pd
import ast

# Load and parse the token list from CSV
df = pd.read_csv("preprocessed_text.csv")
df['Preprocessed_Text'] = df['Preprocessed_Text'].apply(ast.literal_eval)

# Apply the updated TextRank function
df['TextRank_Word_Scores'] = df['Preprocessed_Text'].apply(get_textrank_scores_from_tokens)

# Save results
df[['Id', 'TextRank_Word_Scores']].to_csv("textrank_word_scores.csv", index=False)
print("✅ TextRank word-level scores saved to textrank_word_scores.csv")


✅ CF-IOF word-level scores saved to cf_iof_word_scores.csv


##CF-IOF

In [1]:
pip install senticnet



In [2]:
! wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
! unzip v0.9.2.zip
! cd fastText-0.9.2
! make
!mingw32-make

--2025-06-12 16:01:58--  https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://codeload.github.com/facebookresearch/fastText/zip/refs/tags/v0.9.2 [following]
--2025-06-12 16:01:58--  https://codeload.github.com/facebookresearch/fastText/zip/refs/tags/v0.9.2
Resolving codeload.github.com (codeload.github.com)... 140.82.113.10
Connecting to codeload.github.com (codeload.github.com)|140.82.113.10|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/zip]
Saving to: ‘v0.9.2.zip’

v0.9.2.zip              [   <=>              ]   4.17M  7.27MB/s    in 0.6s    

2025-06-12 16:01:59 (7.27 MB/s) - ‘v0.9.2.zip’ saved [4369852]

Archive:  v0.9.2.zip
5b5943c118b0ec5fb9cd8d20587de2b2d3966dfe
   creating: fastText-0.9.2/
   creating: fastText-0.9.2/.circleci/
 

In [3]:
!pip install fasttext
!git clone https://github.com/facebookresearch/fastText.git
%cd fastText
!pip install .


fatal: destination path 'fastText' already exists and is not an empty directory.
/content/fastText
Processing /content/fastText
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: fasttext
[31mERROR: Operation cancelled by user[0m[31m
[0m  Building wheel for fasttext (pyproject.toml) ... [?25l[?25h

KeyboardInterrupt: 

In [2]:
# Install required libraries
!pip install senticnet fasttext tabulate



In [3]:
# Download a pre-trained FastText word embedding model
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

--2025-06-12 16:09:16--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 18.154.144.87, 18.154.144.102, 18.154.144.74, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|18.154.144.87|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2025-06-12 16:09:55 (111 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [4]:
import fasttext

# Load the pre-trained FastText word embedding model
fasttext_model = fasttext.load_model("cc.en.300.bin")

In [5]:
from senticnet.senticnet import SenticNet
import fasttext

import numpy as np
sn = SenticNet()
def get_polarity(word):
    try:
        # Check SenticNet for polarity
        polarity = sn.polarity_value(word)
        print(f"Found in SenticNet: {word} -> {polarity}")
        return float(polarity)  # Convert polarity to float
    except KeyError:
        # If word is not found in SenticNet, use FastText for prediction
        print(f"Not found in SenticNet. Using FastText for: {word}")
        try:
            # Get the word vector from FastText
            word_vector = fasttext_model.get_word_vector(word)
            # Calculate polarity as the sum of the word vector components
            polarity = np.sum(word_vector)
            return polarity
        except KeyError:
            # If word is not in FastText vocabulary, return neutral polarity
            return 0.0

In [6]:
from tabulate import tabulate
from collections import defaultdict # Import defaultdict from collections
import numpy as np
from tabulate import tabulate
import numpy as np
from collections import defaultdict
from tabulate import tabulate


def get_word_polarities(text):
    bins = np.linspace(-1, 1, num=6)  # 6 edges create 5 bins
    labels = [1, 2, 3, 4, 5]  # Bin labels

    words = text.split()
    word_polarities = []
    feature_bin_map = {}  # Store word and assigned bin

    for word in words:
        polarity = get_polarity(word)
        if polarity < -0.6:
            bin_value = 1
        elif polarity < -0.2:
            bin_value = 2
        elif polarity < 0.2:
            bin_value = 3
        elif polarity < 0.6:
            bin_value = 4
        else:
            bin_value = 5
        word_polarities.append([word, polarity, bin_value])
        feature_bin_map[word] = bin_value

    print(tabulate(word_polarities, headers=["Word", "Polarity", "Bin"], tablefmt="pretty"))
    return feature_bin_map  # Return dictionary of words and assigned bins

def compute_cf_iof(reviews):
    feature_counts = defaultdict(lambda: defaultdict(int))  # {feature: {bin: count}}
    total_bin_counts = defaultdict(int)  # {bin: total count of all features}
    review_counts = defaultdict(int)  # {feature: number of reviews containing feature}

    for review in reviews:
        feature_bin_map = get_word_polarities(review)
        seen_features = set()  # Track features seen in this review

        for feature, bin_value in feature_bin_map.items():
            feature_counts[feature][bin_value] += 1
            total_bin_counts[bin_value] += 1
            if feature not in seen_features:
                review_counts[feature] += 1
                seen_features.add(feature)

    # Compute CF-IOF Scores
    cf_iof_scores = {}
    for feature, bins in feature_counts.items():
        cf_iof_score = 0
        for bin_value, n_ij in bins.items():
            if total_bin_counts[bin_value] > 0:
                tf = n_ij / total_bin_counts[bin_value]  # Term frequency
                idf = np.log(review_counts[feature] + 1)  # Log of number of reviews containing feature
                cf_iof_score += tf * idf

        cf_iof_scores[feature] = cf_iof_score

    # Display results
    sorted_scores = sorted(cf_iof_scores.items(), key=lambda x: x[1], reverse=True)
    print(tabulate(sorted_scores, headers=["Feature", "CF-IOF Score"], tablefmt="pretty"))

    return cf_iof_scores

# Example reviews dataset
reviews = [
    "NLP is a great field of study with amazing potential.",
    "Sentiment analysis is crucial for understanding human emotions.",
    "Natural Language Processing is an evolving technology that relies on AI.",
    "Machine learning and deep learning improve NLP models significantly.",
    "The advancements in AI are making NLP more efficient."
]

# Compute CF-IOF scores
compute_cf_iof(reviews)


Not found in SenticNet. Using FastText for: NLP
Not found in SenticNet. Using FastText for: is
Not found in SenticNet. Using FastText for: a
Found in SenticNet: great -> 0.857
Not found in SenticNet. Using FastText for: field
Not found in SenticNet. Using FastText for: of
Not found in SenticNet. Using FastText for: study
Not found in SenticNet. Using FastText for: with
Found in SenticNet: amazing -> 0.956
Not found in SenticNet. Using FastText for: potential.
+------------+----------------------+-----+
|    Word    |       Polarity       | Bin |
+------------+----------------------+-----+
|    NLP     |  0.4682713449001312  |  4  |
|     is     |  -4.307405948638916  |  1  |
|     a      |  1.995366096496582   |  5  |
|   great    |        0.857         |  5  |
|   field    | -0.9385746121406555  |  1  |
|     of     |  2.585928201675415   |  5  |
|   study    | 0.40166041254997253  |  4  |
|    with    |  0.2396959662437439  |  4  |
|  amazing   |        0.956         |  5  |
| potent

{'NLP': np.float64(0.46209812037329684),
 'is': np.float64(0.34657359027997264),
 'a': np.float64(0.04077336356234972),
 'great': np.float64(0.04077336356234972),
 'field': np.float64(0.057762265046662105),
 'of': np.float64(0.04077336356234972),
 'study': np.float64(0.07701635339554948),
 'with': np.float64(0.07701635339554948),
 'amazing': np.float64(0.04077336356234972),
 'potential.': np.float64(0.13862943611198905),
 'Sentiment': np.float64(0.07701635339554948),
 'analysis': np.float64(0.23104906018664842),
 'crucial': np.float64(0.04077336356234972),
 'for': np.float64(0.07701635339554948),
 'understanding': np.float64(0.04077336356234972),
 'human': np.float64(0.04077336356234972),
 'emotions.': np.float64(0.23104906018664842),
 'Natural': np.float64(0.04077336356234972),
 'Language': np.float64(0.04077336356234972),
 'Processing': np.float64(0.07701635339554948),
 'an': np.float64(0.04077336356234972),
 'evolving': np.float64(0.13862943611198905),
 'technology': np.float64(0.04

In [1]:
from senticnet.senticnet import SenticNet
import fasttext
import numpy as np

# Load SenticNet and FastText model (ensure you have cc.en.300.bin)
sn = SenticNet()
fasttext_model = fasttext.load_model("cc.en.300.bin")

def get_polarity(word):
    try:
        return float(sn.polarity_value(word))
    except KeyError:
        try:
            vec = fasttext_model.get_word_vector(word)
            return np.sum(vec)
        except:
            return 0.0

def get_polarity_bin(polarity):
    if polarity < -0.6:
        return 1
    elif polarity < -0.2:
        return 2
    elif polarity < 0.2:
        return 3
    elif polarity < 0.6:
        return 4
    else:
        return 5


In [3]:
from collections import defaultdict

def compute_cf_iof_scores(token_lists):
    """
    Compute global CF-IOF scores for all features across reviews.

    token_lists: List of lists, where each sublist is a tokenized review.

    Returns:
        dict of {feature: CF-IOF score}
    """
    feature_counts = defaultdict(lambda: defaultdict(int))  # {feature: {bin: count}}
    total_bin_counts = defaultdict(int)                     # {bin: total count across all features}
    review_counts = defaultdict(int)                        # {feature: number of reviews it appears in}

    for tokens in token_lists:
        seen_features = set()
        for word in tokens:
            polarity = get_polarity(word)
            bin_value = get_polarity_bin(polarity)

            feature_counts[word][bin_value] += 1
            total_bin_counts[bin_value] += 1
            if word not in seen_features:
                review_counts[word] += 1
                seen_features.add(word)

    # Calculate final CF-IOF score
    cf_iof_scores = {}
    for word, bin_data in feature_counts.items():
        score = 0
        for bin_value, freq in bin_data.items():
            tf = freq / total_bin_counts[bin_value]
            idf = np.log(1 + review_counts[word])
            score += tf * idf
        cf_iof_scores[word] = round(score, 4)

    return cf_iof_scores


In [9]:
token_lists = df['Preprocessed_Text'].tolist()
cf_iof_scores = compute_cf_iof_scores(token_lists)

# Step 4: Map per-review word-level CF-IOF scores
df['CF_IOF_Word_Scores'] = df['Preprocessed_Text'].apply(
    lambda tokens: {word: cf_iof_scores.get(word, 0.0) for word in tokens}
)

# Step 5: Save only Id + word scores in original order
df[['Id', 'CF_IOF_Word_Scores']].to_csv("cf_iof_word_scores.csv", index=False)
print("✅ CF-IOF word-level scores saved to cf_iof_word_scores.csv")

✅ CF-IOF word-level scores saved to cf_iof_word_scores.csv


#PSO

In [4]:
pip install pyswarms


Collecting pyswarms
  Downloading pyswarms-1.3.0-py2.py3-none-any.whl.metadata (33 kB)
Downloading pyswarms-1.3.0-py2.py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyswarms
Successfully installed pyswarms-1.3.0


In [2]:
pip install rouge-score


Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=c23b6b00539da1f426d609ce7b744dbdcccc07bdecece909cab5e67fe5ed4273
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [4]:
from rouge_score import rouge_scorer
import pandas as pd
import ast

# Load your CSV
df = pd.read_csv("preprocessed_text.csv")
df['Preprocessed_Text'] = df['Preprocessed_Text'].apply(ast.literal_eval)
df['Preprocessed_Summary'] = df['Preprocessed_Summary'].apply(ast.literal_eval)

# Prepare summaries
df['Generated_Summary'] = df['Preprocessed_Text'].apply(lambda tokens: ' '.join(tokens[:5]))
df['Reference_Summary'] = df['Preprocessed_Summary'].apply(lambda tokens: ' '.join(tokens))

# Compute ROUGE-L
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
df['ROUGE_L'] = df.apply(lambda row: scorer.score(row['Reference_Summary'], row['Generated_Summary'])['rougeL'].fmeasure, axis=1)

# View results
print("Average ROUGE-L:", df['ROUGE_L'].mean())


Average ROUGE-L: 0.13528916472416472


In [7]:
import pandas as pd
import ast
import numpy as np
from rouge_score import rouge_scorer
from pyswarms.single import GlobalBestPSO
from functools import partial

# Step 1: Load score files
rake = pd.read_csv("rake_word_scores.csv")
textrank = pd.read_csv("textrank_word_scores.csv")
cfiof = pd.read_csv("cf_iof_word_scores.csv")
textdata = pd.read_csv("preprocessed_text.csv")

# Step 2: Parse dictionaries
rake['RAKE_Word_Scores'] = rake['RAKE_Word_Scores'].apply(ast.literal_eval)
textrank['TextRank_Word_Scores'] = textrank['TextRank_Word_Scores'].apply(ast.literal_eval)
cfiof['CF_IOF_Word_Scores'] = cfiof['CF_IOF_Word_Scores'].apply(ast.literal_eval)
textdata['Preprocessed_Text'] = textdata['Preprocessed_Text'].apply(ast.literal_eval)
textdata['Preprocessed_Summary'] = textdata['Preprocessed_Summary'].apply(ast.literal_eval)

# Step 3: Build ID-based dicts
rake_dict = dict(zip(rake['Id'], rake['RAKE_Word_Scores']))
textrank_dict = dict(zip(textrank['Id'], textrank['TextRank_Word_Scores']))
cfiof_dict = dict(zip(cfiof['Id'], cfiof['CF_IOF_Word_Scores']))
ref_summaries = dict(zip(textdata['Id'], textdata['Preprocessed_Summary']))

# Step 4: Fitness Function (maximize ROUGE-L)
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def fitness(weights, rake_scores, textrank_scores, cfiof_scores, refs):
    fitness_scores = []

    for particle in weights:
        w1, w2, w3 = particle
        rouge_list = []

        for idx in rake_scores.keys():
            # Combine scores
            combined = {}
            words = set(rake_scores[idx]) | set(textrank_scores[idx]) | set(cfiof_scores[idx])
            for word in words:
                r = rake_scores[idx].get(word, 0)
                t = textrank_scores[idx].get(word, 0)
                c = cfiof_scores[idx].get(word, 0)
                combined[word] = w1*r + w2*t + w3*c

            top_k=len(refs[idx])
            top_words = sorted(combined.items(), key=lambda x: x[1], reverse=True)[:top_k]
            gen_summary = ' '.join([w for w, _ in top_words])
            ref_summary = ' '.join(refs[idx])

            rouge = scorer.score(ref_summary, gen_summary)['rougeL'].fmeasure
            rouge_list.append(rouge)

        fitness_scores.append(np.mean(rouge_list))

    return -np.array(fitness_scores)  # negative for minimization

# Step 5: PSO Optimization
bounds = (np.array([0, 0, 0]), np.array([1, 1, 1]))  # Weights in [0,1]
objective = partial(fitness, rake_scores=rake_dict, textrank_scores=textrank_dict,
                    cfiof_scores=cfiof_dict, refs=ref_summaries)

optimizer = GlobalBestPSO(n_particles=15, dimensions=3,
                          options={'c1': 0.5, 'c2': 0.3, 'w': 0.9},
                          bounds=bounds)

best_cost, best_weights = optimizer.optimize(objective, iters=50)

print("\n✅ Best Weights Found:")
print("RAKE    (w₁):", round(best_weights[0], 4))
print("TextRank(w₂):", round(best_weights[1], 4))
print("CF-IOF  (w₃):", round(best_weights[2], 4))
print("Best ROUGE-L Score:", round(-best_cost, 4))  # Reverse sign back


2025-06-12 16:46:20,533 - absl - INFO - Using default tokenizer.
2025-06-12 16:46:20,547 - pyswarms.single.global_best - INFO - Optimize for 50 iters with {'c1': 0.5, 'c2': 0.3, 'w': 0.9}
pyswarms.single.global_best: 100%|██████████|50/50, best_cost=-0.138
2025-06-12 16:59:18,681 - pyswarms.single.global_best - INFO - Optimization finished | best cost: -0.13769616106116106, best pos: [0.25608473 0.30491818 0.01127086]



✅ Best Weights Found:
RAKE    (w₁): 0.2561
TextRank(w₂): 0.3049
CF-IOF  (w₃): 0.0113
Best ROUGE-L Score: 0.1377


In [9]:

def get_final_feature_scores(rake_scores, textrank_scores, cfiof_scores, ref_summary_tokens,
                             weights=(0.4, 0.3, 0.3)):
    w1, w2, w3 = weights
    all_tokens = set(rake_scores) | set(textrank_scores) | set(cfiof_scores)

    final_scores = {}
    for token in all_tokens:
        r = rake_scores.get(token, 0)
        t = textrank_scores.get(token, 0)
        c = cfiof_scores.get(token, 0)
        final_score = w1 * r + w2 * t + w3 * c
        final_scores[token] = round(final_score, 4)

    k = len(ref_summary_tokens)
    top_k = sorted(final_scores.items(), key=lambda x: x[1], reverse=True)[:k]

    return dict(top_k)

# ✅ Generate top-k feature scores for each review
results = []

for review_id in rake_dict.keys():
    final_scores = get_final_feature_scores(
        rake_dict[review_id],
        textrank_dict[review_id],
        cfiof_dict[review_id],
        ref_summaries[review_id],
        weights=best_weights
    )
    results.append({'Id': review_id, 'Final_Scores': final_scores})

# ✅ Save to CSV
final_df = pd.DataFrame(results)
final_df.to_csv("final_weighted_scores.csv", index=False)

print("✅ Saved final_weighted_scores.csv with top-k token scores per review.")

✅ Saved final_weighted_scores.csv with top-k token scores per review.
