<a href="https://colab.research.google.com/github/hristijanpeshov/SHAP-Explainable-Lexicon-Model/blob/master/notebooks/FinBERT%20notebooks/FinBERT_coefficients_grid_search.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# User Input

In [None]:
# Folder holding the lexicon files of each lexicon source, keyed by the source name.
# Each folder must contain only lexicon files: every regular file found in it is read as a CSV.
lexicons_folder_map = {
    'nasdaq': '/content/drive/MyDrive/finbert process/nasdaq/concatenated datasets/lexicons',
    'fpb': '/content/drive/MyDrive/finbert process/fpb/concatenated datasets/lexicons',
    'sentfin': '/content/drive/MyDrive/finbert process/sentfin/concatenated datasets/lexicons'
}

# Folder holding all evaluation datasets (it must contain only evaluation files, for the same reason).
eval_datasets_folder_loc = '/content/drive/MyDrive/datasets/evaluation datasets'

# Folder in which the summaries of the best-performing coefficients are saved.
best_coef_results_loc = '/content/drive/MyDrive/'

# ShapDictModel

In [None]:
# Install Hugging Face transformers, which provides the BertTokenizer loaded later in this notebook.
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, precision_score, f1_score, recall_score, matthews_corrcoef
import re
from sklearn.metrics import confusion_matrix
import torch
import nltk
import torch
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

class ShapDictModel:
  """Score sentences against a word-level sentiment lexicon.

  Every lexicon row describes one word. Its columns are prefixed with 'TM_'
  (rows whose source value is 'OUR_WORDS') or 'LM_' (rows whose source value
  is 'LM'). For a prefix P the model reads the 'P<category_column>' column and,
  for every decision maker D, the 'P<D>' and 'Popposite_<D>' columns.
  """

  def __init__(self, dataset, tokenizer, word_column, category_column, decision_makers, count_column, dataset_source = None):
    """Store the lexicon and the names of the columns that are read from it.

    Args:
      dataset: pandas DataFrame holding the lexicon.
      tokenizer: object exposing ``tokenize(sentence)``; used for the 'TM_' scores.
      word_column: name of the column holding the words.
      category_column: un-prefixed name of the category column.
      decision_makers: un-prefixed names of the score columns that are summed.
      count_column: un-prefixed name of the count column (stored, not used for scoring).
      dataset_source: None to keep the full lexicon, otherwise 'LM' or 'OUR_WORDS'
        (case-insensitive) to keep only the rows and columns of that source.
    """
    self.dataset_source = str.upper(dataset_source) if dataset_source is not None else 'both'
    self.word_column = word_column
    self.tokenizer = tokenizer
    self.count_column = count_column

    # Lemmatizer
    self.lemmatizer = WordNetLemmatizer()

    # source column values
    self.lm_source = 'LM'
    self.tm_source = 'OUR_WORDS'

    # column prefix
    self.lm_prefix = 'LM_'
    self.tm_prefix = 'TM_'

    # category values
    self.positive_category_value = 'positive'
    self.negative_category_value = 'negative'

    # opposite prefix
    self.opposite_prefix = 'opposite_'

    # number of required coefficients when both sources or one source is used
    self.coefficient_number_both_sources = 4
    self.coefficient_number_one_source = 2

    # prefix when one source is chosen
    self.prefix = 'TM_' if self.dataset_source == self.tm_source else 'LM_' if self.dataset_source == self.lm_source else ''

    # source column postfix
    self.source_column = 'src'

    # dataset on which results are calculated
    # NOTE(review): with a single source only that source's columns are kept, while
    # predict_sentence_label still scores both prefixes - confirm single-source use is intended.
    self.dataset = dataset if self.dataset_source == 'both' else self.extract_dataset_from_source(dataset, self.dataset_source)

    # category when one source is chosen
    self.category = category_column

    # decision makers
    self.decision_makers = decision_makers
    print()
    print(f'Created ShapDictModel with decision makers: {self.decision_makers}')
    print()


  def extract_dataset_from_source(self, dataset, source, only_source_columns=True):
    """Return the rows of ``dataset`` that belong to ``source``.

    Args:
      dataset: full lexicon DataFrame with 'LM_src' and 'TM_src' columns.
      source: 'LM' selects the LM rows; any other value selects the 'OUR_WORDS' rows.
      only_source_columns: when True, keep only the word column and the columns
        whose name contains the prefix of the selected source.

    Returns:
      The filtered DataFrame.
    """
    if source == self.lm_source:
      prefix, other_prefix = self.lm_prefix, self.tm_prefix
      dataset_source, opposite_dataset_source = self.lm_source, self.tm_source
    else:
      prefix, other_prefix = self.tm_prefix, self.lm_prefix
      dataset_source, opposite_dataset_source = self.tm_source, self.lm_source

    column = f'{prefix}{self.source_column}'
    opposite_column = f'{other_prefix}{self.source_column}'

    source_dataset = dataset[(dataset[column] == dataset_source) & ((dataset[opposite_column] == dataset_source) | (dataset[opposite_column] == opposite_dataset_source))]

    # filtering so just the necessary columns will remain
    if only_source_columns:
      source_columns = [self.word_column] + [name for name in source_dataset.columns if prefix in name]

      return source_dataset[source_columns]

    return source_dataset

  def calculate_score_both_dataset_sources(self, word_occurence):
    """Return (TM score, TM opposite score, LM score, LM opposite score) for one lexicon row."""
    tm_accumulated_score, tm_opposite_accumulated_score = self.calculate_dataset_source_score(word_occurence, self.tm_prefix)
    lm_accumulated_score, lm_opposite_accumulated_score = self.calculate_dataset_source_score(word_occurence, self.lm_prefix)

    return tm_accumulated_score, tm_opposite_accumulated_score, lm_accumulated_score, lm_opposite_accumulated_score

  def _word_rows(self):
    """Return a dict mapping each lexicon word to the values of its first row.

    The dict is built once per dataset object and rebuilt when ``self.dataset``
    is replaced by another object. In-place edits of the same DataFrame are not
    detected.
    """
    cache = getattr(self, '_word_rows_cache', None)
    if cache is None or cache[0] is not self.dataset:
      rows_by_word = {}
      words = self.dataset[self.word_column].values
      for word, row in zip(words, self.dataset.values):
        # setdefault keeps the first occurrence, like taking row 0 of a filtered frame
        rows_by_word.setdefault(word, row)
      cache = (self.dataset, rows_by_word)
      self._word_rows_cache = cache

    return cache[1]

  def calculate_score_one_dataset_source(self, words, prefix):
    """Sum the scores of all ``words`` for the columns starting with ``prefix``.

    Words that are not in the lexicon are skipped. When a word occurs in several
    rows, only its first row is used.

    Returns:
      Tuple (accumulated score, accumulated opposite score); (0, 0) when no word matches.
    """
    # one dictionary lookup per word instead of a full scan of the lexicon frame
    rows_by_word = self._word_rows()
    accumulated_score, opposite_accumulated_score = 0, 0

    for word in words:
      word_occurence = rows_by_word.get(word)
      if word_occurence is None:
        continue

      decision_scores = self.calculate_dataset_source_score(word_occurence, prefix)
      accumulated_score += decision_scores[0]
      opposite_accumulated_score += decision_scores[1]

    return accumulated_score, opposite_accumulated_score

  def predict_sentence_label(self, sentence, label_t):
    """Return the four raw scores of ``sentence``: (TM, TM opposite, LM, LM opposite).

    The TM scores use the words produced by ``self.tokenizer``; the LM scores use
    NLTK's ``word_tokenize``. ``label_t`` is accepted for interface compatibility
    and is not used.
    """
    tm_words = self.clean_and_lemmatize_words(self.tokenizer.tokenize(sentence))
    lm_words = self.clean_and_lemmatize_words(word_tokenize(sentence))

    tm, tmo = self.calculate_score_one_dataset_source(tm_words, self.tm_prefix)

    lm, lmo = self.calculate_score_one_dataset_source(lm_words, self.lm_prefix)

    return tm, tmo, lm, lmo

  def __get_wordnet_pos(self, word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    # tags outside the table are lemmatized as nouns
    return tag_dict.get(tag, wordnet.NOUN)

  def clean_and_lemmatize_words(self, words):
    """Lower-case, strip and lemmatize every token in ``words``."""
    # NOTE(review): 'ġ' looks like a byte-level BPE word-start marker; a WordPiece
    # tokenizer would emit '##' continuation pieces instead - confirm this cleanup
    # matches the tokenizer in use.
    lower_case_words = [str(word).lower().replace('ġ', '').strip() for word in words]
    return [self.lemmatizer.lemmatize(word, self.__get_wordnet_pos(word)) for word in lower_case_words]

  def calculate_word_dm_score(self, word_occurence, decision_maker_column, count_column):
    """Return the value stored in ``decision_maker_column`` of one lexicon row.

    ``word_occurence`` holds the row values in the column order of ``self.dataset``.
    ``count_column`` is accepted for interface compatibility and is not used.

    Raises:
      ValueError: if ``decision_maker_column`` is not a column of the dataset.
    """
    columns = list(self.dataset.columns)

    decision_maker_index = columns.index(decision_maker_column)

    value = word_occurence[decision_maker_index]

    return value

  def calculate_dataset_source_score(self, word_occurence, column_prefix):
    """Return the signed (score, opposite score) of one lexicon row for one prefix.

    A 'positive' word keeps its own score and negates the opposite score; a
    'negative' word does the reverse. Any other category contributes (0, 0).

    Raises:
      ValueError: if a required column is missing from the dataset.
    """
    columns = list(self.dataset.columns)
    category_index = columns.index(f'{column_prefix}{self.category}')

    word_category = word_occurence[category_index]

    if word_category == self.positive_category_value:
      selected_category_sign, opposite_category_sign = 1, -1
    elif word_category == self.negative_category_value:
      selected_category_sign, opposite_category_sign = -1, 1
    else:
      # neither positive nor negative: the word does not take part in the score
      return 0, 0

    opposite_column_prefix = f'{column_prefix}{self.opposite_prefix}'

    selected_category_score = 0
    opposite_category_score = 0
    for decision_maker in self.decision_makers:
      selected_category_score += self.calculate_word_dm_score(word_occurence, f'{column_prefix}{decision_maker}', f'{column_prefix}{self.count_column}') * selected_category_sign
      opposite_category_score += self.calculate_word_dm_score(word_occurence, f'{opposite_column_prefix}{decision_maker}', f'{opposite_column_prefix}{self.count_column}') * opposite_category_sign

    return selected_category_score, opposite_category_score

  def calculate_model_accuracy(self, true_labels, predicted_labels):
    """Return the fraction of positions at which the two label sequences agree."""
    accuracy_indicators = [true_label == predicted_label for true_label, predicted_label in zip(true_labels, predicted_labels)]

    return np.asarray(accuracy_indicators).sum() / len(true_labels)


  def predict_and_evaluate(self, sentences, true_labels):
    """Return one (TM, TM opposite, LM, LM opposite) score tuple per sentence.

    ``sentences`` and ``true_labels`` are walked in parallel; the labels are only
    passed through to ``predict_sentence_label``.
    """
    calculated_scores = [self.predict_sentence_label(sentence, label) for sentence, label in zip(sentences, true_labels)]

    return calculated_scores

In [None]:
import sys
import pytz

def create_summary_results(lexicon_source, lexicon_datasets, evaluation_datasets, drive_loc):
  """Score every evaluation dataset with every lexicon and save the raw scores.

  Args:
    lexicon_source: name of the lexicon source, written to the 'Lexicon Source' column.
    lexicon_datasets: dict mapping a lexicon name to its DataFrame. A lexicon whose
      name contains 'normalized' is flagged as normalized.
    evaluation_datasets: dict mapping a dataset name to a DataFrame with 'text' and
      'sentiment' columns.
    drive_loc: folder in which 'summary_df.csv' is written.

  Returns:
    DataFrame with one row per (lexicon, evaluation dataset, sentence).
  """
  summary_df_items = []
  for lexicon_name, lexicon in lexicon_datasets.items():
    lexicon_normalized = 'normalized' in lexicon_name

    for evaluate_dataset_name, evaluate_dataset in evaluation_datasets.items():
      sentences = evaluate_dataset['text'].values
      true_labels = evaluate_dataset['sentiment'].values

      # every run gets its own deep copy of the lexicon
      dataset = lexicon.copy(True)

      # extend in place instead of rebuilding the accumulated list on every pass
      summary_df_items.extend(
          evaluate(dataset, sentences, true_labels, lexicon_source, lexicon_normalized, evaluate_dataset_name))

  summary_df = pd.DataFrame(summary_df_items, columns = ['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset',
                                                         'Sentence', 'True Label', 'XLex AS', 'XLex OAS', 'LM AS', 'LM OAS'])

  summary_df.to_csv(f'{drive_loc}/summary_df.csv', index=False)

  return summary_df

def run_shap_dict_model(dataset, sentences, true_labels, dataset_source = None):
  """Build a ShapDictModel on ``dataset`` and return its raw scores for ``sentences``.

  The model reads the 'word', 'category' and 'count' columns, uses
  'average_shap_values' as its only decision maker and relies on the
  module-level ``tokenizer``.
  """
  model = ShapDictModel(
      dataset,
      tokenizer,
      'word',
      'category',
      ['average_shap_values'],
      'count',
      dataset_source = dataset_source,
  )

  return model.predict_and_evaluate(sentences, true_labels)

def evaluate(dataset, sentences, true_labels, lexicon_source, lexicon_name, evaluate_dataset_name):
  """Return one summary row per sentence.

  Each row is [lexicon_source, lexicon_name, evaluate_dataset_name, sentence,
  true label] followed by the scores the model returned for that sentence.
  """
  scores = run_shap_dict_model(dataset, sentences, true_labels)

  return [
      [lexicon_source, lexicon_name, evaluate_dataset_name, sentence, label] + list(sentence_scores)
      for sentence, label, sentence_scores in zip(sentences, true_labels, scores)
  ]

In [None]:
import itertools
from itertools import combinations
from datetime import datetime

def calc_metrics(true_labels, predicted_labels):
  """Return [accuracy, macro precision, macro recall, macro F1, MCC].

  ``zero_division=0`` returns the same value as scikit-learn's default for an
  undefined precision/recall/F1 (0), but without emitting a warning for every
  class that has no true or no predicted samples.
  """
  return [
      accuracy_score(true_labels, predicted_labels),
      precision_score(true_labels, predicted_labels, average="macro", zero_division=0),
      recall_score(true_labels, predicted_labels, average="macro", zero_division=0),
      f1_score(true_labels, predicted_labels, average="macro", zero_division=0),
      matthews_corrcoef(true_labels, predicted_labels),
  ]


def calc_label(decision_values):
  """Turn decision scores into labels: 1 if positive, 0 if negative, -1 otherwise."""
  labels = []

  for score in decision_values:
    if score > 0:
      labels.append(1)
    elif score < 0:
      labels.append(0)
    else:
      # a score of exactly zero (or one that compares as neither) is undecided
      labels.append(-1)

  return labels

def calc_segment(coefs, df):
  """Return the predicted labels of the LM, XLex and combined decision scores.

  ``coefs`` holds four weights, applied in order to the 'XLex AS', 'XLex OAS',
  'LM AS' and 'LM OAS' columns of ``df``.
  """
  xlex_as_weight, xlex_oas_weight, lm_as_weight, lm_oas_weight = coefs

  xlex_score = xlex_as_weight * df['XLex AS'] + xlex_oas_weight * df['XLex OAS']
  lm_score = lm_as_weight * df['LM AS'] + lm_oas_weight * df['LM OAS']
  combined_score = xlex_score + lm_score

  return calc_label(lm_score), calc_label(xlex_score), calc_label(combined_score)


def calc_row(coefs, dff, version, eval_df, extension = ''):
  """Return three metric rows (LMD, OUR, OUR + LMD) for one slice of raw results.

  ``coefs`` must be a list, because it is concatenated into every row.
  ``extension`` is appended to the word-source name of each row.
  """
  source_names = ('LMD', 'OUR', 'OUR + LMD')
  true_labels = dff['True Label']
  lex_name = dff['Lexicon Source'].values[0]

  # calc_segment returns its predictions in the same order as source_names
  predictions_per_source = calc_segment(coefs, dff)

  rows = []
  for predictions, source_name in zip(predictions_per_source, source_names):
    scores = calc_metrics(true_labels, predictions)
    rows.append([lex_name, version, eval_df, f'{source_name}{extension}', 'average_shap_values'] + coefs + scores)

  return rows

def calc_version(df, coefs, eval_dfs, version):
  """Return the metric rows of one lexicon version for every evaluation dataset.

  Each dataset is scored twice: on all of its sentences, and (rows suffixed
  ' on LMD') on the sentences whose 'LM AS' or 'LM OAS' score is non-zero.
  """
  rows = []

  for eval_df in eval_dfs:
    in_slice = (df['Lexicon Normalized'] == version) & (df['Evaluation Dataset'] == eval_df)
    dff = df[in_slice].copy(True).reset_index(drop=True)
    rows.extend(calc_row(coefs, dff, version, eval_df))

    has_lm_score = (dff['LM AS'] != 0) | (dff['LM OAS'] != 0)
    rows.extend(calc_row(coefs, dff[has_lm_score], version, eval_df, extension = ' on LMD'))

  return rows

def calc_df(coefs, df):
  """Return the metric rows of every lexicon version found in ``df``."""
  versions = df['Lexicon Normalized'].unique()
  eval_dfs = df['Evaluation Dataset'].unique()

  return [
      row
      for version in versions
      for row in calc_version(df, coefs, eval_dfs, version)
  ]

def get_coefs_res(df, coefs):
  """Return a dict mapping ``str(coef)`` to the metrics DataFrame of that coefficient set.

  All coefficient sets are evaluated against one deep copy of ``df``.
  """
  columns = ['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Words Source',
             'Decision Maker', 'C1', 'C2', 'C3', 'C4', 'Accuracy', 'Precision', 'Recall', 'F1', 'MCC']

  working_copy = df.copy(True)

  return {
      str(coef): pd.DataFrame(calc_df(coef, working_copy), columns = columns)
      for coef in coefs
  }


def get_metric_values(df, eval_df, normalized, metric):
  """Return the ``metric`` value of each of the six word sources, in a fixed order.

  Only rows of evaluation dataset ``eval_df`` and lexicon version ``normalized``
  are considered, and the first matching row is used for each word source.

  Raises:
    IndexError: if a word source has no matching row.
  """
  word_sources = ('LMD', 'OUR', 'OUR + LMD', 'LMD on LMD', 'OUR on LMD', 'OUR + LMD on LMD')

  # the dataset/version filter is the same for every word source
  slice_mask = (df['Evaluation Dataset'] == eval_df) & (df['Lexicon Normalized'] == normalized)

  return [
      df[slice_mask & (df['Words Source'] == word_source)][metric].values[0]
      for word_source in word_sources
  ]

def create_summary_dataset(df, metric):
  """Pivot one coefficient set's metrics into one row per (dataset, lexicon version).

  The coefficients are read from the row labelled 0 of ``df``. For every
  evaluation dataset a row is produced for the normalized lexicon first and the
  non-normalized one second, holding the value of ``metric`` for each word source.
  """
  cols = ['Lexicon Source', 'Lexicon Normalized', 'Evaluation Dataset', 'Decision Maker', 'C1', 'C2', 'C3', 'C4',
          'LM', 'XLex', 'XLex + LM', 'LM on LM', 'XLex on LM', 'XLex + LM on LM']

  source_df = df['Lexicon Source'].unique()[0]
  eval_dfs = df['Evaluation Dataset'].unique()
  coefs = list(df.loc[0, ['C1', 'C2', 'C3', 'C4']])

  rows = []
  for eval_name in eval_dfs:
    for is_normalized in (True, False):
      values = get_metric_values(df, eval_name, is_normalized, metric)
      rows.append([source_df, is_normalized, eval_name, 'average_shap_values'] + coefs + values)

  return pd.DataFrame(rows, columns = cols)


def coefficient_permutations(coefs, coefs_number):
  """Return every ordered ``coefs_number``-tuple of ``coefs`` (values may repeat)."""
  return list(itertools.product(coefs, repeat=coefs_number))

def calc_metric_acc_ratio(dff):
  """Compare each XLex column of ``dff`` with its LM baseline column.

  Returns:
    Tuple (wins, aggregates). ``wins`` holds, per comparison, the number of rows
    where the XLex value is at least the baseline. ``aggregates`` holds four
    numbers per comparison: sum, sum of differences, mean and mean of differences.
  """
  comparisons = (
      ('XLex', 'LM'),
      ('XLex + LM', 'LM'),
      ('XLex on LM', 'LM on LM'),
      ('XLex + LM on LM', 'LM on LM'),
  )

  wins = []
  aggregates = []

  for candidate_col, baseline_col in comparisons:
    candidate = dff[candidate_col]
    baseline = dff[baseline_col]

    wins.append((candidate >= baseline).sum())

    aggregates.extend([
        candidate.sum(),
        (candidate - baseline).sum(),
        candidate.mean(),
        (candidate - baseline).mean(),
    ])

  return wins, aggregates

def make_str(nums, total):
  """Format every number in ``nums`` as the text '<num>/<total>'."""
  total_text = str(total)

  return [str(num) + '/' + total_text for num in nums]

def calc_perc(nums, total):
  """Return each number in ``nums`` as a percentage of ``total``."""
  return [(num / total) * 100 for num in nums]

def calc_metrics_ratio(dff):
  """Return one comparison row per metric for a single coefficient set.

  For each of Accuracy, Precision, Recall, F1 and MCC, the metrics in ``dff``
  are pivoted with ``create_summary_dataset`` and the XLex columns are compared
  with their LM baselines.

  Returns:
    List of rows laid out as: lexicon source, metric, C1..C4, the four
    'wins/total' strings, the four win percentages, then the sum/average
    aggregates returned by ``calc_metric_acc_ratio``.
  """
  metrics = ['Accuracy', 'Precision', 'Recall', 'F1', 'MCC']
  rows = []

  for metric in metrics:
    # the summary frame is used directly; concatenating it onto an empty frame added nothing
    df_metrics = create_summary_dataset(dff, metric)

    acc_ratios, sum_acc_ratios = calc_metric_acc_ratio(df_metrics)
    total = len(df_metrics)

    init_cols = df_metrics.loc[0, ['Lexicon Source', 'C1', 'C2', 'C3', 'C4']]

    # init_cols is indexed by column name, so positional access must go through .iloc
    new_row = [init_cols.iloc[0]] + [metric] + list(init_cols.iloc[1:]) + make_str(acc_ratios, total) + calc_perc(acc_ratios, total) + sum_acc_ratios
    rows.append(new_row)

  return rows

def calc_accuracy_ratio(df, coeffs):
  """Run the grid search: one comparison row per (coefficient set, metric).

  ``df`` holds the raw per-sentence scores and ``coeffs`` is the list of
  coefficient sets to evaluate.
  """
  word_sources = ['XLex', 'XLex + LM', 'XLex on LM', 'XLex + LM on LM']
  aggregations = ['Sum Abs', 'Sum Diff', 'Avg Abs', 'Avg Diff']

  cols = ['Lexicon Source', 'Metric', 'C1', 'C2', 'C3', 'C4']
  cols += word_sources
  cols += [f'% {word_source}' for word_source in word_sources]
  # four aggregate columns per word source, in the order calc_metric_acc_ratio emits them
  cols += [f'{word_source} {aggregation} Value' for word_source in word_sources for aggregation in aggregations]

  coefs_map = get_coefs_res(df, coeffs)

  rows = []
  for coef_df in coefs_map.values():
    rows.extend(calc_metrics_ratio(coef_df))

  return pd.DataFrame(rows, columns = cols)

# Create Raw Results

In [None]:
import os
from os import listdir
from os.path import isfile, join
import pandas as pd

def extract_file_name(file_loc):
  """Return the last '/'-separated part of ``file_loc``, cut at its first dot."""
  base_name = file_loc.rsplit('/', 1)[-1]
  stem, _, _ = base_name.partition('.')

  return stem

def extract_datasets_map(datasets_location):
  """Read every file of a folder as a CSV and return the datasets keyed by file name.

  Args:
    datasets_location: folder path, with or without a trailing '/'.
      Sub-folders are ignored.

  Returns:
    Dict mapping each file name (as returned by ``extract_file_name``) to its DataFrame.

  Raises:
    FileNotFoundError: if the folder contains no files.
  """
  location = datasets_location if datasets_location[-1] == '/' else f'{datasets_location}/'
  files_locations = [join(location, f) for f in listdir(location) if isfile(join(location, f))]

  print(f'Reading datasets from: {location} ...')

  # the previous check compared the list with 0, which is always true and never fired
  if not files_locations:
    raise FileNotFoundError('No files found in the provided location')

  datasets_map = {}
  for f in files_locations:
    print(f'Reading dataset: {f} ...')
    dataset = pd.read_csv(f)
    datasets_map[extract_file_name(f)] = dataset

  print(f'Reading datasets successfully finished ...')

  return datasets_map


def create_results_folder(loc):
  """Make sure a 'raw results' folder exists next to ``loc`` and return its path.

  The folder is placed in the parent directory of ``loc`` and is created only
  when it does not exist yet.
  """
  parent_location = os.path.abspath(os.path.join(loc, os.pardir))
  if not parent_location.endswith('/'):
    parent_location = f'{parent_location}/'

  results_location = parent_location + 'raw results'

  already_there = os.path.exists(results_location)
  if not already_there:
    os.makedirs(results_location)

  print(f'Created results dataset on location: {results_location} ...')

  return results_location

In [None]:
from transformers import BertTokenizer

# Load FinBERT-Tone tokenizer
tokenizer = BertTokenizer.from_pretrained('yiyanghkust/finbert-tone')

raw_results_folder_map = {}

# The evaluation datasets do not depend on the lexicon, so they are read once instead of once per lexicon
eval_datasets_map = extract_datasets_map(eval_datasets_folder_loc)

for lexicon_name, lexicon_folder_loc in lexicons_folder_map.items():
  lexicon_datasets_map = extract_datasets_map(lexicon_folder_loc)

  # the raw results of a lexicon source are stored next to its lexicon folder
  results_folder_loc = create_results_folder(lexicon_folder_loc)

  raw_results_folder_map[lexicon_name] = results_folder_loc

  if lexicon_name == 'fpb':
    # the 'financial_phrase_bank' evaluation set is skipped for the 'fpb' lexicon
    # NOTE(review): presumably because that lexicon was built from it - confirm
    filtered_dict = {name: data for name, data in eval_datasets_map.items() if name != 'financial_phrase_bank'}
  else:
    filtered_dict = eval_datasets_map

  # creating raw results which will later be used to find the most suitable coefficients
  df = create_summary_results(lexicon_name, lexicon_datasets_map, filtered_dict, results_folder_loc)

# Metric grid search from raw results

In [None]:
# Location of the raw-results CSV of every lexicon source
summary_raw_results_map = {}

for lex_name, raw_result_folder in raw_results_folder_map.items():
  summary_raw_results_map[lex_name] = raw_result_folder + '/summary_df.csv'

In [None]:
# Grid: the first three coefficients take every combination of the five values,
# the fourth coefficient is fixed at 0.5
coefs = coefficient_permutations([0.1, 0.3, 0.5, 0.7, 0.9], 3)
full_coefs = [list(c) + [0.5] for c in coefs]


metrics_grid_search_loc_map = {}

# per-lexicon results are collected here and concatenated once after the loop
per_lexicon_results = []

for lexicon_source in summary_raw_results_map:
  summary_raw_results_loc = summary_raw_results_map[lexicon_source]

  dff = pd.read_csv(summary_raw_results_loc)

  full_dff = calc_accuracy_ratio(dff, full_coefs)

  per_lexicon_results.append(full_dff)

  # the grid-search CSV is written to the parent of the lexicon's raw-results folder
  raw_result_folder_loc = raw_results_folder_map[lexicon_source]
  parent_location = os.path.abspath(os.path.join(raw_result_folder_loc, os.pardir))
  mod_location = parent_location if parent_location[-1] == '/' else f'{parent_location}/'

  results_location = f'{mod_location}metrics_grid_search.csv'
  full_dff.to_csv(results_location, index=False)

  metrics_grid_search_loc_map[lexicon_source] = results_location

# pd.concat rejects an empty list, so fall back to an empty frame when nothing was processed
merged_df = pd.concat(per_lexicon_results, ignore_index = True) if per_lexicon_results else pd.DataFrame()

best_coef_results_final_loc = best_coef_results_loc if best_coef_results_loc[-1] == '/' else f'{best_coef_results_loc}/'
merged_df.to_csv(f'{best_coef_results_final_loc}best_coefs_all_dfs_summary.csv', index=False)

# Finding the best performance coefficients

In [None]:
def calc_xlex_lm_combs(dataset, metrics, agg_comb):
  """Return one total per entry of the module-level ``xlex_lm_combinations``.

  Each total adds up, over the word sources of that entry, the value of the
  '<word source> <agg_comb> <value_end>' column as summed by ``calc_for_metrics``.
  """
  return [
      sum(calc_for_metrics(dataset, metrics, f'{c} {agg_comb} {value_end}') for c in comb)
      for comb in xlex_lm_combinations
  ]

def calc_for_metrics(dataset, metrics, comb):
  """Sum column ``comb`` over every (metric, lexicon source) pair.

  The lexicon sources come from the module-level ``lexicon_sources`` and the
  first matching row of each pair is used.
  """
  total = 0

  for metric in metrics:
    is_metric = dataset['Metric'] == metric
    for source in lexicon_sources:
      matching_rows = dataset[(dataset['Lexicon Source'] == source) & is_metric]
      total += matching_rows[comb].values[0]

  return total

def calc_best_options(dataset):
  """Return one row per (coefficient set, aggregation, metric group).

  The coefficient sets, aggregations and metric groups are read from the
  module-level ``coefs``, ``agg_combinations``, ``accuracy_metrics``,
  ``primary_metrics`` and ``all_metrics``.
  """
  columns = ['Coefs', 'Agg Comb', 'Metric Type', 'Xlex, XLex + LM', 'XLex + LM,XLex + LM on LM', 'Xlex,XLex + LM,XLex + LM on LM', 'Xlex,XLex + LM,Xlex on LM,XLex + LM on LM']
  metric_groups = [('Accuracy', accuracy_metrics), ('Primary', primary_metrics), ('All', all_metrics)]

  rows = []

  for coef in coefs:
    coef_str = ','.join(str(value) for value in coef)

    # NOTE(review): exact float comparison - assumes the coefficients in `dataset` equal the grid values exactly
    same_coefs = (dataset['C1'] == coef[0]) & (dataset['C2'] == coef[1]) & (dataset['C3'] == coef[2]) & (dataset['C4'] == coef[3])
    coef_dataset = dataset[same_coefs].copy(True)

    for agg_comb in agg_combinations:
      for metric_type, metrics in metric_groups:
        totals = calc_xlex_lm_combs(coef_dataset, metrics, agg_comb)
        rows.append([coef_str, agg_comb, metric_type] + totals)

  return pd.DataFrame(rows, columns = columns)

In [None]:
import pandas as pd

# Make sure the output folder ends with a slash before building file paths from it
if best_coef_results_loc[-1] == '/':
  best_coef_results_final_loc = best_coef_results_loc
else:
  best_coef_results_final_loc = f'{best_coef_results_loc}/'

# Reload the merged grid-search summary that was saved for all lexicon sources
all_coefs_summary = pd.read_csv(best_coef_results_final_loc + 'best_coefs_all_dfs_summary.csv')

In [None]:
# Coefficient sets to rank: the full four-value sets of the grid search
coefs = full_coefs

# Metric groups used for ranking
accuracy_metrics = ['Accuracy']
primary_metrics = ['Accuracy', 'F1', 'MCC']
all_metrics = [*primary_metrics, 'Precision', 'Recall']

# Summary columns are named '<word source> <aggregation> <value_end>'
agg_combinations = ['Sum Abs', 'Sum Diff', 'Avg Abs', 'Avg Diff']
value_end = 'Value'
xlex_lm_combinations = [
    ['XLex + LM', 'XLex'],
    ['XLex + LM', 'XLex + LM on LM'],
    ['XLex + LM', 'XLex + LM on LM', 'XLex'],
    ['XLex + LM', 'XLex + LM on LM', 'XLex', 'XLex on LM'],
]
lexicon_sources = ['nasdaq', 'fpb', 'sentfin']


new_df = calc_best_options(all_coefs_summary)
new_df.to_csv(best_coef_results_final_loc + 'final_coef_metric_evaluation.csv')

In [None]:
# Ranking criteria: metric group, aggregation and the column(s) to sort by
metric_types = ['Primary']
agg_types = ['Avg Abs']
s_values = [['Xlex, XLex + LM']]

# Number of criteria combinations for which each coefficient set ranks first
coef_map = {}

for s in s_values:
  for agg in agg_types:
    for metr in metric_types:
      candidates = new_df[(new_df['Metric Type'] == metr) & (new_df['Agg Comb'] == agg)]
      ranked = candidates.sort_values(by = s, ascending=[False] * len(s))

      # first column of the top row is the 'Coefs' string
      comb = ranked.head(1).values[0][0]
      coef_map[comb] = coef_map.get(comb, 0) + 1

In [None]:
# Display how many times each coefficient combination ranked first
coef_map