In [1]:
import pandas as pd
import os
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from nltk import ngrams
import datetime
import pprint
import numpy as np

In [2]:
# Raise the display limits so the exploration tables below are not truncated.
pd.options.display.max_rows = 1200
pd.options.display.max_columns = 30

In [3]:
# Read the 'ESC12' sheet of the homicide workbook into the working frame.
df = pd.read_excel(
    'ESC12_HomicideDatabaseFinalEng.xlsx',
    sheet_name='ESC12',
)

## Manual data exploration

In [5]:
# Number of 'MOT' rows without a name/description.
df.loc[df['ESC12'] == 'MOT', 'Name (Description)'].isna().sum()

21

In [6]:
# Number of 'CO' rows without a name/description.
df.loc[df['ESC12'] == 'CO', 'Name (Description)'].isna().sum()

3

Rename the 4 empty protagonists to "Unknown"

In [7]:
# Show the 'CO' rows whose name/description is missing.
df[(df['ESC12'] == 'CO') & df['Name (Description)'].isna()]

Unnamed: 0,Case ID,ESC12,Entity type,Name (Description),Remarks,Time frame,RH,SYM,Unnamed: 8,Country,City,Static location,Unnamed: 12,Begin,End,Unnamed: 15,Gender,Age
498,48.0,CO,?,,,2009-11-20 00:00:00,False,False,,,,,,,,,,
840,80.0,CO,?,,,,,,,,,,,,,,,
870,83.0,CO,?,,,,,,,,,,,,,,,


In [8]:
# Show the 'PG' rows that are labelled 'Unknown'.
df[(df['ESC12'] == 'PG') & (df['Name (Description)'] == 'Unknown')]

Unnamed: 0,Case ID,ESC12,Entity type,Name (Description),Remarks,Time frame,RH,SYM,Unnamed: 8,Country,City,Static location,Unnamed: 12,Begin,End,Unnamed: 15,Gender,Age
86,11.0,PG,Person,Unknown,Familie van de slachtoffers,2015-09-02 00:00:00,False,False,,,,,,,,,Male,20.0
215,23.0,PG,Person,Unknown,,2013-01-02 00:00:00,False,False,,,,,,,,,Female,59.0
841,80.0,PG,Person,Unknown,Niet bekend,,,,,,,,,,,,,
871,83.0,PG,Person,Unknown,,,,,,,,,,,,,,


In [9]:
# Show the 'MEANS' rows that are labelled 'Unknown'.
df[(df['ESC12'] == 'MEANS') & (df['Name (Description)'] == 'Unknown')]

Unnamed: 0,Case ID,ESC12,Entity type,Name (Description),Remarks,Time frame,RH,SYM,Unnamed: 8,Country,City,Static location,Unnamed: 12,Begin,End,Unnamed: 15,Gender,Age
1012,96.0,MEANS,Weapon,Unknown,,2016-02-04 00:00:00,False,False,,,,,,,,,,
1044,99.0,MEANS,Weapon,Unknown,,,,,,,,,,,,,,
1054,100.0,MEANS,Weapon,Unknown,,,,,,,,,,,,,,


In [10]:
# Show the 'M.O.' rows whose name/description is missing.
df[(df['ESC12'] == 'M.O.') & df['Name (Description)'].isna()]

Unnamed: 0,Case ID,ESC12,Entity type,Name (Description),Remarks,Time frame,RH,SYM,Unnamed: 8,Country,City,Static location,Unnamed: 12,Begin,End,Unnamed: 15,Gender,Age
60,8.0,M.O.,Action,,Jan R. komt terug van een kerkdienst als hij d...,2015-09-20 00:00:00,False,False,,,,,,,,,,
70,9.0,M.O.,Action,,Engin C. heeft op enkele meters van Emad vanda...,2005-11-13 00:00:00,False,False,,,,,,,,,,
81,10.0,M.O.,Action,,Voor de woning van Kees Houtman werd hij neerg...,2005-11-02 00:00:00,False,False,,,,,,,,,,
876,83.0,M.O.,,,,,,,,,,,,,,,,
907,86.0,M.O.,Action,,Sara G. was een natuurgenezeres. Carmen was ee...,2017-04-12 00:00:00,True,False,,,,,,,,,,
981,93.0,M.O.,,,,,,,,,,,,,,,,
1045,99.0,M.O.,,,,,,,,,,,,,,,,


In [12]:
# Distinct city values recorded on the 'AR' rows.
df.loc[df['ESC12'] == 'AR', 'City'].unique()

array(['Leiden', 'Veenklooster', 'Alphen aan den Rijn',
       'Capelle aan den Ijssel', 'Hoogerheide', 'Zoetermeer', 'Nieuwstad',
       'Osdorp', 'Veghel', 'Amsterdam', 'Zeewolde', 'Brunssummerheide',
       'Hilversum', 'Poortugaal', 'Maastricht', 'Achterveld',
       'Bunschoten', 'Abbertbos', 'Roermond', 'Assen', 'Ijsselstein',
       'Doorn', 'Baarn', 'Erp', 'Zwijndrecht', 'Schiedam', 'Enschede',
       'Nijmegen', 'Spijkenisse', 'Rotterdam', 'Putten', 'Brunssum',
       'Loosduinen', 'Laren', 'Utrecht', 'Tübbern', 'Sittard',
       'Wageningen', 'Almere', 'Nieuw-Vennep', 'Leeuwarden', 'Urk',
       'Groningen', 'Dordrecht', 'Zevenbergen', 'Amersfoort', 'Hengelo',
       'Arnhem', 'Wilvervank', 'Marum', 'Oosterbeek', 'Geldrop',
       'Voorburg', 'Rotterdam-West', 'Bemmel', 'Enkhuizen',
       'Vortum-Mullem', 'Kaatsheuvel', 'Beverwijk', 'Landsmeer',
       'Bilthoven', 'Winterswijk', 'Vlaardingen', 'Amsterdam ',
       'Rotterdam-Beverwaard', 'Mierlo', 'Zeeheldenkwartier', 'Rill

### There are typos in some of the cities. Manually fix these in the excel file. For example:

- Alpen aan den Rijn should be Alphen aan den Rijn (fix this in the corpora as well: the "aan den" was wrong there, so search for "a / d" and replace it with "aan den")
- Cappele aan den Ijssel should be Capelle aan den IJssel

# Custom Evaluation Metric code

In [13]:
# Directory that holds the model output files. The default is a machine-specific
# path; set the HOMICIDE_MODEL_OUTPUT_DIR environment variable to run elsewhere
# without editing the notebook.
MODEL_OUTPUT_DIR = os.environ.get('HOMICIDE_MODEL_OUTPUT_DIR', r'C:\Thesis\homicide_models_outputs')

In [14]:
# Directory that holds the input corpora. The default is a machine-specific path;
# set the HOMICIDE_INPUT_DIR environment variable to run elsewhere without
# editing the notebook.
HOMICIDE_INPUT_DIR = os.environ.get('HOMICIDE_INPUT_DIR', r'C:\Thesis\homicide_corpora')

In [15]:
# Case IDs left out of the evaluation because some of their ESC labels are missing.
skip_ids = [
    6, 7, 8, 9, 10, 11,
    23, 48,
    80, 83, 86,
    93, 96, 99, 100,
]

In [24]:
class Case:
    """Holds the labels and match scores of one case, identified by ``ID``.

    For each of the components AR, TF, PG, AG and MO the instance carries
    three attributes: the label list (``AR``), the list of match results
    (``AR_score``) and an average (``ar_score_avg``, initially 0).
    ``final_score`` and ``final_score_relative`` also start at 0.
    """

    def __init__(self, ID):
        self.ID = ID
        # Same three attributes for every component, created in a fixed order.
        for component in ('AR', 'TF', 'PG', 'AG', 'MO'):
            setattr(self, component, [])
            setattr(self, component + '_score', [])
            setattr(self, component.lower() + '_score_avg', 0)
        self.final_score = 0
        self.final_score_relative = 0

In [25]:
def _best_fuzzy_match(term, candidates):
    """Return the best token-sort-ratio match for `term` among `candidates`.

    `process.extractOne` returns None when `candidates` is empty; in that case
    a zero-score placeholder ('', 0) is returned so that later code can always
    read the score at index 1.
    """
    result = process.extractOne(term, candidates, scorer=fuzz.token_sort_ratio)
    return result if result is not None else ('', 0)


def _best_ngram_match(term, tokens):
    """Match `term` against all word n-grams of `tokens` that have as many words as `term`."""
    n_gram_size = len(term.split())
    candidates = [' '.join(gram) for gram in ngrams(tokens, n_gram_size)]
    return _best_fuzzy_match(term, candidates)


def _case_labels(case_rows, component, column='Name (Description)'):
    """Return the `column` values of the rows in `case_rows` whose 'ESC12' equals `component`."""
    return case_rows.loc[case_rows['ESC12'] == component, column].tolist()


def generate_cases(corpus):
    """Extract the ESC labels for every text in `corpus` and fuzzy-match them against that text.

    The text at position i belongs to case ID i + 1; IDs listed in `skip_ids`
    are skipped. Labels are read from the module-level frame `df`.

    For each remaining case a `Case` is filled with:
      * AR - lower-cased, stripped 'City' values of the 'AR' rows,
      * TF - 'Name (Description)' values of the 'TF' rows; only datetime values
        are scored, by matching their four-digit year against single tokens,
      * PG / AG - lower-cased, stripped names of the 'PG' / 'AG' rows,
      * MO - names of the 'M.O.' rows, split on ',' then lower-cased and stripped.
    Every non-TF label is matched against word n-grams of the text that have
    the same number of words as the label.

    Returns the list of `Case` objects, in corpus order.
    """
    ESC_corpus = []

    for i, corpus_case in enumerate(corpus):
        case_id = i + 1
        if case_id in skip_ids:
            continue

        case = Case(case_id)
        # Tokenise the text and select this case's rows once, not once per component.
        tokens = corpus_case.split()
        case_rows = df.loc[df['Case ID'] == float(case_id)]

        case.AR = [x.lower().strip() for x in _case_labels(case_rows, 'AR', 'City')]
        for arena in case.AR:
            case.AR_score.append(_best_ngram_match(arena, tokens))

        case.TF = _case_labels(case_rows, 'TF')
        for date in case.TF:
            # Non-datetime time frames are kept in case.TF but get no score.
            if isinstance(date, datetime.datetime):
                case.TF_score.append(_best_fuzzy_match(date.strftime('%Y'), tokens))

        case.PG = [x.lower().strip() for x in _case_labels(case_rows, 'PG')]
        for protagonist in case.PG:
            case.PG_score.append(_best_ngram_match(protagonist, tokens))

        case.AG = [x.lower().strip() for x in _case_labels(case_rows, 'AG')]
        for antagonist in case.AG:
            case.AG_score.append(_best_ngram_match(antagonist, tokens))

        # One M.O. cell may hold several comma-separated entries.
        case.MO = [
            part.lower().strip()
            for entry in _case_labels(case_rows, 'M.O.')
            for part in entry.split(',')
        ]
        for modus in case.MO:
            case.MO_score.append(_best_ngram_match(modus, tokens))

        ESC_corpus.append(case)

    return ESC_corpus

In [26]:
def calculate_corpus_scores(corpus):
    """Compute per-component average scores and a final score for each input case.

    `corpus` is an iterable of case objects carrying the match-result lists
    ``AR_score``, ``TF_score``, ``PG_score``, ``AG_score`` and ``MO_score``;
    the score of a match is its element at index 1. Results are written back
    onto the case objects; nothing is returned.

    A component is accepted when it has at least one match and every match
    reaches the component's cut-off; its average is then stored in
    ``<component>_score_avg``. The time-frame average must additionally be at
    least 85. If any component is rejected the case is discarded
    (``final_score = -1``); otherwise ``final_score`` is the mean of the five
    averages.

    A component without any match result discards the case instead of raising
    ZeroDivisionError, and a missing (None) match counts as score 0.
    """
    # (component, per-match cut-off, cut-off is inclusive, minimum accepted average)
    # NOTE(review): AG uses a strict '>' while every other component uses '>=';
    # kept as found - confirm this is intended.
    rules = (
        ('AR', 65, True, None),
        ('TF', 75, True, 85),
        ('PG', 70, True, None),
        ('AG', 75, False, None),
        ('MO', 75, True, None),
    )

    for corpus_case in corpus:
        discard_case = False

        for component, cutoff, inclusive, min_average in rules:
            matches = getattr(corpus_case, component + '_score')
            scores = [0 if match is None else match[1] for match in matches]

            if inclusive:
                accepted = all(score >= cutoff for score in scores)
            else:
                accepted = all(score > cutoff for score in scores)

            # all() is True for an empty list, so emptiness is checked explicitly.
            if not scores or not accepted:
                discard_case = True
                continue

            average = sum(scores) / len(scores)
            setattr(corpus_case, component.lower() + '_score_avg', average)

            # if the average score of the time frames is below 85 we discard the case
            if min_average is not None and average < min_average:
                discard_case = True

        if discard_case:
            corpus_case.final_score = -1
        else:
            corpus_case.final_score = (corpus_case.ar_score_avg +
                                       corpus_case.tf_score_avg +
                                       corpus_case.pg_score_avg +
                                       corpus_case.ag_score_avg +
                                       corpus_case.mo_score_avg) / 5

In [27]:
def calculate_model_scores(homicide_corpus, model_corpus):
    """Score each summary case relative to the matching input-text case.

    The two sequences are paired position by position. Results are written
    onto the objects in `model_corpus`; nothing is returned.

    If the input case was discarded (``final_score == -1``) the summary case
    gets ``final_score_relative = -1`` and is otherwise left untouched.
    Otherwise, for every component the summary's average match score (index 1
    of each match result) is computed, reset to 0 when it is below the
    component's floor, and stored in ``<component>_score_avg``.
    ``final_score`` is the mean of the five averages and
    ``final_score_relative`` the mean of the five summary/input ratios.

    A component with no match results averages to 0 instead of raising
    ZeroDivisionError, a missing (None) match counts as score 0, and an input
    average of 0 contributes a ratio of 0.
    """
    # (component, floor below which the summary average counts as 0)
    # NOTE(review): the MO floor here is 70 while the input-side check uses 75;
    # kept as found - confirm this is intended.
    floors = (('AR', 65), ('TF', 75), ('PG', 70), ('AG', 75), ('MO', 70))

    for corpus_case, model_case in zip(homicide_corpus, model_corpus):
        if corpus_case.final_score == -1:
            model_case.final_score_relative = -1
            continue

        score_total = 0
        ratio_total = 0
        for component, floor in floors:
            average_name = component.lower() + '_score_avg'
            matches = getattr(model_case, component + '_score')
            scores = [0 if match is None else match[1] for match in matches]

            average = sum(scores) / len(scores) if scores else 0
            if average < floor:
                average = 0
            setattr(model_case, average_name, average)

            reference = getattr(corpus_case, average_name)
            score_total += average
            ratio_total += (average / reference) if reference else 0

        model_case.final_score = score_total / 5
        model_case.final_score_relative = ratio_total / 5

In [28]:
# Settings of the evaluated runs; all values are strings because they are
# concatenated into file names.
selection_size = [str(size) for size in range(2, 6)]
truncation_size = ['500', '1000']
models = ['himap', 'transformer', 'textrank']

# Preview the (selection, truncation, model) combinations; truncation varies slowest.
[
    (selection, truncation, model)
    for truncation in truncation_size
    for selection in selection_size
    for model in models
]

[('2', '500', 'himap'),
 ('2', '500', 'transformer'),
 ('2', '500', 'textrank'),
 ('3', '500', 'himap'),
 ('3', '500', 'transformer'),
 ('3', '500', 'textrank'),
 ('4', '500', 'himap'),
 ('4', '500', 'transformer'),
 ('4', '500', 'textrank'),
 ('5', '500', 'himap'),
 ('5', '500', 'transformer'),
 ('5', '500', 'textrank'),
 ('2', '1000', 'himap'),
 ('2', '1000', 'transformer'),
 ('2', '1000', 'textrank'),
 ('3', '1000', 'himap'),
 ('3', '1000', 'transformer'),
 ('3', '1000', 'textrank'),
 ('4', '1000', 'himap'),
 ('4', '1000', 'transformer'),
 ('4', '1000', 'textrank'),
 ('5', '1000', 'himap'),
 ('5', '1000', 'transformer'),
 ('5', '1000', 'textrank')]

In [29]:
# Accumulators filled by the scoring loop: one list of cases per run.
homicide_scores_list, models_scores_list = [], []

In [30]:
# Score every (selection, truncation, model) run: load the input corpus and the
# matching model output, extract the ESC matches for both, then score them.
for s, t, m in [(sel, trunc, name) for trunc in truncation_size for sel in selection_size for name in models]:
    corpus_name = '_'.join(('corpus', s, t))
    corpus_path = os.path.join(HOMICIDE_INPUT_DIR, corpus_name)

    homicide_corpus = []
    print("Loading " + corpus_path)
    with open(corpus_path, encoding='utf-8') as doc:
        homicide_corpus = list(doc)

    model_corpus = []
    # textrank output files carry no '_step_20000' suffix in their name
    MODEL = m + '_' + t if m == 'textrank' else m + '_' + t + '_step_20000'

    output_path = os.path.join(MODEL_OUTPUT_DIR, MODEL + "_" + corpus_name + ".output")
    print("Loading " + output_path)
    with open(output_path, encoding='utf-8') as doc:
        # the first two characters of every output line are dropped
        model_corpus = [line[2:] for line in doc]

    ESC_homicide_corpus = generate_cases(homicide_corpus)
    ESC_model_corpus = generate_cases(model_corpus)

    calculate_corpus_scores(ESC_homicide_corpus)
    calculate_model_scores(ESC_homicide_corpus, ESC_model_corpus)

    homicide_scores_list.append(ESC_homicide_corpus)
    models_scores_list.append(ESC_model_corpus)
    print()

Loading C:\Thesis\homicide_corpora\corpus_2_500
Loading C:\Thesis\homicide_models_outputs\himap_500_step_20000_corpus_2_500.output

Loading C:\Thesis\homicide_corpora\corpus_2_500
Loading C:\Thesis\homicide_models_outputs\transformer_500_step_20000_corpus_2_500.output

Loading C:\Thesis\homicide_corpora\corpus_2_500
Loading C:\Thesis\homicide_models_outputs\textrank_500_corpus_2_500.output

Loading C:\Thesis\homicide_corpora\corpus_3_500
Loading C:\Thesis\homicide_models_outputs\himap_500_step_20000_corpus_3_500.output

Loading C:\Thesis\homicide_corpora\corpus_3_500
Loading C:\Thesis\homicide_models_outputs\transformer_500_step_20000_corpus_3_500.output

Loading C:\Thesis\homicide_corpora\corpus_3_500
Loading C:\Thesis\homicide_models_outputs\textrank_500_corpus_3_500.output

Loading C:\Thesis\homicide_corpora\corpus_4_500
Loading C:\Thesis\homicide_models_outputs\himap_500_step_20000_corpus_4_500.output

Loading C:\Thesis\homicide_corpora\corpus_4_500
Loading C:\Thesis\homicide_model

In [32]:
# Shallow copy of the per-run model results; the count should equal the number of runs.
combinations_list_model = list(models_scores_list)
print(len(combinations_list_model))

24


In [33]:
# Column labels "<selection>_<truncation>_<model>", derived from the run grid in
# the same nesting order the scoring loop used, so the labels cannot drift from
# the order in which the results were appended.
model_score_columns = ['_'.join((s, t, m)) for t in truncation_size for s in selection_size for m in models]
assert len(model_score_columns) == len(combinations_list_model), "expected one result list per run"

# One row per case, one column per run, holding the relative score of each summary.
model_scores_df = pd.DataFrame(
    list(zip(*[[case.final_score_relative for case in cases] for cases in combinations_list_model])),
    columns=model_score_columns,
)

In [34]:
# Case IDs 1..100 that were actually scored (everything not in skip_ids).
clean_id_list = [case_id for case_id in range(1, 101) if case_id not in skip_ids]

In [35]:
# Attach the case IDs of the scored cases as an 'ID' column.
model_scores_df = model_scores_df.assign(ID=np.array(clean_id_list))

In [36]:
# Move 'ID' to the front; the remaining columns keep their order.
cols = ['ID'] + [name for name in model_scores_df.columns if name != 'ID']
model_scores_df = model_scores_df.loc[:, cols]

Keep only the cases that have a valid score for all model-selection-truncation combinations

In [37]:
# Index of the rows where at least one run has the discard marker -1.
# Every score column is checked at once; 'ID' is excluded because it is not a score.
model_scores_df_index = model_scores_df[
    (model_scores_df.drop(columns='ID') == -1).any(axis=1)
].index

In [38]:
# Preview the cases that have a valid score in every run.
model_scores_df.drop(index=model_scores_df_index)

Unnamed: 0,ID,2_500_himap,2_500_transformer,2_500_textrank,3_500_himap,3_500_transformer,3_500_textrank,4_500_himap,4_500_transformer,4_500_textrank,5_500_himap,5_500_transformer,5_500_textrank,2_1000_himap,2_1000_transformer,2_1000_textrank,3_1000_himap,3_1000_transformer,3_1000_textrank,4_1000_himap,4_1000_transformer,4_1000_textrank,5_1000_himap,5_1000_transformer,5_1000_textrank
1,2,0.366138,0.785185,0.8,0.35873,0.772,1.0,0.6,0.741312,0.8,0.8,0.972,1.0,0.76,0.54,0.989,0.97672,0.8,1.0,0.57672,0.57672,1.0,0.369312,1.0,1.0
3,4,0.4,0.341333,0.779333,0.4,0.4,0.6,0.4,0.6,0.788667,0.6,0.6,0.765333,0.4,0.344667,0.779333,0.4,0.564,0.779333,0.4,0.4,0.779333,0.6,0.366667,0.757333
4,5,0.95,0.8,1.0,0.6,0.972,1.0,0.8,0.8,0.8,1.0,0.8,0.8,0.972,0.8,1.0,0.95,0.6,1.0,0.75,0.972,1.0,0.546,0.764,0.8
5,12,0.924,0.966,1.0,0.8,0.6,1.0,0.8,0.832558,1.0,0.8,0.766,1.0,0.938,0.566,0.972,0.738,0.566,0.972,0.928,0.998558,1.0,0.935767,0.8,1.0
7,14,0.4,0.934,0.8,0.4,1.0,1.0,0.4,0.6,1.0,0.572,1.0,1.0,0.8,0.6,0.8,0.8,0.734,0.8,0.8,0.96,0.8,0.8,0.894,0.8
8,15,1.0,0.56,1.0,0.8,0.772,1.0,0.75,0.8,1.0,0.8,0.734,1.0,1.0,0.8,1.0,0.8,0.75102,1.0,0.8,0.75102,1.0,0.8,0.71102,1.0
18,26,0.6,0.787912,0.8,0.794505,0.991892,1.0,0.794872,0.4,0.8,0.794872,0.784615,0.8,0.979487,0.632432,0.784615,0.784615,0.597297,0.8,0.994872,0.575385,0.8,0.594872,0.594872,0.8
23,31,0.2,1.0,0.8,0.4,0.8,0.8,0.4,0.6,0.6,0.0,0.8,0.6,0.8,0.4,0.942,0.6,1.0,0.8,0.8,1.0,0.8,0.8,0.6,0.6
25,33,0.6,0.8,1.0,0.795,0.535,0.975,0.77,0.775,0.975,0.572,0.775,0.995,0.975,0.738077,1.0,0.77,0.728,0.97,0.375,0.77,0.97,0.769,0.759,0.995
28,36,0.8,0.6,1.0,0.8,1.0,1.0,0.6,0.6,1.0,0.8,0.6,1.0,0.4,0.6,1.0,0.8,0.4,1.0,0.8,0.2,1.0,0.6,0.572,1.0


In [703]:
# Write the cases with a valid score in every run to an Excel file.
model_scores_valid = model_scores_df.drop(index=model_scores_df_index)
model_scores_valid.to_excel('homicide_models_scores.xlsx')

In [41]:
# Shallow copy of the per-run input-corpus results; the count should equal the number of runs.
combinations_list_homicide = list(homicide_scores_list)
print(len(combinations_list_homicide))

24


In [42]:
# Column labels "<selection>_<truncation>_<model>", derived from the run grid in
# the same nesting order the scoring loop used, so the labels cannot drift from
# the order in which the results were appended.
corpus_score_columns = ['_'.join((s, t, m)) for t in truncation_size for s in selection_size for m in models]
assert len(corpus_score_columns) == len(combinations_list_homicide), "expected one result list per run"

# One row per case, one column per run, holding the final score of each input text.
corpus_scores_df = pd.DataFrame(
    list(zip(*[[case.final_score for case in cases] for cases in combinations_list_homicide])),
    columns=corpus_score_columns,
)

In [43]:
# Attach the case IDs of the scored cases as an 'ID' column.
corpus_scores_df = corpus_scores_df.assign(ID=np.array(clean_id_list))

In [44]:
# Move 'ID' to the front; the remaining columns keep their order.
cols = ['ID'] + [name for name in corpus_scores_df.columns if name != 'ID']
corpus_scores_df = corpus_scores_df.loc[:, cols]

In [45]:
# Index of the rows where at least one run has the discard marker -1.
# Every score column is checked at once; 'ID' is excluded because it is not a score.
corpus_scores_df_index = corpus_scores_df[
    (corpus_scores_df.drop(columns='ID') == -1).any(axis=1)
].index

In [684]:
# Write the cases with a valid score in every run to an Excel file.
corpus_scores_valid = corpus_scores_df.drop(index=corpus_scores_df_index)
corpus_scores_valid.to_excel('homicide_corpus_scores.xlsx')