In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import cross_val_predict
import os

In [2]:
# Input files, keyed by a short source name. Each file is read as
# tab-separated text with the three columns named explicitly below, and the
# source name is stored in an extra `source` column of the combined frame.
filepath_dict = {'random_labels':   'processed_text/randomly_labelled_data.txt',
                 'labelled_data':   'processed_text/labelled_data.txt'}

# Load each file into its own frame (kept under a separate name so the
# combined `df` is only ever bound to the final, concatenated result).
df_list = []
for source, filepath in filepath_dict.items():
    source_df = pd.read_csv(filepath, names=['citation', 'sentence', 'label'], sep='\t')
    source_df['source'] = source  # Add another column filled with the source name
    df_list.append(source_df)

# ignore_index=True gives the combined frame a single unique 0..n-1 index.
# Without it every file contributes its own 0..n-1 labels, so the same index
# label would refer to one row per source.
df = pd.concat(df_list, ignore_index=True)

# Make sure it loaded properly
print(df.iloc[0])

citation    urn:cts:greekLit:tlg0032.tlg006.perseus-grc2@1...
sentence    Δαρείου καὶ Παρυσάτιδος γίγνονται παῖδες δύο, ...
label                                                       1
source                                          random_labels
Name: 0, dtype: object


In [3]:
# For every data source: turn its sentences into bag-of-words count vectors,
# obtain 10-fold cross-validated logistic-regression predictions, print the
# confusion matrix, and write the citation of every misclassified sentence to
# processed_text/mistakes_from_<source>.txt.
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    citations = df_source['citation'].values
    y = df_source['label'].values

    print('Converting '+source+' text to vectors...')
    vectorizer = CountVectorizer()
    vectors = vectorizer.fit_transform(sentences)

    classifier = LogisticRegression(max_iter = 1000)

    print('Generating cross-validation predictions from '+source+' data...')
    predictions = cross_val_predict(classifier, vectors, y, cv = 10, n_jobs = -1)

    print('Generating confusion matrix for '+source+' predictions...')
    cm = metrics.confusion_matrix(y, predictions)
    print(cm)

    # Positions (within this source's arrays) where the cross-validated
    # prediction disagrees with the true label.
    mistake_locations = [position
                         for position, (truth, guess) in enumerate(zip(y, predictions))
                         if truth != guess]
    mistakes = [sentences[position] for position in mistake_locations]
    print('Successfully generated mistake locations in '+source+'!')

    # Look citations up by position rather than by searching for the sentence
    # text: a text search returns the first matching sentence, which is the
    # wrong citation whenever the same sentence occurs more than once, and it
    # rescans the whole sentence list for every mistake.
    mistake_citations = [citations[position] for position in mistake_locations]
    print('Successfully generated mistake citations for '+source+' text!')

    # A source with no misclassifications has no sample to show.
    if mistake_citations:
        print('Here is a sample: '+mistake_citations[0])
    else:
        print('No misclassified sentences found in '+source+'.')

    # Save the list of errors to a file.
    # NOTE: this overwrites any existing file with the same name.
    # The context manager closes the file even if a write fails.
    with open('processed_text/mistakes_from_'+source+'.txt', 'w', encoding="utf-8") as cleanfile:
        for item in mistake_citations:
            cleanfile.write(item+"\n")

    print('Successfully written to the processed_text folder!')
    print()

Converting random_labels text to vectors...
Generating cross-validation predictions from random_labels data...
Generating confusion matrix for random_labels predictions...
[[4599 5130]
 [4913 4973]]
Successfully generated mistake locations in random_labels!
Successfully generated mistake citations for random_labels text!
Here is a sample: urn:cts:greekLit:tlg0032.tlg006.perseus-grc2@1.1.1
Successfully written to the processed_text folder!

Converting labelled_data2 text to vectors...
Generating cross-validation predictions from labelled_data2 data...
Generating confusion matrix for labelled_data2 predictions...
[[ 8352   733]
 [  488 10042]]
Successfully generated mistake locations in labelled_data2!
Successfully generated mistake citations for labelled_data2 text!
Here is a sample: urn:cts:greekLit:tlg0032.tlg006.perseus-grc2@1.3.4
Successfully written to the processed_text folder!

