In [6]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# Import a bunch of libraries.
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator
from sklearn.pipeline import Pipeline
from sklearn.datasets import fetch_openml
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Set the randomizer seed so results are the same each time.
np.random.seed(0)

import sklearn

In [7]:
data = pd.read_csv("data/derived_accused_cases_trials.csv")
data.head(3)

Unnamed: 0,AccusedRef,AccusedSystemId,AccusedID,FirstName,LastName,M_Firstname,M_Surname,Alias,Patronymic,DesTitle,...,Exec_county,Exec_burgh,Exec_NGR_Letters,Exec_NGR_Easting,Exec_NGR_Northing,PostTrialNotes,Createdby_y,Createdate_y,Lastupdatedby_y,Lastupdatedon_y
0,A/EGD/10,EGD,10.0,Mareon,Quheitt,Marion,White,,,,...,,,,,,,jhm,08/07/02 15:23:55,LEM,10/28/02 13:06:53
1,A/EGD/100,EGD,100.0,Thom,Cockburn,Thomas,Cockburn,,,,...,,,,,,,LEM,07/24/02 16:01:21,LEM,07/24/02 16:01:23
2,A/EGD/1000,EGD,1000.0,Christian,Aitkenhead,Christine,Aikenhead,,,,...,,,,,,,LEM,07/18/01 16:13:27,jhm,10/01/02 10:48:24


In [8]:
data.shape

(3795, 205)

In [9]:
# Features we are interested in 
list_of_trial_features = ['Execution', 'Noreftocentral', 'Localwithcrep', 'Circuit', 'Cjtorder', 'Defence', 'High_status', 
                          'Male_accusers', 'Female_accusers', 'Confrontingsuspects', 'ActionDropped', 'Fled', 'Arrest', 
                          'Watching', 'TrialType']

list_of_accused_features = ['AccusedRef', 'M_Firstname', 'M_Surname', 'Alias', 'Patronymic', 'DesTitle', 'Sex', 'Age', 
                            'Res_settlement', 'Res_parish', 'Res_presbytery',  'Res_county', 'Res_burgh', 'Ethnic_origin', 
                            'MaritalStatus', 'SocioecStatus', 'Occupation', 'Notes']

list_of_cases_features = ['CaseRef', 'CaseStart_date', 'CaseStart_date_as_date', 'Case_date', 'Case_date_as_date', 'Age_at_case', 
                          'CaseCommonName', 'Complaint', 'Correspondence', 'Chronicle', 'Suspects_text', 'Familiars', 'Shape-Changing', 
                          'Dreams/Visions', 'UnorthodoxReligiousPractice', 'SympatheticMagic', 'Ridingdead', 'FolkNotes', 
                          'HumanIllness', 'HumanDeath', 'AnimalIllness', 'AnimalDeath', 'FemaleInfertility', 'MaleImpotence', 
                          'AggravatingDisease', 'TransferringDisease', 'LayingOn', 'Removalbewitchment', 'Quarreling', 'Cursing', 
                          'Poisoning', 'RecHealer', 'HealingHumans', 'HealingAnimals', 'Midwifery', 'DiseaseNotes', 'PropertyDamage', 
                          'WeatherModification', 'OtherMaleficiaNotes', 'OtherChargesNotes', 'ClaimedBewitched', 'ClaimedPossessed', 
                          'AdmitLesserCharge', 'ClaimedNaturalCauses', 'Nodefence', 'DefenseNotes', 'CaseNotes', 'PoliticalMotive_s', 
                          'PropertyMotive_p', 'PropertyMotive_s', 'RefusedCharity_p', 'RefusedCharity_s', 'Treason_p', 'Treason_s', 
                          'Other_p', 'Other_s', 'OtherText', 'NotEnoughInfo_p', 'NotEnoughInfo_s', 'WhiteMagic_p', 'WhiteMagic_s', 
                          'Charnotes', 'DemonicPact', 'DevilNotes', 'WitchesMeeting', 'MeetingName', 'DevilPresent', 'Maleficium', 
                          'CommunalSex', 'DevilWorship', 'FoodAndDrink', 'Dancing', 'Singing', 'SingingText', 'OtherPractices', 
                          'MeetingNotes', 'Elphane/Fairyland', 'Food/Drink', 'SpecificVerbalFormulae', 'SpecificRitualActs', 
                          'UNorthodoxRelPract_p', 'UNorthodoxRelPract_s', 'Consulting_p', 'Consulting_s', 'Demonic_p', 'Demonic_s', 
                          'Demonic_possess_p', 'Demonic_possess_s', 'Fairies_p', 'Fairies_s', 'Folk_healing_p', 'Folk_healing_s', 
                          'Maleficium_p', 'Maleficium_s', 'Midwifery_p', 'Midwifery_s', 'ImplicatedByAnother_p', 'ImplicatedByAnother_s', 
                          'Neighbhd_dispute_p', 'Neighbhd_dispute_s', 'PoliticalMotive_p']

In [10]:
#Limit data to important features.
all_features = list_of_accused_features + list_of_cases_features + list_of_trial_features
df = data[all_features]

In [11]:
#Drop columns with more than 70% missing values.
percent_non_null = 0.7
df = df.dropna(axis = 1, thresh=df.shape[0]*percent_non_null)
df.head()

Unnamed: 0,AccusedRef,M_Firstname,M_Surname,Sex,Res_presbytery,Res_county,CaseRef,CaseStart_date,CaseStart_date_as_date,Case_date,...,Defence,High_status,Male_accusers,Female_accusers,Confrontingsuspects,ActionDropped,Fled,Arrest,Watching,TrialType
0,A/EGD/10,Marion,White,Female,Haddington,Haddington,C/EGD/21,3/4/1661,04/03/61 00:00:00,3/4/1661,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0
1,A/EGD/100,Thomas,Cockburn,Male,,Haddington,C/EGD/111,1591,06/01/91 00:00:00,1591,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
2,A/EGD/1000,Christine,Aikenhead,Female,,Dumfries,C/EGD/1011,6/5/1628,05/06/28 00:00:00,6/5/1628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
3,A/EGD/1001,Janet,Ireland,Female,,Dumfries,C/EGD/1012,6/5/1628,05/06/28 00:00:00,6/5/1628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,A/EGD/1002,Agnes,Henderson,Female,Stirling,Stirling,C/EGD/1013,3/7/1628,07/03/28 00:00:00,3/7/1628,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2.0


In [12]:
df.shape

(3795, 104)

In [13]:
df.describe(include="all")

Unnamed: 0,AccusedRef,M_Firstname,M_Surname,Sex,Res_presbytery,Res_county,CaseRef,CaseStart_date,CaseStart_date_as_date,Case_date,...,Defence,High_status,Male_accusers,Female_accusers,Confrontingsuspects,ActionDropped,Fled,Arrest,Watching,TrialType
count,3594,3550,3558,3544,2799,3474,3788,2971.0,2971,3774,...,3211.0,3211.0,3209.0,3208.0,3211.0,3211.0,3211.0,3211.0,3211.0,3211.0
unique,3219,159,1168,2,74,34,3413,956.0,937,1164,...,,,,,,,,,,
top,A/EGD/285,Janet,Thomson,Female,Haddington,Haddington,C/EGD/830,1649.0,06/01/49 00:00:00,17/4/1662,...,,,,,,,,,,
freq,3,544,66,3030,459,658,3,62.0,62,93,...,,,,,,,,,,
mean,,,,,,,,,,,...,0.01308,0.008409,0.418199,0.255923,0.032077,0.012457,0.03052,0.151355,0.006851,2.536593
std,,,,,,,,,,,...,0.113635,0.091326,2.5105,1.627914,0.176233,0.110932,0.17204,0.35845,0.082502,0.927673
min,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
75%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


In [14]:
percent_missing = df.isnull().sum() * 100 / len(df)
missing_value_df = pd.DataFrame({'column_name': df.columns,
                                 'percent_missing': percent_missing})

In [15]:
missing_value_df.sort_values("percent_missing", ascending=False).head(50)

Unnamed: 0,column_name,percent_missing
Res_presbytery,Res_presbytery,26.245059
CaseStart_date,CaseStart_date,21.71278
CaseStart_date_as_date,CaseStart_date_as_date,21.71278
Female_accusers,Female_accusers,15.467721
Male_accusers,Male_accusers,15.44137
TrialType,TrialType,15.388669
Noreftocentral,Noreftocentral,15.388669
Localwithcrep,Localwithcrep,15.388669
Circuit,Circuit,15.388669
Cjtorder,Cjtorder,15.388669


In [16]:
df.columns

Index(['AccusedRef', 'M_Firstname', 'M_Surname', 'Sex', 'Res_presbytery',
       'Res_county', 'CaseRef', 'CaseStart_date', 'CaseStart_date_as_date',
       'Case_date',
       ...
       'Defence', 'High_status', 'Male_accusers', 'Female_accusers',
       'Confrontingsuspects', 'ActionDropped', 'Fled', 'Arrest', 'Watching',
       'TrialType'],
      dtype='object', length=104)

In [17]:
def summary(feature):
    print("The name of the feature is," , str(feature))
    print("The values are, ", feature.value_counts() )
    print("The ")
summary(df.CaseRef)

The name of the feature is, 0         C/EGD/21
1        C/EGD/111
2       C/EGD/1011
3       C/EGD/1012
4       C/EGD/1013
           ...    
3790     C/LA/3229
3791     C/LA/3230
3792     C/LA/3231
3793     C/LA/3232
3794     C/LA/3348
Name: CaseRef, Length: 3795, dtype: object
The values are,  C/EGD/830     3
C/EGD/401     3
C/EGD/521     3
C/EGD/1545    3
C/EGD/1379    3
             ..
C/EGD/1636    1
C/EGD/790     1
C/EGD/2240    1
C/EGD/2086    1
C/EGD/2459    1
Name: CaseRef, Length: 3413, dtype: int64
The 
