In [1]:
import pandas as pd
import numpy as np
import math
from scipy.stats import entropy
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
import warnings
warnings.filterwarnings('ignore')

In [2]:
def injectNaN(x):
    """Replace the ``"_T"`` code with a missing value.

    Returns ``np.nan`` when ``x`` equals the string ``"_T"``; every other
    value is returned unchanged.
    """
    return np.nan if x == "_T" else x

In [3]:
def printDFInfo(df):
    """Print one summary line per column of ``df``.

    Each line holds the column name, the percentage of null entries in the
    column and the entropy of the column's value counts. Nothing is returned.
    """
    for name in df.columns:
        series = df[name]
        col_entropy = entropy(series.value_counts())
        missing_pct = series.isnull().sum() * 100 / len(series)
        # "TOPIC" gets two tabs after its name, every other column one
        # (presumably to keep the printed columns aligned).
        gap = '\t\t' if name == 'TOPIC' else '\t'
        # NOTE(review): the message appears to start with a non-ASCII 'Μ'
        # (Greek capital mu) rather than 'M'; it is reproduced unchanged here.
        print(f'Column "{name}"{gap}Μissing values: {missing_pct:.2f}% || Entropy: {col_entropy:.3f}')

In [4]:
# Load the raw dataset from 'VAW.csv' (path relative to the working directory)
# and preview the first rows.
data = pd.read_csv('VAW.csv')
data.head()

Unnamed: 0,DATAFLOW,FREQ: Frequency,TIME_PERIOD: Time,GEO_PICT: Pacific Island Countries and territories,TOPIC: Topic,INDICATOR: Indicator,SEX: Sex,AGE: Age,CONDITION: Women’s condition,VIOLENCE_TYPE: Type of violence,...,OUTCOME: Outcome,RESPONSE: Response,HELP_REASON: Reason for searching help,HELP_PROVIDER: Help provider,OBS_VALUE,UNIT_MEASURE: Unit of measure,UNIT_MULT: Unit multiplier,OBS_STATUS: Observation Status,DATA_SOURCE: Data source,OBS_COMMENT: Comment
0,SPC:DF_VAW(1.0),A: Annual,2013,CK: Cook Islands,VAW_TOPIC_001: Types of violence against women...,NUMPERRF: Number of persons in relative frequency,F: Female,Y15T64: 15-64,EVPART: Ever-partnered,CONT_ECON: At least one act of economic abusive,...,_T: Any,_T: Any,_T: Any,_T: Any,6.2,PERCENT: percent,,,FHSS,
1,SPC:DF_VAW(1.0),A: Annual,2013,CK: Cook Islands,VAW_TOPIC_001: Types of violence against women...,NUMPERRF: Number of persons in relative frequency,F: Female,Y15T64: 15-64,EVPART: Ever-partnered,EMO: Emotional violence,...,_T: Any,_T: Any,_T: Any,_T: Any,9.6,PERCENT: percent,,,FHSS,
2,SPC:DF_VAW(1.0),A: Annual,2013,CK: Cook Islands,VAW_TOPIC_001: Types of violence against women...,NUMPERRF: Number of persons in relative frequency,F: Female,Y15T64: 15-64,EVPART: Ever-partnered,EMO: Emotional violence,...,_T: Any,_T: Any,_T: Any,_T: Any,26.7,PERCENT: percent,,,FHSS,
3,SPC:DF_VAW(1.0),A: Annual,2013,CK: Cook Islands,VAW_TOPIC_001: Types of violence against women...,NUMPERRF: Number of persons in relative frequency,F: Female,Y15T64: 15-64,EVPART: Ever-partnered,PHYS: Physical violence,...,_T: Any,_T: Any,_T: Any,_T: Any,6.7,PERCENT: percent,,,FHSS,
4,SPC:DF_VAW(1.0),A: Annual,2013,CK: Cook Islands,VAW_TOPIC_001: Types of violence against women...,NUMPERRF: Number of persons in relative frequency,F: Female,Y15T64: 15-64,EVPART: Ever-partnered,PHYS: Physical violence,...,_T: Any,_T: Any,_T: Any,_T: Any,30.2,PERCENT: percent,,,FHSS,


In [5]:
# Column headers have the form "CODE: Description"; keep only the part before
# the first colon.
data.columns = [name.split(':')[0] for name in data.columns]

for col in data.columns:
    values = data[col]
    # Only columns whose entries are all strings are rewritten. A column that
    # contains any non-string entry (numbers, NaN) is left exactly as loaded.
    # The check is explicit so that an unrelated AttributeError cannot be
    # silently swallowed.
    if not values.map(lambda v: isinstance(v, str)).all():
        continue
    # Keep the code before the first colon, then turn the "_T" code into NaN.
    data[col] = values.map(lambda v: injectNaN(v.split(':')[0]))

data.head()

Unnamed: 0,DATAFLOW,FREQ,TIME_PERIOD,GEO_PICT,TOPIC,INDICATOR,SEX,AGE,CONDITION,VIOLENCE_TYPE,...,OUTCOME,RESPONSE,HELP_REASON,HELP_PROVIDER,OBS_VALUE,UNIT_MEASURE,UNIT_MULT,OBS_STATUS,DATA_SOURCE,OBS_COMMENT
0,SPC,A,2013,CK,VAW_TOPIC_001,NUMPERRF,F,Y15T64,EVPART,CONT_ECON,...,,,,,6.2,PERCENT,,,FHSS,
1,SPC,A,2013,CK,VAW_TOPIC_001,NUMPERRF,F,Y15T64,EVPART,EMO,...,,,,,9.6,PERCENT,,,FHSS,
2,SPC,A,2013,CK,VAW_TOPIC_001,NUMPERRF,F,Y15T64,EVPART,EMO,...,,,,,26.7,PERCENT,,,FHSS,
3,SPC,A,2013,CK,VAW_TOPIC_001,NUMPERRF,F,Y15T64,EVPART,PHYS,...,,,,,6.7,PERCENT,,,FHSS,
4,SPC,A,2013,CK,VAW_TOPIC_001,NUMPERRF,F,Y15T64,EVPART,PHYS,...,,,,,30.2,PERCENT,,,FHSS,


In [6]:
# Collect every column of `data` whose value counts have zero entropy,
# printing the entropy of each column along the way.
dropList = []

for name, series in data.items():
    col_entropy = entropy(series.value_counts())
    report = f'Column "{name}" entropy: {col_entropy:.2f}'
    if col_entropy == 0:
        dropList.append(name)
        report += ' || ADDED TO DROPLIST'
    print(report)

Column "DATAFLOW" entropy: 0.00 || ADDED TO DROPLIST
Column "FREQ" entropy: 0.00 || ADDED TO DROPLIST
Column "TIME_PERIOD" entropy: 2.25
Column "GEO_PICT" entropy: 2.51
Column "TOPIC" entropy: 2.83
Column "INDICATOR" entropy: 0.00 || ADDED TO DROPLIST
Column "SEX" entropy: 0.00 || ADDED TO DROPLIST
Column "AGE" entropy: 0.00 || ADDED TO DROPLIST
Column "CONDITION" entropy: 0.90
Column "VIOLENCE_TYPE" entropy: 2.74
Column "PERPETRATOR" entropy: 0.99
Column "ACTUALITY" entropy: 0.69
Column "LIFEPER" entropy: 0.82
Column "OUTCOME" entropy: 3.05
Column "RESPONSE" entropy: 1.95
Column "HELP_REASON" entropy: 1.79
Column "HELP_PROVIDER" entropy: 1.61
Column "OBS_VALUE" entropy: 5.95
Column "UNIT_MEASURE" entropy: 0.00 || ADDED TO DROPLIST
Column "UNIT_MULT" entropy: 0.00 || ADDED TO DROPLIST
Column "OBS_STATUS" entropy: 0.00 || ADDED TO DROPLIST
Column "DATA_SOURCE" entropy: 1.45
Column "OBS_COMMENT" entropy: 1.81


In [7]:
# Remove the zero-entropy columns collected in dropList. The result is bound
# to `data` again instead of mutating the frame in place.
data = data.drop(columns=dropList)

In [8]:
# Per-column missing-value percentage and entropy for `data` after the drop above.
printDFInfo(data)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.833
Column "CONDITION"	Μissing values: 52.38% || Entropy: 0.900
Column "VIOLENCE_TYPE"	Μissing values: 34.92% || Entropy: 2.744
Column "PERPETRATOR"	Μissing values: 12.70% || Entropy: 0.992
Column "ACTUALITY"	Μissing values: 41.27% || Entropy: 0.692
Column "LIFEPER"	Μissing values: 76.19% || Entropy: 0.817
Column "OUTCOME"	Μissing values: 73.02% || Entropy: 3.047
Column "RESPONSE"	Μissing values: 94.44% || Entropy: 1.946
Column "HELP_REASON"	Μissing values: 95.24% || Entropy: 1.792
Column "HELP_PROVIDER"	Μissing values: 96.03% || Entropy: 1.609
Column "OBS_VALUE"	Μissing values: 38.79% || Entropy: 5.948
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 49.75% || Entropy: 1.807


In [9]:
# Show the (rows, columns) shape of `data`.
print(data.shape)

(2016, 15)


In [10]:
# Print dtype and non-null count for every column of `data`.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2016 entries, 0 to 2015
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   TIME_PERIOD    2016 non-null   int64  
 1   GEO_PICT       2016 non-null   object 
 2   TOPIC          2016 non-null   object 
 3   CONDITION      960 non-null    object 
 4   VIOLENCE_TYPE  1312 non-null   object 
 5   PERPETRATOR    1760 non-null   object 
 6   ACTUALITY      1184 non-null   object 
 7   LIFEPER        480 non-null    object 
 8   OUTCOME        544 non-null    object 
 9   RESPONSE       112 non-null    object 
 10  HELP_REASON    96 non-null     object 
 11  HELP_PROVIDER  80 non-null     object 
 12  OBS_VALUE      1234 non-null   float64
 13  DATA_SOURCE    2016 non-null   object 
 14  OBS_COMMENT    1013 non-null   object 
dtypes: float64(1), int64(1), object(13)
memory usage: 236.4+ KB


In [11]:
# Work on an independent copy: the encoding step further down overwrites
# column values, and a plain `encode_data = data` would only create a second
# name for the same frame, so `data` would be overwritten as well.
encode_data = data.copy()

In [12]:
# Disabled preview: impute_data is only created later, in the encoding cell.
#impute_data.head(100)

In [13]:
# Rows with a missing OUTCOME go to the "test" index list, all other rows to
# the "train" index list.
outcome_missing = data['OUTCOME'].isna().to_numpy()

# Positional row numbers are collected because the lists are consumed by .iloc.
data_train_idx = np.flatnonzero(~outcome_missing).tolist()
data_test_idx = np.flatnonzero(outcome_missing).tolist()

In [14]:
# Materialise the two row subsets and renumber their rows from 0.
data_train = data.iloc[data_train_idx, :].reset_index(drop=True)
data_test = data.iloc[data_test_idx, :].reset_index(drop=True)

# One shape per line: train first, then test.
print(data_train.shape, data_test.shape, sep='\n')

(544, 15)
(1472, 15)


In [15]:
# Per-column summary for the train subset (rows that have an OUTCOME value).
printDFInfo(data_train)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 1.568
Column "CONDITION"	Μissing values: 52.94% || Entropy: 1.082
Column "VIOLENCE_TYPE"	Μissing values: 73.53% || Entropy: 0.995
Column "PERPETRATOR"	Μissing values: 26.47% || Entropy: 0.000
Column "ACTUALITY"	Μissing values: 73.53% || Entropy: 0.687
Column "LIFEPER"	Μissing values: 91.18% || Entropy: 0.000
Column "OUTCOME"	Μissing values: 0.00% || Entropy: 3.047
Column "RESPONSE"	Μissing values: 100.00% || Entropy: 0.000
Column "HELP_REASON"	Μissing values: 100.00% || Entropy: 0.000
Column "HELP_PROVIDER"	Μissing values: 100.00% || Entropy: 0.000
Column "OBS_VALUE"	Μissing values: 45.22% || Entropy: 5.223
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 50.00% || Entropy: 1.800


In [16]:
# Per-column summary for the test subset (rows whose OUTCOME is missing).
printDFInfo(data_test)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.501
Column "CONDITION"	Μissing values: 52.17% || Entropy: 0.108
Column "VIOLENCE_TYPE"	Μissing values: 20.65% || Entropy: 2.819
Column "PERPETRATOR"	Μissing values: 7.61% || Entropy: 1.179
Column "ACTUALITY"	Μissing values: 29.35% || Entropy: 0.692
Column "LIFEPER"	Μissing values: 70.65% || Entropy: 0.630
Column "OUTCOME"	Μissing values: 100.00% || Entropy: 0.000
Column "RESPONSE"	Μissing values: 92.39% || Entropy: 1.946
Column "HELP_REASON"	Μissing values: 93.48% || Entropy: 1.792
Column "HELP_PROVIDER"	Μissing values: 94.57% || Entropy: 1.609
Column "OBS_VALUE"	Μissing values: 36.41% || Entropy: 5.851
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 49.66% || Entropy: 1.790


In [17]:
# Collect every column of `data_train` whose value counts have zero entropy,
# printing the entropy of each column along the way.
dropList = []

for name, series in data_train.items():
    col_entropy = entropy(series.value_counts())
    report = f'Column "{name}" entropy: {col_entropy:.2f}'
    if col_entropy == 0:
        dropList.append(name)
        report += ' || ADDED TO DROPLIST'
    print(report)

Column "TIME_PERIOD" entropy: 2.25
Column "GEO_PICT" entropy: 2.51
Column "TOPIC" entropy: 1.57
Column "CONDITION" entropy: 1.08
Column "VIOLENCE_TYPE" entropy: 1.00
Column "PERPETRATOR" entropy: 0.00 || ADDED TO DROPLIST
Column "ACTUALITY" entropy: 0.69
Column "LIFEPER" entropy: 0.00 || ADDED TO DROPLIST
Column "OUTCOME" entropy: 3.05
Column "RESPONSE" entropy: 0.00 || ADDED TO DROPLIST
Column "HELP_REASON" entropy: 0.00 || ADDED TO DROPLIST
Column "HELP_PROVIDER" entropy: 0.00 || ADDED TO DROPLIST
Column "OBS_VALUE" entropy: 5.22
Column "DATA_SOURCE" entropy: 1.45
Column "OBS_COMMENT" entropy: 1.80


In [18]:
# Drop the columns that carry no information in the train subset from both
# subsets. Reassigning (rather than inplace=True) avoids mutating frames that
# were produced by an .iloc selection.
data_train = data_train.drop(columns=dropList)
data_test = data_test.drop(columns=dropList)

In [19]:
# Train subset summary after the dropList columns were removed.
printDFInfo(data_train)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 1.568
Column "CONDITION"	Μissing values: 52.94% || Entropy: 1.082
Column "VIOLENCE_TYPE"	Μissing values: 73.53% || Entropy: 0.995
Column "ACTUALITY"	Μissing values: 73.53% || Entropy: 0.687
Column "OUTCOME"	Μissing values: 0.00% || Entropy: 3.047
Column "OBS_VALUE"	Μissing values: 45.22% || Entropy: 5.223
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 50.00% || Entropy: 1.800


In [20]:
# Test subset summary after the dropList columns were removed.
printDFInfo(data_test)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.501
Column "CONDITION"	Μissing values: 52.17% || Entropy: 0.108
Column "VIOLENCE_TYPE"	Μissing values: 20.65% || Entropy: 2.819
Column "ACTUALITY"	Μissing values: 29.35% || Entropy: 0.692
Column "OUTCOME"	Μissing values: 100.00% || Entropy: 0.000
Column "OBS_VALUE"	Μissing values: 36.41% || Entropy: 5.851
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 49.66% || Entropy: 1.790


In [21]:
# Hand-picked columns to remove from both subsets.
manual_dropList = ['CONDITION', 'VIOLENCE_TYPE', 'ACTUALITY', 'DATA_SOURCE', 'OBS_COMMENT']

# Reassign instead of dropping in place, so the frames are never mutated.
data_train = data_train.drop(columns=manual_dropList)
data_test = data_test.drop(columns=manual_dropList)

In [22]:
# Train subset summary after the manual_dropList columns were removed.
printDFInfo(data_train)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 1.568
Column "OUTCOME"	Μissing values: 0.00% || Entropy: 3.047
Column "OBS_VALUE"	Μissing values: 45.22% || Entropy: 5.223


In [23]:
# Test subset summary after the manual_dropList columns were removed.
printDFInfo(data_test)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.501
Column "OUTCOME"	Μissing values: 100.00% || Entropy: 0.000
Column "OBS_VALUE"	Μissing values: 36.41% || Entropy: 5.851


In [24]:
# Write both subsets to CSV files in the working directory; index = False
# keeps the row index out of the files.
data_train.to_csv("data_train.csv", index = False)
data_test.to_csv("data_test.csv", index = False)

In [25]:
#encoding below----------------------------------------------------------------------------------

# Fixed seed so that repeated runs of the tree-based imputation are reproducible.
IMPUTE_SEED = 0

#instantiate both packages to use
encoder = OrdinalEncoder()
imputer = IterativeImputer(
    estimator=ExtraTreesRegressor(random_state=IMPUTE_SEED),
    random_state=IMPUTE_SEED,
)
# create a list of categorical columns to iterate over
cat_cols = ['GEO_PICT','TOPIC','CONDITION','VIOLENCE_TYPE','PERPETRATOR','ACTUALITY','LIFEPER', 
            'OUTCOME', 'RESPONSE','HELP_REASON', 'HELP_PROVIDER', 'DATA_SOURCE', 'OBS_COMMENT']

def encode(encode_data):
    '''Ordinal-encode the non-null entries of a single column, in place.

    Parameters
    ----------
    encode_data : pandas.Series
        One column. Inside this function the parameter name shadows the
        module-level ``encode_data`` DataFrame.

    Returns
    -------
    pandas.Series
        The same object that was passed in. Non-null entries are replaced by
        the codes produced by the module-level ``encoder``; null entries are
        left as they are.

    Notes
    -----
    ``encoder`` is re-fitted on every call, so afterwards it only holds the
    categories of the most recently encoded column.
    '''
    present = encode_data.notnull()
    # fit_transform rejects an input with zero samples, so a column without
    # any non-null entry is returned untouched
    if not present.any():
        return encode_data
    # the encoder expects a 2-D array: one row per value, a single feature
    observed = encode_data[present].to_numpy().reshape(-1, 1)
    codes = encoder.fit_transform(observed)
    # write the codes back over the non-null positions only
    encode_data.loc[present] = codes.ravel()
    return encode_data

# Store each encoded column back explicitly. Relying on encode() mutating the
# Series returned by encode_data[col] is a chained assignment, which does not
# reach the frame when pandas Copy-on-Write is active.
for col in cat_cols:
    encode_data[col] = encode(encode_data[col])

# Fill the remaining gaps with the iterative imputer.
imputed = imputer.fit_transform(encode_data)
impute_data = pd.DataFrame(imputed, columns = encode_data.columns)
# Imputed category codes must be whole numbers; the other columns
# (e.g. the measured OBS_VALUE) keep their decimals.
impute_data[cat_cols] = impute_data[cat_cols].round()
# NOTE(review): impute_data is not read by any later cell shown in this
# notebook; the train/test split below is built from encode_data, which still
# contains NaN. Confirm whether the imputed frame was meant to be used.

#print(impute_data.info())
    
encode_data.head(100)

Unnamed: 0,TIME_PERIOD,GEO_PICT,TOPIC,CONDITION,VIOLENCE_TYPE,PERPETRATOR,ACTUALITY,LIFEPER,OUTCOME,RESPONSE,HELP_REASON,HELP_PROVIDER,OBS_VALUE,DATA_SOURCE,OBS_COMMENT
0,2013,0.0,0.0,1.0,0.0,5.0,,,,,,,6.2,1.0,
1,2013,0.0,0.0,1.0,5.0,5.0,0.0,,,,,,9.6,1.0,
2,2013,0.0,0.0,1.0,5.0,5.0,1.0,,,,,,26.7,1.0,
3,2013,0.0,0.0,1.0,9.0,5.0,0.0,,,,,,6.7,1.0,
4,2013,0.0,0.0,1.0,9.0,5.0,1.0,,,,,,30.2,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,2013,0.0,12.0,2.0,,5.0,,1.0,0.0,,,,3.6,1.0,
96,2013,0.0,12.0,2.0,,5.0,,1.0,16.0,,,,8.9,1.0,
97,2013,0.0,12.0,2.0,,5.0,,1.0,21.0,,,,8.8,1.0,
98,2013,0.0,13.0,3.0,,,,,7.0,,,,,1.0,


In [26]:
# Rows with a missing OUTCOME go to the "test" index list, all other rows to
# the "train" index list. isna() also copes with non-numeric entries, where
# math.isnan would raise a TypeError.
enc_outcome_missing = encode_data['OUTCOME'].isna().to_numpy()

# Positional row numbers are collected because the lists are consumed by .iloc.
data_enc_train_idx = np.flatnonzero(~enc_outcome_missing).tolist()
data_enc_test_idx = np.flatnonzero(enc_outcome_missing).tolist()

In [27]:
# Materialise the two encoded row subsets and renumber their rows from 0.
data_enc_train = encode_data.iloc[data_enc_train_idx, :].reset_index(drop=True)
data_enc_test = encode_data.iloc[data_enc_test_idx, :].reset_index(drop=True)

# One shape per line: train first, then test.
print(data_enc_train.shape, data_enc_test.shape, sep='\n')

(544, 15)
(1472, 15)


In [28]:
# Per-column summary for the encoded train subset (rows that have an OUTCOME value).
printDFInfo(data_enc_train)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 1.568
Column "CONDITION"	Μissing values: 52.94% || Entropy: 1.082
Column "VIOLENCE_TYPE"	Μissing values: 73.53% || Entropy: 0.995
Column "PERPETRATOR"	Μissing values: 26.47% || Entropy: 0.000
Column "ACTUALITY"	Μissing values: 73.53% || Entropy: 0.687
Column "LIFEPER"	Μissing values: 91.18% || Entropy: 0.000
Column "OUTCOME"	Μissing values: 0.00% || Entropy: 3.047
Column "RESPONSE"	Μissing values: 100.00% || Entropy: 0.000
Column "HELP_REASON"	Μissing values: 100.00% || Entropy: 0.000
Column "HELP_PROVIDER"	Μissing values: 100.00% || Entropy: 0.000
Column "OBS_VALUE"	Μissing values: 45.22% || Entropy: 5.223
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 50.00% || Entropy: 1.800


In [29]:
# Per-column summary for the encoded test subset (rows whose OUTCOME is missing).
printDFInfo(data_enc_test)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.501
Column "CONDITION"	Μissing values: 52.17% || Entropy: 0.108
Column "VIOLENCE_TYPE"	Μissing values: 20.65% || Entropy: 2.819
Column "PERPETRATOR"	Μissing values: 7.61% || Entropy: 1.179
Column "ACTUALITY"	Μissing values: 29.35% || Entropy: 0.692
Column "LIFEPER"	Μissing values: 70.65% || Entropy: 0.630
Column "OUTCOME"	Μissing values: 100.00% || Entropy: 0.000
Column "RESPONSE"	Μissing values: 92.39% || Entropy: 1.946
Column "HELP_REASON"	Μissing values: 93.48% || Entropy: 1.792
Column "HELP_PROVIDER"	Μissing values: 94.57% || Entropy: 1.609
Column "OBS_VALUE"	Μissing values: 36.41% || Entropy: 5.851
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 49.66% || Entropy: 1.790


In [30]:
# Collect every column of `data_enc_train` whose value counts have zero
# entropy, printing the entropy of each column along the way.
dropList_enc = []

for name, series in data_enc_train.items():
    col_entropy = entropy(series.value_counts())
    report = f'Column "{name}" entropy: {col_entropy:.2f}'
    if col_entropy == 0:
        dropList_enc.append(name)
        report += ' || ADDED TO DROPLIST'
    print(report)

Column "TIME_PERIOD" entropy: 2.25
Column "GEO_PICT" entropy: 2.51
Column "TOPIC" entropy: 1.57
Column "CONDITION" entropy: 1.08
Column "VIOLENCE_TYPE" entropy: 1.00
Column "PERPETRATOR" entropy: 0.00 || ADDED TO DROPLIST
Column "ACTUALITY" entropy: 0.69
Column "LIFEPER" entropy: 0.00 || ADDED TO DROPLIST
Column "OUTCOME" entropy: 3.05
Column "RESPONSE" entropy: 0.00 || ADDED TO DROPLIST
Column "HELP_REASON" entropy: 0.00 || ADDED TO DROPLIST
Column "HELP_PROVIDER" entropy: 0.00 || ADDED TO DROPLIST
Column "OBS_VALUE" entropy: 5.22
Column "DATA_SOURCE" entropy: 1.45
Column "OBS_COMMENT" entropy: 1.80


In [31]:
# Drop the columns that carry no information in the encoded train subset from
# both encoded subsets. Reassigning (rather than inplace=True) avoids mutating
# frames that were produced by an .iloc selection.
data_enc_train = data_enc_train.drop(columns=dropList_enc)
data_enc_test = data_enc_test.drop(columns=dropList_enc)

In [32]:
# Encoded train subset summary after the dropList_enc columns were removed.
printDFInfo(data_enc_train)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 1.568
Column "CONDITION"	Μissing values: 52.94% || Entropy: 1.082
Column "VIOLENCE_TYPE"	Μissing values: 73.53% || Entropy: 0.995
Column "ACTUALITY"	Μissing values: 73.53% || Entropy: 0.687
Column "OUTCOME"	Μissing values: 0.00% || Entropy: 3.047
Column "OBS_VALUE"	Μissing values: 45.22% || Entropy: 5.223
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 50.00% || Entropy: 1.800


In [33]:
# Encoded test subset summary after the dropList_enc columns were removed.
printDFInfo(data_enc_test)

Column "TIME_PERIOD"	Μissing values: 0.00% || Entropy: 2.253
Column "GEO_PICT"	Μissing values: 0.00% || Entropy: 2.513
Column "TOPIC"		Μissing values: 0.00% || Entropy: 2.501
Column "CONDITION"	Μissing values: 52.17% || Entropy: 0.108
Column "VIOLENCE_TYPE"	Μissing values: 20.65% || Entropy: 2.819
Column "ACTUALITY"	Μissing values: 29.35% || Entropy: 0.692
Column "OUTCOME"	Μissing values: 100.00% || Entropy: 0.000
Column "OBS_VALUE"	Μissing values: 36.41% || Entropy: 5.851
Column "DATA_SOURCE"	Μissing values: 0.00% || Entropy: 1.450
Column "OBS_COMMENT"	Μissing values: 49.66% || Entropy: 1.790


In [34]:
# Write both encoded subsets to CSV files in the working directory;
# index = False keeps the row index out of the files.
data_enc_train.to_csv("data_enc_train.csv", index = False)
data_enc_test.to_csv("data_enc_test.csv", index = False)