In [28]:
#imports
import numpy as np 
import pandas as pd 
from scipy import stats
from matplotlib import pyplot as plt
#from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.utils import resample

# Read the training labels and training features, then attach the features to
# each label row through the shared `id` column (left join on the labels).
train_y = pd.read_csv('TrainingSetLabels.csv')
train_x = pd.read_csv('TrainingSetValues.csv')
train = train_y.merge(train_x, how='left', on='id')

# Features of the test set
test_x = pd.read_csv('TestSetValues.csv')

In [29]:
# Columns to remove from the feature set (this list is reused for the test set)
col_to_drop = [
    'scheme_management',
    'quantity_group',
    'water_quality',
    'payment_type',
    'extraction_type',
    'waterpoint_type_group',
    'region_code',
    'date_recorded',
    'recorded_by',
]
# Drop the listed columns from the training frame
train = train.drop(columns=col_to_drop)

In [30]:
# Normalise the `funder` and `installer` text columns and collapse infrequent
# values into a single 'other' bucket.
#
# `funder_over_100` and `installer_over_100` stay as top-level names because the
# test-set preprocessing reuses them to apply the same relabelling.


def _labels_over_100(frame, column):
    """Return the Index of `column` values that appear on more than 100 rows.

    Rows are counted through the non-null `status_group` entries of each group,
    so values occurring exactly 100 times (or fewer) are NOT included.
    """
    wells_per_label = frame.groupby(column)['status_group'].count()
    return wells_per_label[wells_per_label > 100].index


for col in ('funder', 'installer'):
    # Lowercase the text, then fold the literal '0' placeholder into 'other'.
    train[col] = train[col].str.lower().replace('0', 'other')

# Funders with more than 100 wells keep their own label ...
funder_over_100 = _labels_over_100(train, 'funder')
# ... every other funder (100 wells or fewer, or missing) becomes 'other'.
# Vectorised isin/where replaces a per-row Python lambda over the whole frame.
train['funder'] = train['funder'].where(train['funder'].isin(funder_over_100), 'other')

# Installers with more than 100 wells keep their own label ...
installer_over_100 = _labels_over_100(train, 'installer')
# ... every other installer (100 wells or fewer, or missing) becomes 'other'.
train['installer'] = train['installer'].where(train['installer'].isin(installer_over_100), 'other')

In [31]:
# Per-region lookup tables used to impute missing values:
#   *_modes : {region: most frequent value of a categorical column}
#   *_means : {region: mean coordinate}


def _region_modes(column):
    """Return {region: modal value of `column`} computed from `train`.

    `Series.mode()` can return several values when there is a tie; they are
    emitted in sorted order and the dict keeps the last one per region.
    A region whose values are all missing yields no entry.
    """
    modes = train.groupby('region')[column].apply(lambda values: values.mode())
    # Drop the inner positional level so that only `region` remains as the key.
    return modes.reset_index(level=1, drop=True).to_dict()


scheme_name_modes = _region_modes('scheme_name')

subvillage_modes = _region_modes('subvillage')

permit_modes = _region_modes('permit')

public_meeting_modes = _region_modes('public_meeting')

# Latitude -2e-08 and longitude 0 are the values that later get replaced by
# these region means, i.e. they are treated as "missing" placeholders. They must
# therefore be excluded from the averages, otherwise every imputed coordinate is
# pulled towards the placeholder itself.
valid_latitude = train.latitude.where(train.latitude != -2.000000e-08)
valid_longitude = train.longitude.where(train.longitude != 0)

# Region mean over valid coordinates only; a region with no valid coordinate at
# all falls back to the overall mean of the valid values.
latitude_means = (
    valid_latitude.groupby(train.region).mean()
    .fillna(valid_latitude.mean())
    .to_dict()
)

longitude_means = (
    valid_longitude.groupby(train.region).mean()
    .fillna(valid_longitude.mean())
    .to_dict()
)

In [32]:
# Fill gaps in each categorical column with the value stored for the row's
# region in the matching lookup dict.
region_mode_lookups = {
    'scheme_name': scheme_name_modes,
    'subvillage': subvillage_modes,
    'permit': permit_modes,
    'public_meeting': public_meeting_modes,
}
for column, modes_by_region in region_mode_lookups.items():
    regional_default = train['region'].map(modes_by_region)
    train[column] = train[column].fillna(regional_default)

In [33]:
# Replace placeholder coordinates (latitude -2e-08, longitude 0) with the mean
# coordinate stored for the row's region. Done with vectorised map/where rather
# than a per-row apply, which runs a Python lambda for every row of the frame.
region_latitude = train.region.map(latitude_means)
train.latitude = train.latitude.where(train.latitude != -2.000000e-08, region_latitude)

region_longitude = train.region.map(longitude_means)
train.longitude = train.longitude.where(train.longitude != 0, region_longitude)



In [34]:
# Apply to the test set the same cleaning steps used on the training set,
# reusing the lookups derived from `train` (frequent-label indexes, region
# modes, region means) so both frames are encoded consistently.
test_x = test_x.drop(columns=col_to_drop)

for col in ('funder', 'installer'):
    # Lowercase the text, then fold the literal '0' placeholder into 'other'.
    test_x[col] = test_x[col].str.lower().replace('0', 'other')

# Keep only the funders that had more than 100 wells in the training data;
# everything else (including missing values) becomes 'other'.
# Vectorised isin/where replaces a per-row Python lambda over the whole frame.
test_x['funder'] = test_x['funder'].where(test_x['funder'].isin(funder_over_100), 'other')

# Same relabelling for installers, using the training-set installer index.
test_x['installer'] = test_x['installer'].where(test_x['installer'].isin(installer_over_100), 'other')

# Fill missing categorical values with the training-set value for the row's region.
for column, modes_by_region in (
    ('scheme_name', scheme_name_modes),
    ('subvillage', subvillage_modes),
    ('permit', permit_modes),
    ('public_meeting', public_meeting_modes),
):
    test_x[column] = test_x[column].fillna(test_x['region'].map(modes_by_region))

# Replace placeholder coordinates (latitude -2e-08, longitude 0) with the
# training-set mean for the row's region. A region absent from the lookup
# yields NaN here instead of raising KeyError part-way through a row-wise apply.
test_x['latitude'] = test_x['latitude'].where(
    test_x['latitude'] != -2.000000e-08, test_x['region'].map(latitude_means)
)

test_x['longitude'] = test_x['longitude'].where(
    test_x['longitude'] != 0, test_x['region'].map(longitude_means)
)


In [36]:
# Write both processed frames to CSV without the index column.
for frame, csv_path in ((test_x, 'test2.0.csv'), (train, 'train2.0.csv')):
    frame.to_csv(csv_path, index=False)