# Survey quality analysis

1. For each row, count the number of missing entries (coded as -2 or NA).
2. Compute the global average of the missing-entry (-2) counts.
3. Compute global averages of (a) survey duration and (b) the span between the first and last survey dates.
4. Break the above down by surveyor, country, etc.

In [71]:
%matplotlib inline

import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter 
from __future__ import division
import time
import datetime

In [38]:
# Load the cleaned survey exports, one DataFrame per country.
uganda, kenya = (
    pd.read_csv(csv_path)
    for csv_path in ('Uganda.Clean.csv', 'Kenya.Clean.csv')
)

In [None]:
# Count how many rows of the Uganda data each surveyor ('Srvyr') accounts for.
surveyors = uganda['Srvyr']

c = Counter(surveyors)

# (surveyor, count) pairs ordered from the smallest count to the largest.
items = c.items()
sorted_items = sorted(items, key=lambda item: item[1])

bar_heights = [count for _, count in sorted_items]

# Labels must come from the same sorted sequence as the heights; taking them
# from the unsorted `items` would pair surveyor names with the wrong bars.
labels = [name for name, _ in sorted_items]
labels

In [None]:
# Bar chart of the per-surveyor counts, one bar per entry of bar_heights.
# The alignment and width are spelled out so that bar i always spans
# [i + 0.1, i + 0.9] and the tick at i + 0.5 sits under its centre.  Relying
# on the default only works on matplotlib < 2.0, where bars were edge-aligned.
xs = [i + 0.1 for i in range(len(bar_heights))]
plt.bar(xs, bar_heights, width=0.8, align='edge')
plt.xticks([i + 0.5 for i in range(len(sorted_items))], labels)
plt.show()

In [None]:
# Distinct values found in the 'Q3.CoopSociety' column of the Kenya data.
kenya['Q3.CoopSociety'].unique()

In [None]:
# Distinct values found in the 'Q3.CoopSociety' column of the Uganda data.
uganda['Q3.CoopSociety'].unique()

In [141]:
def count_null_values(row):
    """Return the fraction of entries in ``row`` that are not missing.

    Despite the name, the result is a quality score rather than a count: the
    share of cells that are neither the ``-2`` code nor NaN ("na").

    Args:
        row: Iterable of cell values, e.g. one DataFrame row as passed by
            ``DataFrame.apply(..., axis=1)``.

    Returns:
        float: A value between 0.0 and 1.0, or NaN when ``row`` is empty.
    """
    total = 0
    missing = 0
    for cell in row:
        total += 1
        # NaN is the only value that compares unequal to itself, so the
        # self-comparison catches "na" cells that `== -2` alone would count
        # as good answers.
        if cell == -2 or cell != cell:
            missing += 1

    if total == 0:
        # An empty row has no defined quality; avoid a ZeroDivisionError.
        return float('nan')

    # float() keeps this a true division even without
    # `from __future__ import division` on Python 2.
    return float(total - missing) / total

# Score every survey row; 'Quality' holds the value count_null_values returns
# for that row.  The function is passed directly, no lambda wrapper needed.
kenya['Quality'] = kenya.apply(count_null_values, axis=1)
uganda['Quality'] = uganda.apply(count_null_values, axis=1)

kenya_qual = kenya['Quality']
uganda_qual = uganda['Quality']

# Single-argument print(...) calls produce the same output on Python 2 and
# are valid on Python 3, unlike the bare print statements used before.
print("{:.0%}".format(kenya_qual.mean()))
print(kenya_qual.min())
print(kenya_qual.quantile([0.05, 0.25, 0.5, 0.75, 0.95]))
print(uganda_qual.min())
print("{:.0%}".format(uganda_qual.mean()))
print(uganda_qual.quantile([0.05, 0.25, 0.5, 0.75, 0.95]))


83%
0.75723830735
0.05    0.779510
0.25    0.804009
0.50    0.826281
0.75    0.848552
0.95    0.881514
Name: Quality, dtype: float64
0.737068965517
81%
0.05    0.767672
0.25    0.788793
0.50    0.808190
0.75    0.829741
0.95    0.855603
Name: Quality, dtype: float64


In [143]:
def convert_to_hours(duration):
    """Convert a survey duration string to a total number of minutes.

    Despite the name, the value returned is in minutes, not hours.  The
    first character of ``duration`` is read as the number of whole days and
    everything from index 4 onwards is parsed as ``HH:MM:SS``; characters
    1-3 are ignored.

    NOTE(review): only one character is read for the day count, so durations
    of ten days or more are presumably not handled -- confirm the export
    never produces them.

    Args:
        duration: Duration text in the layout described above.  Values that
            cannot be sliced or parsed (e.g. NaN, None, malformed text) are
            tolerated.

    Returns:
        float: Total duration in minutes, or 0.0 when ``duration`` cannot be
        parsed.
    """
    try:
        clock = time.strptime(duration[4:], '%H:%M:%S')
        days = float(duration[0:1])
    except (TypeError, ValueError):
        # TypeError: the value is not a string (e.g. NaN for a missing cell).
        # ValueError: the text does not match the expected layout.
        # Unparseable durations deliberately count as 0 minutes; anything
        # else (KeyboardInterrupt, programming errors) now propagates.
        return 0.0

    clock_minutes = datetime.timedelta(
        hours=clock.tm_hour,
        minutes=clock.tm_min,
        seconds=clock.tm_sec,
    ).total_seconds() / 60
    return clock_minutes + (days * 24 * 60)

# Parse every Kenya 'Duration' value into minutes (see convert_to_hours).
kenya['DurationInMinutes'] = kenya['Duration'].apply(convert_to_hours)
durations = kenya['DurationInMinutes']

# Per-surveyor mean and total durations, longest first.
# The whole expression sits inside print(...): the old `print (x)[...]` form
# only works as a Python 2 statement and would subscript print's return value
# if read as a function call.
# NOTE: DataFrame.sort is the pre-0.17 pandas spelling (removed in 0.20);
# switch to sort_values once the installed pandas provides it.
kenya_by_surveyor = kenya.groupby(['Srvyr'])
print(
    kenya_by_surveyor[['Srvyr', 'DurationInMinutes', 'Quality']]
    .aggregate(np.mean)
    .sort('DurationInMinutes', ascending=False)
)
print(
    kenya_by_surveyor[['Srvyr', 'DurationInMinutes']]
    .aggregate(np.sum)
    .sort('DurationInMinutes', ascending=False)
)

# Overall distribution of Kenya survey durations, in minutes.
print(durations.min())
print(durations.mean())
print(durations.max())
print(durations.quantile([0.05, 0.25, 0.5, 0.75, 0.95]))

          DurationInMinutes   Quality
Srvyr                                
Milcah           118.843229  0.828786
Boaz              89.076829  0.835244
Kiprotik          49.726344  0.840829
Dismal            45.334028  0.821455
Gilbert           29.094565  0.804832
          DurationInMinutes
Srvyr                      
Milcah          3802.983333
Boaz            3652.150000
Kiprotik        3083.033333
Gilbert         1338.350000
Dismal          1088.016667
11.9666666667
63.2416260163
1641.2
0.05     18.096667
0.25     25.150000
0.50     36.416667
0.75     57.016667
0.95    169.413333
Name: DurationInMinutes, dtype: float64




In [144]:
# Parse every Uganda 'Duration' value into minutes (see convert_to_hours).
uganda['DurationInMinutes'] = uganda['Duration'].apply(convert_to_hours)
durations = uganda['DurationInMinutes']

# Overall distribution of Uganda survey durations, in minutes.
print(durations.min())
print(durations.mean())
print(durations.max())
print(durations.quantile([0.05, 0.25, 0.5, 0.75, 0.95]))


# Per-surveyor mean and total durations, longest first.
# The whole expression sits inside print(...): the old `print (x)[...]` form
# only works as a Python 2 statement and would subscript print's return value
# if read as a function call.
# NOTE: DataFrame.sort is the pre-0.17 pandas spelling (removed in 0.20);
# switch to sort_values once the installed pandas provides it.
uganda_by_surveyor = uganda.groupby(['Srvyr'])
print(
    uganda_by_surveyor[['Srvyr', 'DurationInMinutes', 'Quality']]
    .aggregate(np.mean)
    .sort('DurationInMinutes', ascending=False)
)
print(
    uganda_by_surveyor[['Srvyr', 'DurationInMinutes']]
    .aggregate(np.sum)
    .sort('DurationInMinutes', ascending=False)
)



#>>> x = time.strptime('00:01:00,000'.split(',')[0],'%H:%M:%S')
#>>> datetime.timedelta(hours=x.tm_hour,minutes=x.tm_min,seconds=x.tm_sec).total_seconds()

0.0
73.540273224
562.383333333
0.05     18.326667
0.25     29.166667
0.50     37.600000
0.75     57.966667
0.95    345.543333
Name: DurationInMinutes, dtype: float64
           DurationInMinutes   Quality
Srvyr                                 
Doreen            158.733333  0.779297
Joan              143.320588  0.820677
Linda              73.824775  0.788560
Sunday1            65.227037  0.795642
Peter              47.232937  0.836669
Charlotte          43.941667  0.800306
Alex               41.291270  0.846983
Destiny            40.653810  0.813177
           DurationInMinutes
Srvyr                       
Doreen           5079.466667
Joan             4872.900000
Sunday1          2935.216667
Linda            2731.516667
Peter            1983.783333
Alex             1734.233333
Charlotte        1669.783333
Destiny          1422.883333


