In [8]:
import pandas as pd
import numpy as np
import csv
from collections import defaultdict, Counter

In [None]:
# Load the stage-1 export into a DataFrame.
# NOTE(review): with pandas' default NA handling the literal text "NULL" is
# parsed as NaN, while the helper functions in this notebook compare against
# the string 'NULL' -- confirm which read options the later frames rely on.
df = pd.read_csv(filepath_or_buffer="export_dataframe_stage1.csv")

## Explore apparent multiplicity of some columns

Several measurements appear as numbered column pairs, e.g. `Salinity1` and `Salinity2`. Without knowing the original context for these pairs, the first question is: are the two values the same whenever both are not null?

Spoiler alert: the scraped version of the data indicates that these are two measurements of the same "sample" made at separate (still unclear) times, so taking their average is possibly the most appropriate way to combine them. (Open questions: how much time passes between the two measurements, and is that interval consistent across samples?)

In [13]:
# Column pairs that appear to record the same quantity twice; each tuple is
# (first column, second column), matched up position by position.
multi_names = list(zip(
    ("ph1", "Salinity1", "SecchiDisk1", "DissolvedOxygen1"),
    ("ph2", "Salinity2", "SecchiDisk2", "DissolvedOxygen2"),
))

In [10]:
def two_column_mutex_null_test(df: pd.DataFrame, col1: str, col2: str, null_value='NULL') -> dict:
    """Count rows by which of two columns hold the null sentinel.

    A cell is treated as "null" only when it compares equal to ``null_value``
    (the string ``'NULL'`` by default). Missing values such as NaN do not
    compare equal to the sentinel, so they are counted as not null.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing both columns.
    col1, col2 : str
        Names of the two columns to compare.
    null_value : optional
        Sentinel that marks a missing entry. Defaults to ``'NULL'``.

    Returns
    -------
    dict
        Row counts under the keys ``"both_null"``, ``"both_not_null"``,
        ``"first_null_only"``, ``"second_null_only"`` and ``"total"``
        (``len(df)``). The first four counts partition the rows of ``df``.
    """
    # Build each sentinel mask once and reuse it for all four combinations.
    first_null = df[col1] == null_value
    second_null = df[col2] == null_value
    return {
        "both_null": int((first_null & second_null).sum()),
        "both_not_null": int((~first_null & ~second_null).sum()),
        "first_null_only": int((first_null & ~second_null).sum()),
        "second_null_only": int((~first_null & second_null).sum()),
        "total": len(df),
    }

In [16]:
def both_not_null_filter(df: pd.DataFrame, col1: str, col2: str, null_value='NULL') -> pd.DataFrame:
    """Return the rows of ``df`` where neither column equals the null sentinel.

    A cell counts as null only when it compares equal to ``null_value``
    (``'NULL'`` by default). The original index labels are kept.
    """
    first_is_null = df[col1] == null_value
    second_is_null = df[col2] == null_value
    # De Morgan: "not null in both" is the same as "null in neither".
    return df[~(first_is_null | second_is_null)]

In [24]:
# For each column pair: report the null breakdown, then the fraction of rows
# (with both values present) whose two values are equal, then a few examples.
# NOTE(review): `d2` is not defined in any cell visible here (the load cell
# creates `df`) -- confirm it exists under Restart Kernel -> Run All.
for n1, n2 in multi_names:
    print("******")
    print(n1, n2)
    print(two_column_mutex_null_test(d2, n1, n2))
    test_df = both_not_null_filter(d2, n1, n2)
    # Equality is evaluated on the stored values as-is. A pair with no rows
    # where both values are present would make the ratio a division by zero,
    # so report NaN for the agreement fraction in that case.
    n_both_present = len(test_df)
    n_equal = len(test_df[test_df[n1] == test_df[n2]])
    print(n_equal / n_both_present if n_both_present else float("nan"), n_both_present)
    print(test_df[[n1, n2]].head())
    # Rows where the two values differ.
    anoms = test_df[test_df[n1] != test_df[n2]]
    print(anoms[[n1, n2]].head())

******
ph1 ph2
{'both_null': 25162, 'both_not_null': 38788, 'first_null_only': 0, 'second_null_only': 1, 'total': 63951}
0.9296174074456017 38788
   ph1  ph2
0    7    7
1  6.5  6.5
2  5.5  5.5
3    7    7
4  6.6  6.6
        ph1   ph2
21069     7  6.75
21071     7  6.75
21072  6.75     7
21074     6  6.25
21076   6.8   7.5
******
Salinity1 Salinity2
{'both_null': 57758, 'both_not_null': 6192, 'first_null_only': 0, 'second_null_only': 1, 'total': 63951}
0.8959948320413437 6192
     Salinity1 Salinity2
3987        27        27
3988        24        24
3990        30        30
3991        28        28
3992        34        34
      Salinity1 Salinity2
21154         7         8
21185        29        30
21186        32        33
21188        30        31
21317        24        22
******
SecchiDisk1 SecchiDisk2
{'both_null': 57488, 'both_not_null': 6463, 'first_null_only': 0, 'second_null_only': 0, 'total': 63951}
0.8106142658208263 6463
     SecchiDisk1 SecchiDisk2
3987          99       

## Spot check values on a single day

In [206]:
# Half-open window [day_lo, day_hi) covering the single day being spot-checked.
day_lo = pd.to_datetime("2005-07-30 00:00:00")
day_hi = pd.to_datetime("2005-07-31 00:00:00")

def day_test(dt_val):
    """Return whether ``dt_val`` falls on or after ``day_lo`` and strictly before ``day_hi``."""
    return day_lo <= dt_val < day_hi

In [210]:
# Show the first record whose event_date passes day_test, as a plain dict so
# that every column is visible rather than truncated.
dict(
    data.loc[data["event_date"].apply(day_test)]
    .iloc[0]
)

{'group_rid': 747,
 'GroupName': 'Sandy Creek Nature Center Volunteers',
 'site_rid': 580,
 'SiteName': 'Sandy  Creek',
 'SiteLocation': '33.9806  -83.3802',
 'event_rid': 3348,
 'event_date': Timestamp('2005-07-30 10:15:00'),
 'volunteer_time': '180',
 'data_entry': '9416',
 'participants': '5',
 'rain_24_hours': 'Intermittent Rain',
 'weather': 'Intermittent',
 'rain_hours': 'NULL',
 'rain_inches': 'NULL',
 'distance': 'NULL',
 'air_temperature': '24.5',
 'stream': 'NULL',
 'method': 'NULL',
 'wqi': 'NULL',
 'habitat': 'NULL',
 'Comments': 'NULL',
 'createdby': 'EPDMIG',
 'createddate': '3/10/17 11:48 PM',
 'chem_detail_rid': 12655,
 'air_temp': '24.5',
 'water_temp': '25',
 'calibrate': '0',
 'calibrate_comment': 'NULL',
 'chemical_comment': 'NULL',
 'do_saturation': '65.79',
 'reagent': 'NULL',
 'reagent_other': 'NULL',
 'ph1': '6.5',
 'ph2': '6.5',
 'DissolvedOxygen1': '5.5',
 'DissolvedOxygen2': '5.5',
 'Conductivity': 'NULL',
 'Salinity1': 'NULL',
 'Salinity2': 'NULL',
 'SecchiD

In [204]:
# Rows whose event_date matches this one timestamp exactly (not a day range).
test_day = data.loc[data["event_date"].eq("2005-07-30 12:00:00")]

In [205]:
# Display the rows selected for the exact 2005-07-30 12:00:00 timestamp.
test_day

Unnamed: 0,group_rid,GroupName,site_rid,SiteName,SiteLocation,event_rid,event_date,volunteer_time,data_entry,participants,...,hold_end_datetime,min_temp,max_temp,three_M_plate,ecoli_idexx,fecal_coliform,ecoli_other,ecoli_other_unit,comments,warnings
2995,433,UOWN,477,Bear Creek Tributary,33.9679 -83.4973,3466,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,250.0,,,,
3021,433,UOWN,662,McNutt Creek,33.9265 -83.4276,3500,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,1500.0,,,,
3022,433,UOWN,663,Orange Trail Creek,33.9029 -83.3796,3501,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,-100.0,,,,
3023,433,UOWN,664,Middle Oconee River,33.9576 -83.4384,3502,2005-07-30 12:00:00,60,6150,1,...,,,,,,,,,,
3024,433,UOWN,476,Hunnicutt Creek,33.9581 -83.437,3503,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,-100.0,,,,
3025,433,UOWN,666,Brooklyn Creek,33.9476 -83.3941,3504,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,-100.0,,,,
3026,433,UOWN,667,Sandy Creek,33.9806 -83.3819,3505,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,570.0,,,,
3027,433,UOWN,668,Barber Creek,33.9691 -83.3883,3506,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,2100.0,,,,
3028,433,UOWN,669,Trail Creek,33.9544 -83.3659,3507,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,10.0,,,,
3029,433,UOWN,670,Barber Creek,33.9669 -83.3906,3508,2005-07-30 12:00:00,60,6150,1,...,,,,1.0,,10.0,,,,
