In [372]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Read in data

### Counted logs per cache with checklogs.py:

There are 12,433 caches in this dataset.
The number of logs per cache ranges from 1 to 70, with a mean of about 10.11.



### Compiled logdata in compilelogs.py:

Logs are counted and combined into 'good', 'bad', and 'neutral' according to their log type:
 
| category | log types |
| ---      | ---       |
| good     | "Found it", "Enable Listing", "Will Attend" |
| neutral  | "Write note", "Owner Maintenance", "Post Reviewer Note", "Announcement", "Attended", "Publish Listing", "Webcam Photo Taken", "Temporarily Disable Listing", "Update Coordinates", "Unarchive", "Archive" |
| bad      | "Didn't find it", "Needs Maintenance", "Needs Archived" |


In [373]:
# Load the compiled per-cache dataset and print its schema
# (column names, dtypes and non-null counts).
compiled_csv = './data/compiled_data.csv'
df = pd.read_csv(compiled_csv)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12433 entries, 0 to 12432
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   code               12433 non-null  object 
 1   name               12433 non-null  object 
 2   good_logs_num      12433 non-null  int64  
 3   neutral_logs_num   12433 non-null  int64  
 4   bad_logs_num       12433 non-null  int64  
 5   good_logs_txt      12399 non-null  object 
 6   neutral_logs_txt   5027 non-null   object 
 7   bad_logs_txt       4731 non-null   object 
 8   creator            12433 non-null  object 
 9   cache_type         12433 non-null  object 
 10  container          12433 non-null  object 
 11  difficulty         12433 non-null  float64
 12  terrain            12433 non-null  float64
 13  latitude           12433 non-null  float64
 14  longitude          12433 non-null  float64
 15  placed             12433 non-null  object 
 16  status             124

In [374]:
# Sanity check: a missing log text should only occur for caches that have zero
# logs of that category. For each category, count the rows that have at least
# one log but no text.

ch = df.loc[df['good_logs_num'] > 0, 'good_logs_txt'].isna().sum()
print(f'NULL good    log text cases where good_logs_num    >0: {ch}')

ch = df.loc[df['neutral_logs_num'] > 0, 'neutral_logs_txt'].isna().sum()
print(f'NULL neutral log text cases where neutral_logs_num >0: {ch}')

ch = df.loc[df['bad_logs_num'] > 0, 'bad_logs_txt'].isna().sum()
print(f'NULL bad     log text cases where bad_logs_num     >0: {ch}')


NULL good    log text cases where good_logs_num    >0: 0
NULL neutral log text cases where neutral_logs_num >0: 0
NULL bad     log text cases where bad_logs_num     >0: 0


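Good: every missing log text belongs to a cache with zero logs of that category, so the nulls can safely be replaced with empty strings.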

In [375]:
# Represent "no logs of this category" as an empty string instead of NaN.
# The result is assigned back rather than calling fillna(inplace=True) on
# df[col]: the in-place form operates on an intermediate object and leaves df
# unchanged when pandas Copy-on-Write is enabled.
log_text_cols = ['good_logs_txt', 'neutral_logs_txt', 'bad_logs_txt']
df[log_text_cols] = df[log_text_cols].fillna('')


In [376]:
# Combine name + short_description + long_description + hints into a single
# text field per cache, and replace each optional field (all but name) with a
# 0/1 dummy recording whether it was populated.

names  = list(df['name'])
s_desc = list(df['short_description'].fillna(''))
l_desc = list(df['long_description'].fillna(''))
hints  = list(df['hints'].fillna(''))

num_caches = len(names)
cache_text = []

for n, name in enumerate(names):
    # Keep only the populated fields and join them with a space, so the last
    # word of one field is not fused with the first word of the next.
    populated = [field[n] for field in (s_desc, l_desc, hints) if len(field[n]) > 0]
    cache_text.append(' '.join([name] + populated))

    if n % 500 == 0:
        pct_complete = np.round(n*100/num_caches, 3)
        print(f'{n} of {num_caches} caches, {pct_complete}% Complete...')

# The text itself now lives in cache_text; reduce each optional field to its
# populated (1) / empty (0) flag.
s_desc = [1 if len(text) > 0 else 0 for text in s_desc]
l_desc = [1 if len(text) > 0 else 0 for text in l_desc]
hints  = [1 if len(text) > 0 else 0 for text in hints]


0 of 12433 caches, 0.0% Complete...
500 of 12433 caches, 4.022% Complete...
1000 of 12433 caches, 8.043% Complete...
1500 of 12433 caches, 12.065% Complete...
2000 of 12433 caches, 16.086% Complete...
2500 of 12433 caches, 20.108% Complete...
3000 of 12433 caches, 24.129% Complete...
3500 of 12433 caches, 28.151% Complete...
4000 of 12433 caches, 32.172% Complete...
4500 of 12433 caches, 36.194% Complete...
5000 of 12433 caches, 40.216% Complete...
5500 of 12433 caches, 44.237% Complete...
6000 of 12433 caches, 48.259% Complete...
6500 of 12433 caches, 52.28% Complete...
7000 of 12433 caches, 56.302% Complete...
7500 of 12433 caches, 60.323% Complete...
8000 of 12433 caches, 64.345% Complete...
8500 of 12433 caches, 68.366% Complete...
9000 of 12433 caches, 72.388% Complete...
9500 of 12433 caches, 76.41% Complete...
10000 of 12433 caches, 80.431% Complete...
10500 of 12433 caches, 84.453% Complete...
11000 of 12433 caches, 88.474% Complete...
11500 of 12433 caches, 92.496% Complete...

In [377]:

# Overwrite the three optional text columns with their populated/empty flags
# and add the combined text as a new column.
replacement_columns = {
    'short_description': s_desc,
    'long_description': l_desc,
    'hints': hints,
    'cache_text': cache_text,
}
for column, values in replacement_columns.items():
    df[column] = values



In [378]:
# Turn travel_bugs into a 0/1 flag: 1 when the field holds any text,
# 0 when it is missing or empty.
df['travel_bugs'] = [int(len(tb) > 0) for tb in df['travel_bugs'].fillna('')]

In [379]:
# Distinct creators and, aligned element-by-element, how many caches each one
# placed. value_counts() tallies every creator in a single pass; filtering the
# whole frame once per creator is quadratic in the number of rows.
caches_per_creator = df['creator'].value_counts().to_dict()
creators = df['creator'].unique()
created  = [int(caches_per_creator.get(c, 0)) for c in creators]

In [380]:
# How many creators are mega-creators (at least 50 caches each)?

len([count for count in created if count >= 50])

31

In [381]:
# Collapse every creator with fewer than 50 caches into a single "other"
# label; creators with 50 or more caches keep their own name.

creator_dict = {
    creator: (creator if count >= 50 else 'other')
    for creator, count in zip(creators, created)
}

df['creator'] = [creator_dict[c] for c in df['creator']]


In [382]:
# Caches per creator label after grouping small creators under 'other'.
df['creator'].value_counts()

other                              9493
justinpike                          336
terri2south                         214
horseshoechamp                      190
gatorman83                          168
3-happy-campers                     164
CCSGA                               120
memomls                             118
Bateshavingfun                      113
finz2lr                             105
Arabia Alliance                      93
davidandbritt                        90
zanna                                82
89GTA & GEO PSYCHO                   77
TeamT17                              73
Cali9-1-1                            71
alienchauncey                        67
ChemInstr                            64
Funfinderone                         64
12stepkings                          64
Creekwader                           61
djstover                             61
horseshoechamp & Shine Run Crew      60
GTCACHERS                            60
opted-out user                       58


In [383]:
# Distinct cache_type codes present in the data.
df['cache_type'].unique()

array(['T', 'U', 'B', 'R', 'M', 'V', 'W', 'I', 'E', 'C', 'X', 'Z', 'F'],
      dtype=object)

In [384]:
# One-letter cache type codes -> readable names.
# ref: https://gsak.net/help/hs21040.htm, https://www.geocaching.com/about/cache_types.aspx
cache_types = dict([
    ('T', 'Traditional'),
    ('U', 'Unknown/Mystery'),
    ('B', 'Letterbox'),
    ('R', 'Earth'),
    ('M', 'Multi'),
    ('V', 'Virtual'),
    ('W', 'Webcam'),
    ('I', 'Wherigo'),
    ('E', 'Event'),
    ('C', 'Cache In Trash Out'),
    ('X', 'Maze Exhibit'),
    ('Z', 'Mega event'),
    ('F', 'Lost and Found Event'),
])

# Plain dict indexing (not .get) so an unmapped code raises KeyError.
df['cache_type'] = [cache_types[code] for code in df['cache_type']]

In [385]:
# Caches per cache type, using the readable names.
df['cache_type'].value_counts()

Traditional             9856
Unknown/Mystery         1737
Multi                    429
Earth                    140
Letterbox                111
Virtual                   90
Wherigo                   54
Event                      5
Cache In Trash Out         4
Webcam                     4
Maze Exhibit               1
Mega event                 1
Lost and Found Event       1
Name: cache_type, dtype: int64

In [386]:
# Container sizes among Traditional caches.
df.loc[df['cache_type'] == 'Traditional', 'container'].value_counts()

Micro      4278
Small      2531
Regular    1890
Other       837
Unknown     253
Large        67
Name: container, dtype: int64

In [387]:
# Container sizes among all non-Traditional caches.
df.loc[df['cache_type'] != 'Traditional', 'container'].value_counts()

Micro      931
Regular    617
Small      528
Other      283
Unknown     92
Virtual     88
Large       38
Name: container, dtype: int64

In [388]:
# Encode the container as an ordinal size: Virtual=0 < Micro=1 < Small=2 <
# Regular=3 < Large=4. Any other container value (e.g. 'Other', 'Unknown')
# becomes NaN for Traditional caches and 0 for every other cache type.
size_codes = {'Virtual': 0, 'Micro': 1, 'Small': 2, 'Regular': 3, 'Large': 4}

sizes = []
# zip pairs cache type and container by position, so the pairing does not
# depend on df having a default 0..n-1 index (as an integer label lookup would).
for ctype, container in zip(df['cache_type'], df['container']):
    if container in size_codes:
        s = size_codes[container]
    elif ctype == 'Traditional':
        s = np.nan
    else:
        s = 0
    sizes.append(s)

df['size'] = sizes

In [389]:
# Median size code among Traditional caches.
is_traditional = df['cache_type'] == 'Traditional'
m = df.loc[is_traditional, 'size'].median()
m

2.0

In [390]:
# Impute the unknown sizes with the Traditional-cache median `m` computed in
# the previous cell, instead of a hard-coded 2 that would go stale if the data
# changes. The result is assigned back because fillna(inplace=True) on df[col]
# does not update df when pandas Copy-on-Write is enabled.
df['size'] = df['size'].fillna(m)

# 'container' is now fully encoded in 'size'.
df = df.drop(columns=['container'])

In [391]:
# Peek at the 'placed' values before converting them to datetimes.
df['placed'].head()

0    2008-04-29
1    2020-11-06
2    2020-09-26
3    2004-03-22
4    2019-09-02
Name: placed, dtype: object

In [392]:
from datetime import datetime

In [393]:
# Parse the 'placed' strings as dates, requiring an exact YYYY-MM-DD match.
df['placed'] = pd.to_datetime(df['placed'], format='%Y-%m-%d', exact=True)

In [394]:
# Confirm the conversion: same values, now with a datetime dtype.
df['placed'].head()

0   2008-04-29
1   2020-11-06
2   2020-09-26
3   2004-03-22
4   2019-09-02
Name: placed, dtype: datetime64[ns]

In [395]:
# Which status codes occur, and how often?
df['status'].value_counts()

A    12255
T      178
Name: status, dtype: int64

In [396]:
# Encode status as a binary flag: 'A' (Active) -> 1, 'T' (Temporarily
# Disabled) -> 0. Any other value is passed through unchanged, as replace()
# did. The result is assigned back because replace(inplace=True) on df[col]
# does not update df when pandas Copy-on-Write is enabled.
status_codes = {'A': 1, 'T': 0}
df['status'] = df['status'].map(lambda s: status_codes.get(s, s))


In [397]:
# Re-inspect the schema after cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12433 entries, 0 to 12432
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   code               12433 non-null  object        
 1   name               12433 non-null  object        
 2   good_logs_num      12433 non-null  int64         
 3   neutral_logs_num   12433 non-null  int64         
 4   bad_logs_num       12433 non-null  int64         
 5   good_logs_txt      12433 non-null  object        
 6   neutral_logs_txt   12433 non-null  object        
 7   bad_logs_txt       12433 non-null  object        
 8   creator            12433 non-null  object        
 9   cache_type         12433 non-null  object        
 10  difficulty         12433 non-null  float64       
 11  terrain            12433 non-null  float64       
 12  latitude           12433 non-null  float64       
 13  longitude          12433 non-null  float64       
 14  placed

The data frame is ready for experimentation now. I'll start with some simple models, depending on the type of features.

In [398]:
# Write the cleaned frame to CSV, without the row index.
df.to_csv('./data/cleaned_data.csv',index=False)

In [399]:
# Feature type names and, at the same position in features_sets, the set of
# column names belonging to that type.
features_types = ['numerical', 'binary', 'text', 'categorical', 'datetime', 'identifiers']
features_sets = [
    # numerical
    {'good_logs_num', 'neutral_logs_num', 'bad_logs_num',
     'difficulty', 'terrain', 'latitude', 'longitude', 'size', 'fav_points'},
    # binary
    {'status', 'is_premium', 'short_description', 'long_description', 'hints', 'travel_bugs'},
    # text
    {'good_logs_txt', 'neutral_logs_txt', 'bad_logs_txt', 'cache_text'},
    # categorical
    {'creator', 'cache_type'},
    # datetime
    {'placed'},
    # identifiers
    {'code', 'name'},
]



In [400]:
# Build a summary table with one row per column of df: the column name, its
# feature type, and one sample value.

# Column name -> feature type. setdefault keeps the first type listed if a
# column ever appears in more than one set.
type_of = {}
for ftype, members in zip(features_types, features_sets):
    for member in members:
        type_of.setdefault(member, ftype)

cols = list(df.columns)

# A column missing from every set is labelled 'unknown', so ftypes always has
# exactly one entry per column and the DataFrame constructor below cannot fail
# on mismatched lengths.
ftypes = [type_of.get(col, 'unknown') for col in cols]

samples = []
for col in cols:
    # First value that is not an empty string; None if there is no such value.
    # next(iter(...)) avoids materialising the whole column as a list.
    non_blank = df.loc[df[col] != '', col]
    samples.append(next(iter(non_blank), None))

features = pd.DataFrame({'feature': cols, 'feature type': ftypes, 'sample': samples})

In [401]:
# Display the feature summary table.
features

Unnamed: 0,feature,feature type,sample
0,code,identifiers,GC1A1E7
1,name,identifiers,Nickajack Two for One
2,good_logs_num,numerical,9
3,neutral_logs_num,numerical,0
4,bad_logs_num,numerical,1
5,good_logs_txt,text,Found with my sweetie TeamMonroe1. TFTC! Foun...
6,neutral_logs_txt,text,Congrats on the FTF Geaux Published There is ...
7,bad_logs_txt,text,Looked all over but no luck
8,creator,categorical,other
9,cache_type,categorical,Traditional


In [351]:
# Write the feature summary table to CSV, without the row index.
features.to_csv('./data/features.csv',index=False)