This notebook generates sample data for the app and writes it to the `sample` directory.

In [11]:
# Set up the global variables for the script

import json
import os
import pandas
import math
import random
import datetime
import uuid

""" 
For events we use a selection of Splunk T-shirt slogans. This list was obtained by searching the web; it is not a definitive list and I suspect many were never printed :-)
"""

# Pool of message payloads; each generated event picks one of these at random.
log_lines = [
    "splunk> Finding your faults, just like mom.",
    "splunk> because ninjas are too busy",
    "splunk> All batbelt. No tights.",
    "splunk> Digs deeper than a jealous spouse.",
    "splunk> More flexible than an Olympic gymnast.",
    "splunk> The mars rover of the IT landfill.",
    "Splunk> The IT Search Engine.",
    "Splunk> Be an IT superhero. Go home early.",
    "Splunk> CSI: Logfiles.",
    "Splunk> Needle. Haystack. Found.",
    "Splunk> All batbelt. No tights.",
    "Splunk> Finding your faults, just like mom.",
    "Splunk> Australian for grep.",
    "Splunk> 4TW",
    "Splunk> See your world. Maybe wish you hadn’t.",
    "Splunk> Like an F-18, bro.",
    "Splunk> Now with more code!",
    "Splunk> Winning the War on Error",
    "Splunk> The Notorious B.I.G. D.A.T.A.",
    "Splunk> Map. Reduce. Recycle.",
    "Splunk> Take the sh out of IT.",
    "Splunk> I like big data and I cannot lie.",
    "splunk> I gotta fever, and the only cure is MOAR LICENSE!",
    "splunk> The corkscrew for your vintage data.",
    "splunk> Caught me on the server - Wasn't me.",
    # The backslash is escaped explicitly: a bare backslash-dot is an invalid
    # escape sequence that newer Python versions warn about. The resulting
    # string value is unchanged.
    "splunk> \"\"\\. nuff said.",
    "splunk> These are the droids you are looking for",
    "splunk> Finding disturbances in the Force before the Jedi Masters",
    "splunk> don't get caught up in the game of pwns",
    "splunk> We enjoy breaks more than Unions",
    "splunk> We line break for regular expressions",
    "splunk> The bran for your system",
    "splunk> Open a can of whooparse",
    "splunk> Show me your logs",
    "splunk> Rhymes with drunk",
    "splunk> Chasing tail since 2003",
    "splunk> this way: Run-D.M.C.",
    "splunk> Walking War Room!!",
    "splunk> IT like you mean it",
    "splunk ML> Solve problems you didn't know you were about to have",
    "Splunk> see the forest, and the trees",
    "Splunk> data with destiny",
    "Splunk> see the light before you tunnel",
]

"""
The script generates events at random times over a time range; by default this goes back 5 days and generates 1000 events each time.
"""
# length of one day in seconds (hours * minutes * seconds)
seconds_in_day = 24 * 60 * 60
# how many days the generated events span
date_range_days = 5
# how many events each generator cell produces
sample_readings = 1000

In [12]:
"""
This generates a list of events where the timestamp switches between 3 different timestamp formats.

This is a common problem in badly designed Splunk instances. People open a TCP port and then fire all sorts of different data in there.

Ideally we would create multiple sourcetypes and then assign a TCP port for each sourcetype. However, the example shows you how to patch the problem during ingestion.
"""

# Firstly we will create a list of time offsets, one per event.
# The offsets are positive: subtracting them from "now" places every event in
# the past (a negative offset would push the events into the future).
datetimes = [
    datetime.timedelta(seconds=random.randrange(1, date_range_days * seconds_in_day))
    for _ in range(sample_readings)
]

# We want the events to be sorted from the earliest to the latest. The largest
# offset is the oldest event, so it goes first.
# This isn't strictly necessary, but it is more realistic.
datetimes.sort(reverse=True)

# our three different date time formats
datetime_format = ["%Y-%m-%d %H:%M:%S", "%H:%M:%S %y-%m-%d", "%c"]

# make sure the output directory exists before creating the output file
output_path = "sample/conflicting_dates/mutliplexed_datetime_formats.log"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# take one reference time so every offset is applied to the same instant
now = datetime.datetime.now()

# iterate through the list of offsets and write the events out to disk;
# the with-block flushes and closes the file even if a write fails.
# UTF-8 is requested explicitly because the log lines contain non-ASCII text.
with open(output_path, "w", encoding="utf-8") as mutliplexed_datetime_formats:
    for offset in datetimes:
        # select a time format at random and use it
        timestamp = (now - offset).strftime(random.choice(datetime_format))
        # pick a random log line to use
        message = random.choice(log_lines)
        # write out the log line
        mutliplexed_datetime_formats.write(timestamp + " " + message + "\n")

In [8]:
"""
This script generates events where the date is embedded in the file name, but the timestamp is per event in the contents of the file.

We are going to create a map of dates to times, so that we can iterate through each day, create a file and fill it with events for that day.

To work around any weird rounding errors due to timezones we will generate the day and the seconds separately.
"""

# a map of "days back from today" -> list of offsets into that day
date_map = {}

for _ in range(sample_readings):
    # lets pick a random day back in time (0 = today)
    random_day = random.randrange(0, date_range_days)
    # lets pick the number of seconds into that day
    random_seconds = random.randrange(1, seconds_in_day)
    # enter the offset into the map, creating the entry for the day on first use
    date_map.setdefault(random_day, []).append(datetime.timedelta(seconds=random_seconds))

# make sure the output directory exists before creating any files
output_dir = "sample/compound_date_time"
os.makedirs(output_dir, exist_ok=True)

# take one reference date so all files are relative to the same "today"
today = datetime.date.today()

# iterate through the days in the map
for days_back in sorted(date_map):
    # the calendar day this file represents; subtracting moves back in time
    day = today - datetime.timedelta(days=days_back)
    # event times are measured from midnight of that day, so the time of day
    # never wraps around and always belongs to the date in the file name
    midnight = datetime.datetime.combine(day, datetime.time.min)
    # the file for the day's events is named after the day, e.g. "2020-02-12.log"
    filename = output_dir + "/" + day.strftime("%Y-%m-%d") + ".log"
    # the with-block flushes and closes the file even if a write fails
    with open(filename, "w", encoding="utf-8") as file_for_day:
        # ascending offsets from midnight == chronological order
        for offset in sorted(date_map[days_back]):
            # create the timestamp with the time of day only
            # NOTE: for days_back == 0 this can be later than the current time
            timestamp = (midnight + offset).strftime("%H:%M:%S")
            # write out the timestamp with a random log message
            file_for_day.write(timestamp + " " + random.choice(log_lines) + "\n")


In [13]:
"""
This script generates a CSV with 'useless' columns that we don't want to add into tsidx because they will bloat the size of the bucket.

We use pandas to build the CSV file, set headers, etc.
"""

# column headings describing the contents, in the order they appear in the CSV
column_names = ['primary_key', 'primary_value', 'repeated_field', 'random_nonsense', 'long_payload']

# Build every row first and construct the DataFrame once. Growing a frame row
# by row with DataFrame.append is quadratic, and the method no longer exists
# in pandas 2.x.
rows = [
    {
        'primary_key': row_number,
        'primary_value': random.randint(0, 999999),
        'repeated_field': "same silly value",
        # stored as text, which is how the UUID ends up in the CSV anyway
        'random_nonsense': str(uuid.uuid4()),
        'long_payload': random.choice(log_lines),
    }
    for row_number in range(sample_readings)
]

# passing the columns explicitly keeps the header even when there are no rows
useless_columns = pandas.DataFrame(rows, columns=column_names)

# make sure the output directory exists, then write out the CSV file
output_path = 'sample/drop_useless_columns/useless_columns.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
useless_columns.to_csv(output_path, sep=',', encoding='utf-8', index=False)


In [9]:
"""
This script generates a data set for importing directly into Splunk. We create sourcetype, source, host and index values and then use INGEST_EVAL + REGEX to extract the fields and copy them into the relevant fields.

The format aims to replicate the output of a Splunk search (the search itself is not recorded here). Each line holds the time, index, host, source, sourcetype and raw event, separated by `%%%`.
"""

# candidate metadata values; each event picks one of each at random
indexes = ['ingest_bob', 'ingest_tom', 'ingest_buttercup']
# each sourcetype is paired with the strftime format used for its raw timestamp
sourcetypes = [('ingest_bananas', "%c"), ('ingest_meat', "%Y-%m-%d %H:%M:%S"), ('ingest_pairs', "%H:%M:%S %y-%m-%d"), ('ingest_apples', "%d %a %Y %H:%M:%S")]
sources = ['sea', 'ground', 'sky', 'tree']
hosts = ['server', 'laptop', 'phone']

# separator between the encoded fields of one event
sep = "%%%"

# make sure the output directory exists before creating the output file
output_path = 'sample/import_data/encoded_splunk_events.txt'
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# take one reference time so every offset is applied to the same instant
now = datetime.datetime.now()

# NOTE: `datetimes` is the list of time offsets built by an earlier cell.
# Only the import file is opened here; reopening the multiplexed log in "w"
# mode would wipe the events written to it earlier.
with open(output_path, "w", encoding="utf-8") as import_events:
    for offset in datetimes:
        # unpack into a dedicated name so the notebook-level `datetime_format`
        # list is not overwritten
        (sourcetype, raw_time_format) = random.choice(sourcetypes)
        # compute the event time once so the epoch value and the raw
        # timestamp describe exactly the same instant
        event_time = now - offset
        time = str(event_time.timestamp())
        host = random.choice(hosts)
        source = random.choice(sources)
        index = random.choice(indexes)
        raw = event_time.strftime(raw_time_format) + " " + random.choice(log_lines)
        # field order: time, index, host, source, sourcetype, raw
        row = sep.join([time, index, host, source, sourcetype, raw])
        import_events.write(row + "\n")

In [15]:
""" This script generates a basic set of events for importing into splunk """

# One random, positive time offset per event; subtracting a positive offset
# from "now" places the event in the past (a negative offset would push it
# into the future).
datetimes = [
    datetime.timedelta(seconds=random.randrange(1, date_range_days * seconds_in_day))
    for _ in range(sample_readings)
]

# largest offset first == oldest event first, so the file is chronological
datetimes.sort(reverse=True)

# make sure the output directory exists before creating the output file
output_path = "sample/split_forwarding/events.log"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# take one reference time so every offset is applied to the same instant
now = datetime.datetime.now()

# the with-block flushes and closes the file even if a write fails
with open(output_path, "w", encoding="utf-8") as split_forwarding:
    for offset in datetimes:
        timestamp = (now - offset).strftime("%Y-%m-%d %H:%M:%S")
        split_forwarding.write(timestamp + " " + random.choice(log_lines) + "\n")


