## A testbed to understand and test RITHM Parser's "ParseLogic" functionality

In [1]:
# Import necessary built-in (standard library) modules 
import sys

In [2]:
# Show the current working directory; the relative parser path that is
# added to sys.path in the next cell is interpreted relative to it
!pwd

/home/jcolditz/Jupyter Notebooks


In [3]:
# Add the RITHM Parser directory to the import path, then import its modules.
#
# The location is given relative to the notebook's working directory (shown
# by the previous cell); replace it with a direct (absolute) path if the
# RITHM checkout lives somewhere else.
import os

# Resolve to an absolute path once, so later imports keep working even if the
# working directory changes after this cell has run.
PARSER_PATH = os.path.abspath('../twitter/RITHM/parser')

# Guard the append so that re-running this cell does not pile up duplicate
# sys.path entries.
if PARSER_PATH not in sys.path:
    sys.path.append(PARSER_PATH)

import parselogic

In [4]:
# Display the module's built-in documentation: its functions, their signatures, and their comments
help(parselogic)

Help on module parselogic:

NAME
    parselogic

DESCRIPTION
    Created on Tue May 29 20:18:09 2018
    @author: colditzjb

FUNCTIONS
    emojifile(efile='emojilist.csv')
    
    emojify(text)
    
    match(test, text)
        # This performs matching on text, using boolean test phrases
    
    reformat(text, mode=1.0, modes={'tsv': '1.0', 'csv': '1.5', 'two': '2.0', 'mac': '2.5', 'hum': '3.5', 'kws': '4.5'}, lcase=False, ht_include=True, emoji=None)
        ### REFORMAT FUNCTION
        # 
        # This reformats text so that it is keyword searchable or machine/human readable
        # Format selection relies on bandwidths of float numbers (as above, so below)
        #   "mode" argument options currently include: 
        #     1.0 'tsv' = Only replace tabs, and hard returns (TSV compatability) 
        #                 This is currently the default for output. 
        #     1.5 'csv' = Replace commas, tabs, and hard returns (CSV compatability) 
        #     2.0 'two' = Use O

In [5]:
# Fixtures used by the matching demos below.

# Sample tweet-like strings, deliberately mixing case, punctuation, hashtags
# and quoted words.
texts = [
    'THIS TWEET is about Vaping and water-vapor.',
    'But I also want to know more about #vape, #vapor, and #vapelyfe hashtags;',
    'or maybe I only want to know about "vape"',
    '...and I do like VaporWave music, though this is irrelevant!',
]

# Boolean test phrases handed to parselogic.match().
# NOTE(review): '&', '|', '!' and '*' presumably mean AND, OR, NOT and
# wildcard -- confirm against the parselogic source.
tests = [
    # single terms
    'vape',
    '#vape',
    'vapor',
    # combinations of two terms
    'vape & vapor',
    'vape | vapor',
    'vape & !vapor',
    # patterns using '*'
    'vap*',
    'vap* & ! know about',
    '*wave mus*',
    'vap* & !*wave',
]


In [6]:
# Demonstration: match() applied to raw, un-reformatted text.
# As the output shows, none of the texts match in this state.

from parselogic import match

divider = '---------------------------------------'
for criterion in tests[:2]:  # only the first two test criteria, as an example
    # Header block: rule, the test phrase, rule, then the column titles
    header_lines = [divider,
                    'TEST ARGUMENT: ' + criterion,
                    divider,
                    'MATCH:\tORIGINAL TEXT:']
    print('\n'.join(header_lines))
    # One row per text: the match result, a tab, then the untouched text
    for raw_text in texts:
        print('{}\t{}'.format(match(criterion, raw_text), raw_text))
    print('\n')
        

---------------------------------------
TEST ARGUMENT: vape
---------------------------------------
MATCH:	ORIGINAL TEXT:
False	THIS TWEET is about Vaping and water-vapor.
False	But I also want to know more about #vape, #vapor, and #vapelyfe hashtags;
False	or maybe I only want to know about "vape"
False	...and I do like VaporWave music, though this is irrelevant!


---------------------------------------
TEST ARGUMENT: #vape
---------------------------------------
MATCH:	ORIGINAL TEXT:
False	THIS TWEET is about Vaping and water-vapor.
False	But I also want to know more about #vape, #vapor, and #vapelyfe hashtags;
False	or maybe I only want to know about "vape"
False	...and I do like VaporWave music, though this is irrelevant!




### Basic tokenization and text formatting rely on the _parselogic.reformat()_ function.
This is a basic implementation that lowercases text and pads punctuation with whitespace so that the text is easy to search and match. The text remains a plain string at this point. Once text has been formatted this way, _parselogic.match()_ can apply Boolean matching to it.
#### Here are some _parselogic.match()_ Boolean test examples...

In [7]:
# Be sure to first reformat() using mode='kws' (or mode=4.5) in order to do proper matching

# Import both helpers here so this cell does not rely on an earlier cell
# having imported match().
from parselogic import match, reformat

# This is the important step! Each text is formatted exactly once, up front:
# the formatted string depends only on the text, not on the test phrase, so
# there is no need to recompute it for every (test, text) pair.
# NOTE(review): assumes reformat() is deterministic for a given input -- confirm.
formatted_texts = [reformat(text, mode='kws') for text in texts]

for test_phrase in tests:
    print('---------------------------------------\n'+
          'TEST ARGUMENT: '+test_phrase+
          '\n---------------------------------------\n'+
          'MATCH:\tFORMATTED TEXT:')
    for formatted in formatted_texts:
        matched = match(test_phrase, formatted)
        print(str(matched)+'\t'+formatted)
    print('\n')
    

---------------------------------------
TEST ARGUMENT: vape
---------------------------------------
MATCH:	FORMATTED TEXT:
False	 this tweet is about vaping and water - vapor .  
True	 but i also want to know more about # vape ,  # vapor ,  and # vapelyfe hashtags ;  
True	 or maybe i only want to know about  " vape "  
False	  .  .  . and i do like vaporwave music ,  though this is irrelevant !  


---------------------------------------
TEST ARGUMENT: #vape
---------------------------------------
MATCH:	FORMATTED TEXT:
False	 this tweet is about vaping and water - vapor .  
True	 but i also want to know more about # vape ,  # vapor ,  and # vapelyfe hashtags ;  
False	 or maybe i only want to know about  " vape "  
False	  .  .  . and i do like vaporwave music ,  though this is irrelevant !  


---------------------------------------
TEST ARGUMENT: vapor
---------------------------------------
MATCH:	FORMATTED TEXT:
True	 this tweet is about vaping and water - vapor .  
True	 but i a