<h1> Pre-Processing 3 (networks only) </h1>

Steps:
- Lemmatization
- Removing different types of words (see below)

In [11]:
#imports
import pandas as pd
import pickle
import re
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# Fetch the NLTK resources this notebook relies on (a no-op when already installed).
nltk.download('averaged_perceptron_tagger')  # model used by nltk.pos_tag
nltk.download('wordnet')  # corpus that WordNetLemmatizer looks words up in
nltk.download('stopwords')  # word lists behind stopwords.words('english')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\aksel\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aksel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
# Load the DataFrame saved at the end of Pre-processing 2
df = pd.read_pickle(filepath_or_buffer="processed_data_2.pkl")

<h4> Time words </h4>

In [13]:
# Three seed lists of time-related words; their sorted concatenation is time_words.
ntp_words = [
    'time', 'period', 'periods', 'duration', 'clock', 'temporal', 'spacetime',
    'timespan', 'timespans', 'timeline', 'timelines', 'elapse', 'elapsed', 'length',
    'timewise', 'velocity', 'pace', 'rate', 'tempo', 'pass', 'passing', 'passed',
]
ftp_words = [
    'quick', 'quicker', 'quickly', 'quickest',
    'fast', 'faster', 'fastest', 'fastened',
    'rapid', 'rapidly',
    'short', 'shorter', 'shortly', 'shortest',
    'speedy', 'speedy', 'speeded', 'speedier',
    'hurry', 'hurried',
    'swift', 'swifter', 'swiftly',
    'haste', 'hasty', 'brisk', 'turbo',
    'accelerate', 'acceleration', 'accelerated', 'accelerating',
]
stp_words = [
    'slow', 'slower', 'slowly', 'slows', 'slowed', 'slowest', 'slowing', 'slowdown',
    'long', 'looong', 'longer', 'longer', 'longest',
    'steady',
    'deceleration', 'decelerate', 'decelerating', 'decelerated',
    'dilatory', 'dilation', 'infinity', 'eternity',
    'lengthy', 'prolonged', 'protracted', 'extended', 'unending', 'endless',
]
# Alphabetically ordered concatenation of the three lists (repeated entries are kept)
time_words = sorted([*ntp_words, *ftp_words, *stp_words])


<h3> Lemmatization </h3>

- Example: 'months' -> 'month', 'running' -> 'run'.

- Words whose lemma is a time word are left unchanged. For example, it is useful to keep 'time' and 'times' separate.

In [14]:
# Flatten the token sequences of all documents into one corpus-wide token list.
# NOTE(review): assumes each df.text entry is a list of string tokens -- confirm against Pre-processing 2
corpus_list = [token for text in df.text for token in text]

# List of (word, tag) tuples pairing the Erowid vocabulary with NLTK POS tags
# (POS = Part of Speech).
# The vocabulary is sorted before tagging: the tagger takes neighbouring tokens
# into account, and the iteration order of a set of strings can change between
# interpreter runs, so tagging the bare set gave tags that differed from run to run.
corpus_tagged_tup = nltk.pos_tag(sorted(set(corpus_list)))

# Erowid vocabulary with POS tags as dict: word -> POS tag
corpus_tagged = dict(corpus_tagged_tup)


#Returns the WordNet POS tag that corresponds to an NLTK POS tag ('' for any other tag type)
def get_wordnet_pos(treebank_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision depends only on the first character of ``treebank_tag``:
    'J' -> ``wordnet.ADJ``, 'V' -> ``wordnet.VERB``, 'N' -> ``wordnet.NOUN``,
    'R' -> ``wordnet.ADV``. Every other tag yields the empty string.
    """
    # First tag letter -> name of the wordnet attribute to hand back
    attribute_by_initial = {'J': 'ADJ', 'V': 'VERB', 'N': 'NOUN', 'R': 'ADV'}
    attribute_name = attribute_by_initial.get(treebank_tag[:1])
    if attribute_name is None:
        return ''
    return getattr(wordnet, attribute_name)

# Lemmatize all adjectives, verbs, nouns and adverbs in df.text.
#
# The lemma is worked out once per vocabulary word (the POS tag is stored per
# word in corpus_tagged, so every occurrence of a word gets the same lemma) and
# the column is then rewritten in a single pass. This avoids addressing rows via
# df.loc[<enumerate position>], which is only correct for a default 0..n-1 index.
lemmatizer = WordNetLemmatizer()
time_word_lookup = set(time_words)

# word -> lemma, only for the words that have to be replaced
lemma_by_word = {}
for word, tag in corpus_tagged.items():
    if not tag.startswith(('J', 'V', 'N', 'R')):
        continue
    lemmatized_word = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    # Leave the word untouched when its lemma is a time word, so that
    # e.g. 'times' is not collapsed into 'time'.
    if lemmatized_word not in time_word_lookup:
        lemma_by_word[word] = lemmatized_word

# NOTE(review): assumes each df.text entry is a list of string tokens -- confirm
df["text"] = df["text"].apply(
    lambda tokens: [lemma_by_word.get(token, token) for token in tokens]
)


<h3> Remove various types of words </h3>

The following will remove:
- NLTK stop words (see: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/)
- Numbers 1-100 as words
- Names of substances and classes
- Rare words (occurring fewer than 10 times in the corpus)
- Some time words (seconds, minutes) - Due to their high frequency in language use, they dominate the pattern.
- Unimportant common words in the Erowid corpus identified by Ballentine et al. (2022)
- Unimportant common words identified by myself
- Words containing only 1 or 2 letters

<h6> Ballentine, G., Friedman, S. F., & Bzdok, D. (2022). Trips and neurotransmitters: Discovering principled patterns across 6850 hallucinogenic experiences. In Sci Adv (Vol. 8, Issue 11, p. eabl6989). https://doi.org/10.1126/sciadv.abl6989 </h6>


In [15]:
# English stop-word list shipped with NLTK
stop_words = stopwords.words(fileids='english')

#numbers as words
def _build_numbers_as_words():
    """Return the numbers 1-100 spelled out as lowercase words.

    Compound numbers are hyphenated ("twenty-one"). The forties are generated
    twice, once spelled "forty" and once with the common misspelling "fourty".
    Generating the list guarantees that every "fourty-..." form is present; the
    hand-written list had "forty-two" in the place of "fourty-two".
    The list ends with "one hundred" and "hundred".
    """
    units = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
    teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
             "sixteen", "seventeen", "eighteen", "nineteen"]
    tens = ["twenty", "thirty", "forty", "fourty", "fifty",
            "sixty", "seventy", "eighty", "ninety"]

    words = units + teens
    for ten in tens:
        words.append(ten)
        words.extend(ten + "-" + unit for unit in units)
    words += ["one hundred", "hundred"]
    return words


numbers_as_words = _build_numbers_as_words()


#substances and classes
# Names of substances (including spelling/label variants) followed by names of
# substance classes. Every entry is separated by an explicit comma: adjacent
# string literals are concatenated by Python, which previously fused "glory"
# and "psychedelic" into one useless entry.
substances = [
    "lsd", "psilocybin mushrooms", "morning glory", "2c-i", "5-meo-dmt", "dmt", "25i-nbome", "argyreia nervosa", "2c-e", "5-meo-dipt",
    "amt", "2c-t-7", "2c-b", "echinopsis pachanoi", "dpt", "5-meo-dalt", "5-meo-mipt", "2c-c", "4-aco-dmt", "psychotria viridis",
    "dom", "tma-2", "doc", "2c-p", "2c-t-21", "4-ho-mipt", "2c-d", "4-ho-dipt", "1p-lsd", "dob-dragonfly",
    "4-acetoxy-dipt", "dob", "4-aco-det", "doi", "iboga", "ibogaine", "al-lad", "anadenanthera peregrina", "2c-t-2", "ayahuasca",
    "5-meo-amt", "mimosahuasca", "echinopsis peruviana", "anadenanthera colubrina", "lophophora williamsii", "4-acetoxy-mipt", "mescaline", "dipt", "2c-t-4", "dxm",
    "amanita muscaria", "methoxetamine", "3-meo-pcp", "ketamine", "nitrous oxide", "pcp", "mdma", "mda", "6-apb", "butylone",
    "ethylone", "mephedrone", "methylone", "mdai", "mbdb", "iap", "datura spp.", "brugmansia spp.", "scopolamine", "dimenhydrinate",
    "atropa belladona", "cannabis spp.", "diphenhydramine", "heroine", "zolpidem", "piper methysticum", "jwh-018", "hydrocodone", "fentanyl", "alprazolam",
    "melatonin", "alcohol (hard)", "sceletium tortuosum", "leonotis leonurus", "turnera diffusa", "morphine", "1,4-butaneidol", "cyclobenzaprine", "clonazepam", "opium",
    "piracetam", "lorazepam", "passiflora spp.", "alcohol (beer-wine)", "triazolam", "mitragyna speciosa", "tramadol", "synthetic cannabis", "oxycodone", "gabapentin",
    "papaver somniferum", "methadone", "alcohol", "codeine", "cannabinoid receptor agonists", "buprenorphine", "ether", "valeriana officinalis", "nymphaea caerulea", "gbl",
    "hash", "lactuca virosa", "hydromorphone", "carisoprodol", "zopiclone", "diazepam", "etizolam", "barbiturates", "meperidine", "cocaine",
    "myristica spp.", "caffeine", "mdpv", "methylphenidate", "ethylphenidate", "ilex paraguariensis", "ephedrine", "dmae", "coffea spp.", "betel nut",
    "2-aminoindan", "amphetamine", "metamphetamine", "modafinil", "nicotiana tabacum", "crack", "4-fluoroamphetamine", "adrafinil", "substituted piperazines", "tfmpp",
    "benzylpiperazine", "atomoxetine", "propylhexedrine", "calea zacatechichi", "silene undulata", "paroxetine", "bupropion", "trazadone", "sertraline", "olanzapine",
    "venlafaxine", "quetiapine", "mirtazapine", "amitriptyline", "hypericum perforatum", "salvia divinorum", "yohimbe", "acorus calamus", "nepeta cataria", "heimia salicifolia", "5-htp",

    'ritalin', 'changa', 'cigarette', 'speed', 'dextroamphetamine', 'klonopin', 'mdxx', 'opiate', "psychedelics", 'coke', 'yadda', 'vapor', 'shang', 'shrooms',
    'lsd', 'psilocybin', 'mushroom', 'mushrooms', 'mushroom - p. cubensis', 'magic mushrooms', 'magic mushrooms (sclerotia)', "'mushrooms'",
    'mushrooms  - p. cubensis', 'mushrooms - c. cyanescens', 'mushrooms - p cubensis', 'mushrooms - p. arcana', 'mushrooms - p. atlantis (sclerotia)',
    'mushrooms - p. azurescens', 'mushrooms - p. azurescens?', 'mushrooms - p. baeocystis', 'mushrooms - p. cubenesis', 'mushrooms - p. cubenesis (amazonian strain)',
    'mushrooms - p. cubensis', 'mushrooms - p. cubensis (albino penis envy)', 'mushrooms - p. cubensis (amazon strain)', 'mushrooms - p. cubensis (cambodian)',
    "mushrooms - p. cubensis ('cambodian')", 'mushrooms - p. cubensis (dried)', 'mushrooms - p. cubensis (ecuadorean)', 'mushrooms - p. cubensis (extract)',
    'mushrooms - p. cubensis (fresh)', 'mushrooms - p. cubensis (golden teacher)', 'mushrooms - p. cubensis (golden teachers)', 'mushrooms - p. cubensis (in chocolate)',
    'mushrooms - p. cubensis (mexican)', 'mushrooms - p. cubensis (mycelium)', 'mushrooms - p. cubensis (penis envy)', 'mushrooms - p. cubensis (pes amazonia)',
    'mushrooms - p. cubensis (sclerotia)', 'mushrooms - p. cubensis (smoked)', 'mushrooms - p. cubensis (tasmanian strain)', 'mushrooms - p. cyanescens',
    'mushrooms - p. cyanescens (smoked)', 'mushrooms - p. galindoi', 'mushrooms - p. mexicana', 'mushrooms - p. mexicana (fresh)', 'mushrooms - p. mexicana (sclerotia)',
    'mushrooms - p. mexicana (truffles)', 'mushrooms - p. ovoideocystidiata', 'mushrooms - p. semilanceata', 'mushrooms - p. subaeruginosa',
    'mushrooms - p. subbalteatus or p. papilionaceus', 'mushrooms - p. tampanensis', 'mushrooms - p. tampanensis (sclerotia)', 'mushrooms - p. tampanensis (truffles)',
    'mushrooms - p. weilii', 'mushrooms - p. zapotecorum', 'mushrooms - panaeolus cyanescens', 'mushrooms - panaeolus subbalteatus', 'mushrooms - psilocybe cyanescens',
    'mushrooms (blue vein)', 'mushrooms (dried)', 'mushrooms (edible)', 'mushrooms (extract)', 'mushrooms (fresh)', "mushrooms ('gold caps')", 'mushrooms (golden caps)',
    'mushrooms (hawaiian)', 'mushrooms (in chocolate)', 'mushrooms (magic mushrooms)', 'mushrooms (magic)', 'mushrooms (mexican)', 'mushrooms (p. cubensis - mexican strain)',
    'mushrooms (p. cubensis smoked)', 'mushrooms (p. cubensis)', 'mushrooms (p. cyanescens)', 'mushrooms (p. pelliculosa)', 'mushrooms (sclerotia)', 'mushrooms (smoked)',
    'mushrooms (thai)', 'mushrooms- p. cubensis', 'mushroooms',
    'cannabis', 'morning glorys', ' morning glory', 'mdma (ecstasy)', 'methamphetamine', 'ecstasy', 'meth', 'kratom', 'nutmeg', 'datura', 'amphetamines',
    'cacti - t. pachanoi', 'kava', 'kava kava', 'absinthe', 'alcohol - hard', 'alcohol (rum)', 'alcohol (whiskey)', 'alcohol (mead)', 'alcohol (vodka)',
    'alcohol - (liquor)', 'alcohol - (wine)', ' alcohol - hard', 'absinthe  (homemade)', 'absinthe (czech)', 'absinthe (homemade)', 'beer', 'wine',
    'alcohol - beer', 'alcohol - wine', 'alcohol (mead)', 'alcohol - (wine)', 'alcohol - (wine)', 'alcohol - (wine)',
    '4-methylmethcathinone', '4-methylmethcathinone (mephedrone)', 'ambien', 'zolpidem (ambien)', ' zolpidem (ambien)',
    ' h.b. woodrose', 'h.b. woodrose seeds', 'woodrose', 'h.b. woodrose', 'h. b. woodrose', 'hawaiian baby woodrose seeds', 'hawaiian baby woodrose seeds',
    'hawaiian baby woodrose seeds', 'h.b. woodrose (hbw)', 'catnip', 'cacti - t. peruvianus', '4-methylmethcathinone',
    'brugmansia', 'brugmansia (tree datura)', 'brugmansia sanguinea', 'brugmansia suaveolens', 'damiana', 'tobacco', 'tobacco - cigarettes', 'tramadol (ultram)',
    'blue lotus', 'dimenhydrinate (dramamine)', 'dramamine (dimenhydrinate)', 'dramamine (dimenhydrinate)', 'poppies - opium', 'poppies', 'poppies',
    'oxycodone (oxycontin)', 'oxycontin', 'roxicodone', 'peyote', 'cannabis - hash', 'calamus', 'yerba mate', "st. john's wort", "st. john's wort",
    'bromo-dragonfly', 'coffee', 'caffeine (coffee)', 'caffeine (coffee)', 'caffeine (coffee)', 'caffeine (coffee)', 'caffeine (coffee)',
    'quetiapine (seroquel)', 'seroquel', 'gabapentin (neurontin)', 'neurontin', 'modafinil (provigil)', 'provigil', ' venlafaxine', 'venlafaxine (effexor)', 'effexor',
    'paroxetine (paxil)', 'paxil', 'bupropion (wellbutrin)', 'wellbutrin', '4-Acetoxy-DET', 'diphenhydramine (benadryl)', 'benadryl', 'triazolam (halcion)', 'halcion',
    'coca', 'valerian', 'piperazines - bzp', 'piperazines', 'piperazines - mcpp', 'piperazines - mcpp', 'piperazines - mcpp', 'silene capensis', 'passion flower',
    'mimosa tenuiflora', 'dmt (extracted from m. tenuiflora)', 'dmt (extracted from m. tenuiflora)', 'belladonna', '1-4-butanediol', '1,4 butanediol', 'mipt', 'det',
    "salvia", "divinorum", "glory",

    # substance classes
    "psychedelic", "entactogen", "entactogens", "deliriant", "depressant", "sedative", "stimulant", "oneirogens", "antidepressant", "antipsychotics",
]


#rare words
# Frequencies are counted on the current contents of df.text rather than on
# corpus_list: corpus_list was collected before lemmatization, so its counts are
# split across inflected forms ('run' / 'running' / 'runs') and would mark a
# lemma as rare even when the lemmatized corpus contains it many times.
RARE_WORD_MIN_COUNT = 10  # words occurring fewer times than this are removed
counter = Counter(token for text in df.text for token in text)
rare_words = [word for word, count in counter.items() if count < RARE_WORD_MIN_COUNT]

# Time words to remove: each unit in singular and plural form, plus "times" and "spent"
non_seed_time_words = [
    form
    for unit in ("second", "minute", "hour", "day", "week", "weekend", "month", "year")
    for form in (unit, unit + "s")
] + ["times", "spent"]


#words from Ballentine (2022)
# Every entry is separated by an explicit comma: adjacent string literals are
# concatenated by Python, which previously fused 'heroin' and 'inject' into
# the single entry 'heroininject', so 'heroin' was never removed.
ballentine_common_words = ['the', 'this', 'that', 'not', 'and', 'have', 'there', 'all', 'then', 'what', 'but', 'would', 'for', 'with', 'will', 'was', 'thing',
                'get', 'could', 'from', 'more', 'etc', 'who', 'out', 'another', 'like', 'too', 'while', 'about', 'more', 'less', 'way', 'on',
                'she', 'her', 'him', 'his', 'our', "i'm", 'i’m', 'are', 'can’t', "i'd", 'i’d',  'ich', 'der', 'das',
                "didn't", "don't", "dont", "i've", "it's", "wasn’t", "can't", "wouldn't", "couldn't", "couldn´t", "won't", "i'll",
                'them', 'were', 'they', 'through', 'back', 'being', 'only', 'also',
                'went', 'some', 'again', 'into', 'after', 'around', 'down', 'just', 'very', 'things', 'when', 'over', 'other', 'before',
                'because', 'which', 'took', 'than', 'before', 'still', 'didn’t',
                'it’s', 'i’ve', 'didnt', 'didn´t', 'couldnt', 'couldn’t', 'their',
                'don’t', "that's", 'won’t', 'und', 'che', 'que',
                'μg/kg', 'mgs', "mg's", 'hcl', 'indole',
                'pill', 'pills', 'pipe', 'smoke', 'smokes', 'smoked', 'smoking', 'blotter', 'tab', 'tabs', 'line', 'lines', 'dose', 'doses', 'dosage', 'hit', 'hits', 'bowl',
                'trip', 'trips', 'tripping', 'tripped', 'trippy', 'k.hole', 'k-hole', 'khole',
                'roll', 'rolls', 'rolling', 'rolled',
                'das', '1999', '1/2', 'ten', 'substance', 'load', 'cherek', '5:00', '2001', '300', 'you', 'josh',
            #    'you', 'seconds', 'months', 'days', 'weeks', 'years', 'second', 'month', 'day', 'week', 'year', 'hour', 'hours',

                'powder', 'crystals', 'vaporized', 'vaporize',  'roll', 'rolling', 'rolled', 'nasal', 'bong', 'foil', 'root', 'bark', 'cannabis', 'toke', 'heroin',
                'inject', 'injection', 'trip', 'pill', 'pills', 'injecting', 'insufflation', 'trips', 'tripping', 'tripped', 'trippy', 'pipe', 'rectal',
                'snort', 'smoked', 'snorting', 'snorted', 'insufflated', 'injected', 'blotter', 'tab', 'oral', 'orally', 'weed', 'exstasy',
                # 'body', 'experience', 'time', 'felt', 'feel', 'life', 'been', 'feeling', 'first', 'really', 'load', 'compound', 'effects',
                'hole', 'bump', 'bumps', 'drunk', 'clubbing', 'boyfriend', 'husband', 'wife',
                'syringe', 'needle', 'hospital',
                'vial', 'bag',
                'inject', 'drugs',
                'vials', 'caps', 'bottle', 'robo', 'robitussin', 'syrup', 'vicks', 'coricidin', 'cough', 'freebase', 'compound', 'bottles', 'brand', 'tussin', 'cpm', 'maleate',
                'chlorpheniramine', 'delsym', 'robotussin', 'joe', 'dex', 'dxm', 'prozac', '8oz', 'joint', 'pot',

                'rave', 'raves', 'club', 'night', 'party', 'friend', 'car', '2000', 'boyfriends', 'girlfriend', 'girlfriends', 'rollin',

                'die', 'nicht', 'mit', 'mir', 'darla', 'sich', 'mich', 'ist', 'ein', 'war', 'den',
                'noch', 'een', 'auch', 'dass', 'hatte', 'auf', 'von', 'meine', 'als', 'eine',
                'einen', 'alal', 'sie', 'het', 'dem', 'aus', 'mark', 'aber', 'nach', 'marijuana',
                'des', 'approx', 'wavy', 'john', 'burnt', 'wie', 'chris',
               ]



#my common words
# Every group ends with a comma. Adjacent string literals are concatenated by
# Python, so the missing commas at the group boundaries used to create the
# entries "mostmight", 'lotlittle', "stayamount", "pointchad" and
# 'samanthaplaceholder' in place of the intended words.
my_common_words = [#adverbs
                   'now', 'well', 'first', "second", "third", "whole", "this", "every", "next", "one", "same", "however", "even", "something", "got", "probably", "without", "thereafter", "within", "aprox", "actually", "certain", "somehow", "rather", "least", "whatever", "whatsoever", 'wenn', 'whoever', 'whose', 'whenever', 'whichever', 'really', 'never', 'bit', 'ever', 'told', 'later', 'almost', 'away', 'quite', 'pretty', 'completely', 'always', 'finally', 'extremely', 'yet', 'maybe', 'soon', 'far', 'side', 'else', 'definitely', 'close', 'slightly', 'eventually', 'usually', 'already', 'together', 'ago', 'along', 'alone', 'sometimes', 'immediately', 'simply', 'somewhat', 'totally', 'instead', 'perhaps', 'exactly', 'anyway', 'often', 'especially', 'normally', 'absolutely', 'mostly', 'truly', 'anymore', 'nearly', 'fully', 'fairly', 'incredibly', 'easily', 'barely',
                   'recently', 'somewhere', 'basically', 'literally', 'constantly', 'certainly', 'forward', 'twice', 'store', 'clearly',
                   'possibly', 'particularly', 'everywhere', 'generally', 'apparently', 'entirely', 'corner', 'instantly', 'approximately', 'alot', 'previously', 'perfectly', 'apart', 'obviously', 'relatively', 'occasionally', 'honestly', 'highly', 'roughly', 'directly', 'becomes', 'gradually', 'shortly', 'seriously', 'rapidly', 'indeed', 'hangover', 'ahead', 'otherwise', 'hardly', 'thus', 'aside', 'strongly', 'luckily', 'anywhere', 'nowhere', 'properly', 'smooth', 'intensely', 'greatly', 'chose', 'heavily', 'mainly', 'potentially', 'seemingly', 'loop', 'naturally', 'currently', 'whatsoever', 'therefore', 'merely', 'mildly', 'hopefully', 'initially', 'regularly', 'sometime', 'uncontrollably', 'frequently', 'surprisingly', 'carefully', 'silly', 'strangely', 'significantly', 'thoroughly', 'hallway', 'simultaneously', 'necessarily', 'increasingly',
                   'rarely', 'definately', 'ignore', 'sore', 'lovely', 'lightly', 'violently', 'weather', 'thankfully', 'repeatedly', 'halfway', 'surely', 'specifically', 'differently', 'vaguely', 'amazingly', 'tingly', 'ether', 'desperately', 'settle', 'typically', 'fortunately', 'ultimately', 'essentially', 'practically', 'heavenly', 'distinctly', 'oddly', 'lately', 'suicide', 'quietly', 'terribly', 'strongest', 'nicely', 'partially', 'originally', 'infinitely', 'safely', 'nonetheless', 'overly', 'afterward', 'randomly', 'ugly', 'farther', 'driver', 'altogether', 'purely', 'equally', 'immensely', 'horribly', 'recreationally',  'subside', 'severely', 'supposedly', 'closely', 'noticeably', 'pleasantly', 'vividly', 'lonely', 'correctly', 'consciously', 'trippin', 'promptly', 'comfortably', 'virtually', 'considerably', 'loudly', 'partly',
                   'unusually', 'largely', 'accurately', 'anytime', 'plate', 'profoundly', 'accidentally', 'dramatically', 'profusely', 'meanwhile', 'favourite', 'primarily', 'grind', 'importantly', 'frantically', 'successfully', 'wildly', 'freely', 'insanely', 'moderately', 'interestingly', 'commonly', 'readily', 'smoothly', 'downward', 'unbelievably', 'upside', 'whoever', 'tightly', 'effectively', 'softly', 'overwhelmingly', 'ridiculously', 'silently', 'november', 'drastically', 'reasonably', 'genuinely', 'peanut', 'consistently', 'elsewhere', 'brightly', 'precisely', 'firmly', 'tore', 'ally', 'subtly','remotely', 'sang', 'remarkably', 'similarly', 'upward', 'weekly', 'completly', 'finely', 'smack', 'intently', 'jelly', 'newly', 'belong', 'tremendously', 'responsibly', 'temple', 'exceptionally', 'automatically', 'solely',
                   'uncertain', 'wonderfully', 'poorly', 'actively', 'aimlessly', 'presumably', 'avid', 'extensively', 'truely', 'furthermore', 'anything', 'nothing', 'half', 'part', 'sure', 'done', 'enough', 'sort', 'including', "couple", "most",
                    #modal verbs
                    'might', 'may', 'must', 'shall', 'ought',
                    #cordinating conjunction
                    'either', 'neither', 'plus', 'minus',
                    #Preposition or subordinating conjunction
                    'though', 'since', 'outside', 'love', 'inside', 'although', 'begin', 'behind', 'across', 'except', 'upon', 'throughout', 'onto', 'beyond', 'near', 'whether', 'despite', 'background', 'laugh', 'unlike', 'per', 'awhile', 'super', 'toward',
                    #Adjective, comparative
                    'higher', 'lower', 'smaller', 'larger', 'bigger', 'greater', 'lot',
                    #Neutral adjectives
                    'little', 'last', 'many', 'large', 'small', 'able', 'light', 'hard', 'right', 'left', 'several', 'great', 'lot', "high",
                    #Neutral verbs
                    "let", "decide", 'begin', 'seem', 'seems', 'come', 'want', 'know', 'become', 'sit', 'keep', 'happen', 'turn', 'close', 'give', 'call', 'end', 'go', 'get', 'take', 'try', 'come', 'talk', 'sit', 'walk', 'move', 'use', 'start', 'say', 'star', 'follow', 'following', 'stand', 'know', 'become', 'hold', 'leave', 'hear', 'bring', 'understand', 'giving', 'turn', 'know', 'make', 'find', 'put', 'tell', 'notice', 'do', 'show', "let", "stay",
                    #Neutral nouns
                    "amount", "point",
                    #Names
                    "chad", "dave", "nick", 'jacob', "chalya", "gunner", 'samantha',
                    "placeholder"
                    ]


# Union of all removal lists, stored as a set for fast membership tests
remove_words_set = set().union(
    rare_words,
    stop_words,
    ballentine_common_words,
    my_common_words,
    substances,
    numbers_as_words,
    non_seed_time_words,
)

#iterate through df.text and remove all words in remove_words_set
def remove_words(text, words_to_remove=None, min_length=3):
    """Filter unwanted and very short words out of one tokenized document.

    Parameters
    ----------
    text : iterable of str
        The tokens of one document.
    words_to_remove : container of str, optional
        Tokens to drop. Defaults to the notebook-level ``remove_words_set``.
    min_length : int, optional
        Minimum number of characters a word needs in order to be kept
        (default 3, i.e. words of 1-2 letters are removed).

    Returns
    -------
    list of str
        The remaining tokens in their original order. A token that contains
        spaces is filtered part by part and re-joined with single spaces;
        tokens that end up empty are dropped.
    """
    if words_to_remove is None:
        # Resolved at call time so the default follows the notebook-level set
        words_to_remove = remove_words_set

    cleaned = []
    for token in text:
        if token in words_to_remove:
            continue
        kept = ' '.join(part for part in token.split(' ') if len(part) >= min_length)
        if kept:  # skip tokens that were reduced to the empty string
            cleaned.append(kept)
    return cleaned
    
# Run remove_words on every document and store the result back in the text column
df["text"] = df["text"].apply(remove_words)

In [16]:
# Persist the result of this pre-processing step
df.to_pickle(path="processed_data_3.pkl")