<h1> Pre-Processing 3 (networks only) </h1>

Steps:
- Lemmatization
- Removing different types of words (see below)

In [None]:
#imports
import pandas as pd
import pickle
import re
import nltk
import nltk.data
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from collections import Counter

# NLTK resources used below: the POS tagger behind nltk.pos_tag, the English
# stop-word list, and the WordNet corpus that WordNetLemmatizer reads.
# 'wordnet' was previously missing, so lemmatization failed with a LookupError
# on any machine that had not downloaded it by hand.
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

In [2]:
# Load the DataFrame produced by Pre-processing 2.
# The cells below iterate df.text row by row and treat each entry as a sequence of word tokens.
# NOTE(review): unpickling can execute arbitrary code - only load pickles created by this project.
df = pd.read_pickle("processed_data_2.pkl")

<h4> Time words </h4>

In [3]:
# Seed vocabularies for time-related language.
# NOTE(review): ntp / ftp / stp presumably stand for neutral / fast / slow time perception - confirm.
ntp_words = [
    'time', 'period', 'periods', 'duration', 'clock', 'temporal', 'spacetime',
    'timespan', 'timespans', 'timeline', 'timelines', 'elapse', 'elapsed',
    'length', 'timewise', 'velocity', 'pace', 'rate', 'tempo',
    'pass', 'passing', 'passed',
]
ftp_words = [
    'quick', 'quicker', 'quickly', 'quickest',
    'fast', 'faster', 'fastest', 'fastened',
    'rapid', 'rapidly',
    'short', 'shorter', 'shortly', 'shortest',
    'speedy', 'speedy', 'speeded', 'speedier',
    'hurry', 'hurried',
    'swift', 'swifter', 'swiftly',
    'haste', 'hasty', 'brisk', 'turbo',
    'accelerate', 'acceleration', 'accelerated', 'accelerating',
]
stp_words = [
    'slow', 'slower', 'slowly', 'slows', 'slowed', 'slowest', 'slowing', 'slowdown',
    'long', 'looong', 'longer', 'longer', 'longest',
    'steady',
    'deceleration', 'decelerate', 'decelerating', 'decelerated',
    'dilatory', 'dilation',
    'infinity', 'eternity',
    'lengthy', 'prolonged', 'protracted', 'extended', 'unending', 'endless',
]

# All seed words in one alphabetically ordered list (duplicates are kept as written above)
time_words = [*ntp_words, *ftp_words, *stp_words]
time_words.sort()


<h3> Lemmatization </h3>

- For example: 'months' -> 'month' or 'running' -> 'run'

- Time words are not lemmatized. For example, it's useful to keep 'time' and 'times' separate.

In [4]:
# Flatten the tokens of every document into one corpus-wide list
corpus_list = [word for text in df.text for word in text]

#list of tuples with Erowid vocabulary and NLTK POS tags
#POS = Part of Speech
# The vocabulary is sorted before tagging. nltk.pos_tag tags a token using its
# neighbours as context, and a bare set of strings is iterated in a different
# order on every interpreter run, so tagging the unsorted set gave tags (and
# therefore lemmas) that were not reproducible between runs.
corpus_tagged_tup = nltk.pos_tag(sorted(set(corpus_list)))

#Erowid vocabulary with POS tags as dict: {word: tag}
corpus_tagged = dict(corpus_tagged_tup)


#Translate an NLTK POS tag into the WordNet POS constant the lemmatizer expects
def get_wordnet_pos(treebank_tag):
    """Return the WordNet POS constant matching ``treebank_tag``.

    Tags beginning with 'J', 'V', 'N' or 'R' map to wordnet.ADJ, wordnet.VERB,
    wordnet.NOUN and wordnet.ADV respectively; every other tag (including the
    empty string) yields ''.
    """
    initial = treebank_tag[:1]
    if initial == 'J':
        return wordnet.ADJ
    if initial == 'V':
        return wordnet.VERB
    if initial == 'N':
        return wordnet.NOUN
    if initial == 'R':
        return wordnet.ADV
    return ''

#lemmatizes all adjectives, verbs, nouns, adverbs in df.text (time words are left untouched)
lemmatizer = WordNetLemmatizer()
time_word_set = set(time_words)  # set for constant-time membership tests


def _lemma_or_original(word):
    """Return the lemma of ``word``, or ``word`` unchanged when it must be kept.

    A word is kept as-is when its POS tag is not an adjective/verb/noun/adverb
    tag, or when either the word itself or its lemma is a time word.
    """
    tag = corpus_tagged[word]
    if not tag.startswith(('J', 'V', 'N', 'R')):
        return word
    lemma = lemmatizer.lemmatize(word, pos=get_wordnet_pos(tag))
    # Check the original word as well as the lemma: time words whose lemma is not
    # itself in time_words (e.g. 'speeded' -> 'speed') used to be lemmatized anyway.
    if word in time_word_set or lemma in time_word_set:
        return word
    return lemma


# The tag depends only on the word, so each vocabulary entry is lemmatized once
# instead of once per occurrence.
lemma_by_word = {word: _lemma_or_original(word) for word in corpus_tagged}

# Build new token lists row by row. The previous df.loc[j, "text"][i] = ... used the
# enumerate position j as an index label, which is only correct for a default 0..n-1 index.
df["text"] = df["text"].apply(lambda tokens: [lemma_by_word[word] for word in tokens])

<h3> Remove various types of words </h3>

The following will remove:
- NLTK stop words (see: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/)
- Numbers 1-100 as words
- Names of substances and classes
- Administration of substances
- Rare words (occurring fewer than 10 times in the corpus)
- Some time words (seconds, minutes) - Due to their high frequency in language use, they dominate the pattern.
- Unimportant common words in the Erowid corpus identified by Ballentine (2022)
- Unimportant common words identified by myself
- Words containing only 1 or 2 letters

<h6> Ballentine, G., Friedman, S. F., & Bzdok, D. (2022). Trips and neurotransmitters: Discovering principled patterns across 6850 hallucinogenic experiences. In Sci Adv (Vol. 8, Issue 11, p. eabl6989). https://doi.org/10.1126/sciadv.abl6989 </h6>


In [None]:
#stop words
# NLTK's English stop-word list; needs the 'stopwords' resource downloaded in the imports cell.
stop_words = stopwords.words('english')

#numbers as words (1-100), generated instead of typed out by hand.
# Both "forty" and the common misspelling "fourty" are covered. The hand-written
# list repeated 'forty-two' inside the "fourty-" run, so 'fourty-two' was missing.
_number_units = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine"]
_number_teens = ["ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen",
                 "seventeen", "eighteen", "nineteen"]
_number_tens = ["twenty", "thirty", "forty", "fourty", "fifty", "sixty", "seventy", "eighty", "ninety"]

numbers_as_words = list(_number_units) + list(_number_teens)
for _tens in _number_tens:
    # e.g. "twenty", then "twenty-one" ... "twenty-nine"
    numbers_as_words.append(_tens)
    numbers_as_words.extend(_tens + "-" + _unit for _unit in _number_units)
numbers_as_words += ["one hundred", "hundred"]



#substances and classes
# Fix: a comma was missing between 'zyprexa' and "drug", so Python's implicit
# string-literal concatenation produced the single entry 'zyprexadrug' and the
# word "drug" was never filtered out.
substances = [
    ' alcohol - hard', ' h.b. woodrose', ' morning glory', ' venlafaxine', ' zolpidem (ambien)', "'mushrooms'",
    '1,4 butanediol', '1,4-butaneidol', '1-4-butanediol', '1p-lsd', '2-aminoindan', '25i-nbome',
    '2c-b', '2c-c', '2c-d', '2c-e', '2c-i', '2c-p', '2c-t-2', '2c-t-21', '2c-t-4', '2c-t-7', '3-meo-pcp',
    '4-Acetoxy-DET', '4-acetoxy-dipt', '4-acetoxy-mipt', '4-aco-det', '4-aco-dmt', '4-fluoroamphetamine',
    '4-ho-dipt', '4-ho-mipt', '4-methylmethcathinone', '4-methylmethcathinone', '4-methylmethcathinone (mephedrone)',
    '5-htp', '5-meo-amt', '5-meo-dalt', '5-meo-dipt', '5-meo-dmt', '5-meo-mipt', '6-apb',
    'absinthe', 'absinthe  (homemade)', 'absinthe (czech)', 'absinthe (homemade)', 'acetaminophen', 'acorus calamus',
    'adderall', 'adrafinil', 'al-lad', 'alcohol', 'alcohol (beer-wine)', 'alcohol (hard)', 'alcohol (mead)',
    'alcohol (mead)', 'alcohol (rum)', 'alcohol (vodka)', 'alcohol (whiskey)', 'alcohol - (liquor)',
    'alcohol - (wine)', 'alcohol - (wine)', 'alcohol - (wine)', 'alcohol - (wine)', 'alcohol - beer',
    'alcohol - hard', 'alcohol - wine', 'alprazolam', 'amanita muscaria', 'ambien', 'amitriptyline',
    'amphetamine', 'amphetamines', 'amt', 'anadenanthera colubrina', 'anadenanthera peregrina', 'anita',
    'antidepressant', 'antipsychotics', 'argyreia nervosa', 'atomoxetine', 'atropa belladona', 'ayahuasca',
    'barbiturates', 'beer', 'belladonna', 'benadryl', 'benzodiazepine', 'benzodiazepine', 'benzos',
    'benzylpiperazine', 'betel nut', 'blue lotus', 'bromo-dragonfly', 'brugmansia', 'brugmansia (tree datura)',
    'brugmansia sanguinea', 'brugmansia spp.', 'brugmansia suaveolens', 'bupe', 'buprenorphine', 'bupropion',
    'bupropion (wellbutrin)', 'butylone', 'cacti - t. pachanoi', 'cacti - t. peruvianus',
    'cactus', 'caffeine', 'caffeine (coffee)', 'caffeine (coffee)', 'caffeine (coffee)', 'caffeine (coffee)',
    'caffeine (coffee)', 'caffiene', 'calamus', 'calea', 'calea zacatechichi', 'cannabinoid receptor agonists',
    'cannabis', 'cannabis - hash', 'cannabis spp.', 'carisoprodol', 'catnip', 'changa', 'charcoal', 'cigarette',
    'clonazepam', 'coca', 'cocaine', 'codeine', 'coffea spp.', 'coffee', 'coke', 'cokedpsychedelic',
    'crack', 'crack', 'cracker', 'cracker', 'crystal', 'cyclobenzaprine', 'damiana', 'datura', 'datura spp.',
    'deliriant', 'delirium', 'depressant', 'det', 'detox', 'dextroamphetamine', 'diazepam', 'dilaudid',
    'dimenhydrinate', 'dimenhydrinate (dramamine)', 'diphenhydramine', 'diphenhydramine (benadryl)', 'dipt',
    'divinorum', 'dmae', 'dmt', 'dmt (extracted from m. tenuiflora)', 'dmt (extracted from m. tenuiflora)',
    'dob', 'dob-dragonfly', 'doc', 'doi', 'dom', 'dope', 'dpt', 'dramamine', 'dramamine (dimenhydrinate)',
    'dramamine (dimenhydrinate)', 'dxm', 'echinopsis pachanoi', 'echinopsis peruviana', 'ecstacy', 'ecstasy',
    'effexor', 'entactogen', 'entactogens', 'ephedrine', 'ether', 'ethylone', 'ethylphenidate', 'etizolam',
    'fentanyl', 'foxy', 'gabapentin', 'gabapentin (neurontin)', 'gbl', 'glory', 'gravol', 'h. b. woodrose',
    'h.b. woodrose', 'h.b. woodrose (hbw)', 'h.b. woodrose seeds',
    'halcion', 'hash', 'hawaiian baby woodrose seeds', 'hawaiian baby woodrose seeds',
    'hawaiian baby woodrose seeds', 'heimia salicifolia', 'heroin', 'heroine', 'hydro', 'hydrocodone',
    'hydromorphone', 'hypericum perforatum', 'iap', 'iboga', 'ibogaine', 'ilex paraguariensis', 'jwh', 'jwh-018',
    'kanna', 'kava', 'kava kava', 'ketamine', 'klonopin', 'kratom', 'lactuca virosa', 'leonotis leonurus',
    'lophophora williamsii', 'lorazepam', 'lsd', 'lsd', 'magic mushrooms', 'magic mushrooms (sclerotia)',
    'mbdb', 'mda', 'mdai', 'mdma', 'mdma (ecstasy)', 'mdpv', 'mdxx', 'medicationbooster', 'medicine',
    'melatonin', 'meperidine', 'mephedrone', 'mescaline', 'metamphetamine', 'meth', 'methadone',
    'methamphetamine', 'methoxetamine', 'methylone', 'methylphenidate', 'mimosa tenuiflora', 'mimosahuasca',
    'mipt', 'mirtazapine', 'mitragyna speciosa', 'modafinil', 'modafinil (provigil)', 'molly', 'morning glory',
    'morning glorys', 'morphine', 'mushroom', 'mushroom - p. cubensis', 'mushrooms', 'mushrooms  - p. cubensis',
    "mushrooms ('gold caps')", 'mushrooms (blue vein)', 'mushrooms (dried)', 'mushrooms (edible)',
    'mushrooms (extract)', 'mushrooms (fresh)', 'mushrooms (golden caps)', 'mushrooms (hawaiian)',
    'mushrooms (in chocolate)', 'mushrooms (magic mushrooms)', 'mushrooms (magic)', 'mushrooms (mexican)',
    'mushrooms (p. cubensis - mexican strain)', 'mushrooms (p. cubensis smoked)', 'mushrooms (p. cubensis)',
    'mushrooms (p. cyanescens)', 'mushrooms (p. pelliculosa)', 'mushrooms (sclerotia)', 'mushrooms (smoked)',
    'mushrooms (thai)', 'mushrooms - c. cyanescens', 'mushrooms - p cubensis', 'mushrooms - p. arcana',
    'mushrooms - p. atlantis (sclerotia)', 'mushrooms - p. azurescens', 'mushrooms - p. azurescens?',
    'mushrooms - p. baeocystis', 'mushrooms - p. cubenesis', 'mushrooms - p. cubenesis (amazonian strain)',
    'mushrooms - p. cubensis', "mushrooms - p. cubensis ('cambodian')",
    'mushrooms - p. cubensis (albino penis envy)', 'mushrooms - p. cubensis (amazon strain)',
    'mushrooms - p. cubensis (cambodian)', 'mushrooms - p. cubensis (dried)',
    'mushrooms - p. cubensis (ecuadorean)', 'mushrooms - p. cubensis (extract)',
    'mushrooms - p. cubensis (fresh)', 'mushrooms - p. cubensis (golden teacher)',
    'mushrooms - p. cubensis (golden teachers)', 'mushrooms - p. cubensis (in chocolate)',
    'mushrooms - p. cubensis (mexican)', 'mushrooms - p. cubensis (mycelium)',
    'mushrooms - p. cubensis (penis envy)', 'mushrooms - p. cubensis (pes amazonia)',
    'mushrooms - p. cubensis (sclerotia)', 'mushrooms - p. cubensis (smoked)',
    'mushrooms - p. cubensis (tasmanian strain)', 'mushrooms - p. cyanescens',
    'mushrooms - p. cyanescens (smoked)', 'mushrooms - p. galindoi', 'mushrooms - p. mexicana',
    'mushrooms - p. mexicana (fresh)', 'mushrooms - p. mexicana (sclerotia)',
    'mushrooms - p. mexicana (truffles)', 'mushrooms - p. ovoideocystidiata', 'mushrooms - p. semilanceata',
    'mushrooms - p. subaeruginosa', 'mushrooms - p. subbalteatus or p. papilionaceus',
    'mushrooms - p. tampanensis', 'mushrooms - p. tampanensis (sclerotia)',
    'mushrooms - p. tampanensis (truffles)', 'mushrooms - p. weilii', 'mushrooms - p. zapotecorum',
    'mushrooms - panaeolus cyanescens', 'mushrooms - panaeolus subbalteatus',
    'mushrooms - psilocybe cyanescens', 'mushrooms- p. cubensis',
    'mushroooms', 'myristica spp.', 'narcotic', 'nepeta cataria', 'neurontin', 'nicotiana tabacum', 'nicotine',
    'nitrous', 'nitrous oxide', 'nutmeg', 'nymphaea caerulea', 'nystagmus', 'oevs', 'olanzapine', 'oneirogens',
    'opiate', 'opioid', 'opioids', 'opium', 'oxide', 'oxy', 'oxycodone', 'oxycodone (oxycontin)', 'oxycontin',
    'oxys', 'papaver somniferum', 'paracetamol', 'paroxetine', 'paroxetine (paxil)', 'passiflora spp.',
    'passion flower', 'paxil', 'pcp', 'percocet', 'peyote', 'piper methysticum', 'piperazines',
    'piperazines - bzp', 'piperazines - mcpp', 'piperazines - mcpp', 'piperazines - mcpp', 'piracetam',
    'plunger', 'poppies', 'poppies', 'poppies - opium', 'prescription', 'propylhexedrine', 'provigil',
    'psilocybin', 'psilocybin mushrooms', 'psychedelics', 'psychotria viridis', 'quetiapine',
    'quetiapine (seroquel)', 'resin', 'ritalin', 'roxicodone', 'sage', 'salvia', 'salvia divinorum',
    'salvinorin', 'sceletium tortuosum', 'scopolamine', 'sedative', 'seroquel', 'serotonin', 'sertraline',
    'shang', 'shrooms', 'silene capensis', 'silene undulata', 'snri', 'speed', 'ssri', "st. john's wort",
    "st. john's wort", 'stimulant', 'substituted piperazines', 'synthetic cannabis', 'tablet', 'tapered',
    'tfmpp', 'thc', 'tma-2', 'tobacco', 'tobacco - cigarettes', 'tramadol', 'tramadol (ultram)', 'trazadone',
    'triazolam', 'triazolam (halcion)', 'tryptamines', 'turkey', 'turnera diffusa', 'ultram', 'valerian',
    'valeriana officinalis', 'valium', 'vapor', 'venlafaxine', 'venlafaxine (effexor)', 'vicodin', 'vyvanse',
    'wellbutrin', 'wine', 'woodrose', 'xanax', 'yadda', 'yerba mate', 'yohimbe', 'zacatechichi', 'zolpidem',
    'zolpidem (ambien)', 'zopiclone', 'zyprexa',
    "drug", "acid", "zyprexa", "tryptamine", "psychedelic", "zyprexa", "tryptamine", "vaporiser", "redose",
    "peter", "psychedelic", "stramonium", "shannon", "atropine", "deliriants", "nightshade", "sceletium",
    "suboxone", "subutex", "zyprexa", "citalopram", "zoloft", "fluoxetine", "benzo", "cbd", "lotus", "rayanne",
    "opiates", "hashish", "alcoholic", "analgesia", "vodka", "apnea", "analgesic", "tylenol", "cigar",
    "antihistamine", "benedryl", "vaped", "trazodone", "tweaker", "mah", "desoxyn", "crank", "booster", "meph",
    "canker", "entactogenic", "xtc", "albert", "antihistamine", "butane", "sinicuichi", "ferris", "cory",
    "bromo", "bodyload", "tryptamine", "phenethylamines", "zeta", "glauca", "stims", "vivarin", "gourd",
    "yerba", "caffine", "cigs", "modalert", "cigar", "nutmegs", "tachycardia", "adderal", "bzp", "espresso",
    "hookah", "piperazine", "phentermine", "nut", "ciggarette", "betel",
]

              


# Words describing how (or in what form) a substance is taken
# NOTE(review): 'poundredose' looks like two words run together - confirm the intended entry.
administeration_type = [
    'taper', 'flower', 'seed', 'blunt', 'brownie', 'shots', 'poppy',
    'munchies', 'puff', 'vein', 'spliff', 'bowls', 'sip',
    'tincture', 'tablet', 'crush', 'overdose', 'cigarettes', 'canister',
    'cartridge', 'balloon', 'syph', 'jar', 'poundredose',
    'herb', 'sip', 'waterpipe', 'tablet', "pod", "capsule",
]

#rare words
# Count tokens as they are in df.text right now, i.e. after lemmatization.
# The previous version counted corpus_list, which was built before lemmatization:
# a lemma whose own surface form was rare got removed even when its inflected
# forms were frequent, and lemmas that never occurred verbatim were never counted.
RARE_WORD_MIN_COUNT = 10  # words occurring fewer times than this are dropped
counter = Counter(word for text in df.text for word in text)
rare_words = [word for word, count in counter.items() if count < RARE_WORD_MIN_COUNT]

#Time words that are removed rather than used as seeds:
# each unit of time in singular and plural form, followed by two extra words
_time_units = ("second", "minute", "hour", "day", "week", "weekend", "month", "year")
non_seed_time_words = []
for _unit in _time_units:
    non_seed_time_words.append(_unit)
    non_seed_time_words.append(_unit + "s")
non_seed_time_words.extend(["times", "spent"])


#words from Ballentine (2022)
# Fix: a comma was missing between 'heroin' and 'inject', so implicit
# string-literal concatenation produced the single entry 'heroininject'.
ballentine_common_words = ['the', 'this', 'that', 'not', 'and', 'have', 'there', 'all', 'then', 'what', 'but', 'would', 'for', 'with', 'will', 'was', 'thing',
                'get', 'could', 'from', 'more', 'etc', 'who', 'out', 'another', 'like', 'too', 'while', 'about', 'more', 'less', 'way', 'on',
                'she', 'her', 'him', 'his', 'our', "i'm", 'i’m', 'are', 'can’t', "i'd", 'i’d',  'ich', 'der', 'das',
                "didn't", "don't", "dont", "i've", "it's", "wasn’t", "can't", "wouldn't", "couldn't", "couldn´t", "won't", "i'll",
                'them', 'were', 'they', 'through', 'back', 'being', 'only', 'also',
                'went', 'some', 'again', 'into', 'after', 'around', 'down', 'just', 'very', 'things', 'when', 'over', 'other', 'before',
                'because', 'which', 'took', 'than', 'before', 'still', 'didn’t',
                'it’s', 'i’ve', 'didnt', 'didn´t', 'couldnt', 'couldn’t', 'their',
                'don’t', "that's", 'won’t', 'und', 'che', 'que',
                'μg/kg', 'mgs', "mg's", 'hcl', 'indole',
                'pill', 'pills', 'pipe', 'smoke', 'smokes', 'smoked', 'smoking', 'blotter', 'tab', 'tabs', 'line', 'lines', 'dose', 'doses', 'dosage', 'hit', 'hits', 'bowl',
                'trip', 'trips', 'tripping', 'tripped', 'trippy', 'k.hole', 'k-hole', 'khole',
                'roll', 'rolls', 'rolling', 'rolled',
                'das', '1999', '1/2', 'ten', 'substance', 'load', 'cherek', '5:00', '2001', '300', 'you', 'josh',
                'you', 'seconds', 'months', 'days', 'weeks', 'years', 'second', 'month', 'day', 'week', 'year', 'hour', 'hours',

                'powder', 'crystals', 'vaporized', 'vaporize',  'roll', 'rolling', 'rolled', 'nasal', 'bong', 'foil', 'root', 'bark', 'cannabis', 'toke', 'heroin',
                'inject', 'injection', 'trip', 'pill', 'pills', 'injecting', 'insufflation', 'trips', 'tripping', 'tripped', 'trippy', 'pipe', 'rectal',
                'snort', 'smoked', 'snorting', 'snorted', 'insufflated', 'injected', 'blotter', 'tab', 'oral', 'orally', 'weed', 'exstasy',
                # 'body', 'experience', 'time', 'felt', 'feel', 'life', 'been', 'feeling', 'first', 'really', 'load', 'compound', 'effects',
                'bump', 'bumps', 'drunk', 'clubbing', 'boyfriend', 'husband', 'wife', #'hole' - keep hole for black hole
                'syringe', 'needle', 'hospital',
                'vial', 'bag',
                'inject', 'drugs',
                'vials', 'caps', 'bottle', 'robo', 'robitussin', 'syrup', 'vicks', 'coricidin', 'cough', 'freebase', 'compound', 'bottles', 'brand', 'tussin', 'cpm', 'maleate',
                'chlorpheniramine', 'delsym', 'robotussin', 'joe', 'dex', 'dxm', 'prozac', '8oz', 'joint', 'pot',

                'rave', 'raves', 'club', 'night', 'party', 'friend', 'car', '2000', 'boyfriends', 'girlfriend', 'girlfriends', 'rollin',

                'nicht', 'mit', 'mir', 'darla', 'sich', 'mich', 'ist', 'ein', 'war', 'den', #die - both German pronoun and verb for death
                'noch', 'een', 'auch', 'dass', 'hatte', 'auf', 'von', 'meine', 'als', 'eine',
                'einen', 'alal', 'sie', 'het', 'dem', 'aus', 'mark', 'aber', 'nach', 'marijuana',
                'des', 'approx', 'wavy', 'john', 'burnt', 'wie', 'chris'


               ]



#my common words
# Fix: a comma was missing after "most" (last entry of the adverb group), so
# implicit string-literal concatenation merged it with 'might' into 'mostmight'
# and the modal verb 'might' was never filtered out.
my_common_words = [#adverbs
                   'now', 'well', 'first', "second", "third", "whole", "this", "every", "next", "one", "same", "however", "even", "something", "got", "probably", "without", "thereafter", "within", "aprox", "actually", "certain", "somehow", "rather", "least", "whatever", "whatsoever", 'wenn', 'whoever', 'whose', 'whenever', 'whichever', 'really', 'never', 'bit', 'ever', 'told', 'later', 'almost', 'away', 'quite', 'pretty', 'completely', 'always', 'finally', 'extremely', 'yet', 'maybe', 'soon', 'far', 'side', 'else', 'definitely', 'close', 'slightly', 'eventually', 'usually', 'already', 'together', 'ago', 'along', 'alone', 'sometimes', 'immediately', 'simply', 'somewhat', 'totally', 'instead', 'perhaps', 'exactly', 'anyway', 'often', 'especially', 'normally', 'absolutely', 'mostly', 'truly', 'anymore', 'nearly', 'fully', 'fairly', 'incredibly', 'easily', 'barely',
                   'recently', 'somewhere', 'basically', 'literally', 'constantly', 'certainly', 'forward', 'twice', 'store', 'clearly',
                   'possibly', 'particularly', 'everywhere', 'generally', 'apparently', 'entirely', 'corner', 'instantly', 'approximately', 'alot', 'previously', 'perfectly', 'apart', 'obviously', 'relatively', 'occasionally', 'honestly', 'highly', 'roughly', 'directly', 'becomes', 'gradually', 'shortly', 'seriously', 'rapidly', 'indeed', 'hangover', 'ahead', 'otherwise', 'hardly', 'thus', 'aside', 'strongly', 'luckily', 'anywhere', 'nowhere', 'properly', 'smooth', 'intensely', 'greatly', 'chose', 'heavily', 'mainly', 'potentially', 'seemingly', 'loop', 'naturally', 'currently', 'whatsoever', 'therefore', 'merely', 'mildly', 'hopefully', 'initially', 'regularly', 'sometime', 'uncontrollably', 'frequently', 'surprisingly', 'carefully', 'silly', 'strangely', 'significantly', 'thoroughly', 'hallway', 'simultaneously', 'necessarily', 'increasingly',
                   'rarely', 'definately', 'ignore', 'sore', 'lovely', 'lightly', 'violently', 'weather', 'thankfully', 'repeatedly', 'halfway', 'surely', 'specifically', 'differently', 'vaguely', 'amazingly', 'tingly', 'ether', 'desperately', 'settle', 'typically', 'fortunately', 'ultimately', 'essentially', 'practically', 'heavenly', 'distinctly', 'oddly', 'lately', 'suicide', 'quietly', 'terribly', 'strongest', 'nicely', 'partially', 'originally', 'infinitely', 'safely', 'nonetheless', 'overly', 'afterward', 'randomly', 'ugly', 'farther', 'driver', 'altogether', 'purely', 'equally', 'immensely', 'horribly', 'recreationally',  'subside', 'severely', 'supposedly', 'closely', 'noticeably', 'pleasantly', 'vividly', 'lonely', 'correctly', 'consciously', 'trippin', 'promptly', 'comfortably', 'virtually', 'considerably', 'loudly', 'partly',
                   'unusually', 'largely', 'accurately', 'anytime', 'plate', 'profoundly', 'accidentally', 'dramatically', 'profusely', 'meanwhile', 'favourite', 'primarily', 'grind', 'importantly', 'frantically', 'successfully', 'wildly', 'freely', 'insanely', 'moderately', 'interestingly', 'commonly', 'readily', 'smoothly', 'downward', 'unbelievably', 'upside', 'whoever', 'tightly', 'effectively', 'softly', 'overwhelmingly', 'ridiculously', 'silently', 'november', 'drastically', 'reasonably', 'genuinely', 'peanut', 'consistently', 'elsewhere', 'brightly', 'precisely', 'firmly', 'tore', 'ally', 'subtly','remotely', 'sang', 'remarkably', 'similarly', 'upward', 'weekly', 'completly', 'finely', 'smack', 'intently', 'jelly', 'newly', 'belong', 'tremendously', 'responsibly', 'temple', 'exceptionally', 'automatically', 'solely',
                   'uncertain', 'wonderfully', 'poorly', 'actively', 'aimlessly', 'presumably', 'avid', 'extensively', 'truely', 'furthermore', 'anything', 'nothing', 'half', 'part', 'sure', 'done', 'enough', 'sort', 'including', "couple", "most",
                    #modal verbs
                    'might', 'may', 'must', 'shall', 'ought',
                    #cordinating conjunction
                    'either', 'neither', 'plus', 'minus',
                    #Preposition or subordinating conjunction
                    'though', 'since', 'outside', 'love', 'inside', 'although', 'begin', 'behind', 'across', 'except', 'upon', 'throughout', 'onto', 'beyond', 'near', 'whether', 'despite', 'background', 'laugh', 'unlike', 'per', 'awhile', 'super', 'toward',
                    #Adjective, comparative
                    'higher', 'lower', 'smaller', 'larger', 'bigger', 'greater', 'lot',
                    #Neutral adjectives
                    'little', 'last', 'many', 'large', 'small', 'able', 'light', 'hard', 'right', 'left', 'several', 'great', 'lot', "high",
                    #Neutral verbs
                    "let", "decide", 'begin', 'seem', 'seems', 'come', 'want', 'know', 'become', 'sit', 'keep', 'happen', 'turn', 'close', 'give', 'call', 'end', 'go', 'get', 'take', 'try', 'come', 'talk', 'sit', 'walk', 'move', 'use', 'start', 'say', 'star', 'follow', 'following', 'stand', 'know', 'become', 'hold', 'leave', 'hear', 'bring', 'understand', 'giving', 'turn', 'know', 'make', 'find', 'put', 'tell', 'notice', 'do', 'show', "let", "stay",
                    #placeholders
                    "PLACEHOLDER", "PERSON", "ORG", "GPE", "LOC", "DATE"
                    ]


# Union of every exclusion list defined in this cell; stored as a set so the
# membership test in remove_words is a hash lookup rather than a list scan.
remove_words_set = set(stop_words + numbers_as_words + substances + administeration_type + rare_words + non_seed_time_words + ballentine_common_words +  my_common_words)

#remove all words in remove_words_set, plus 1-2 letter words, from one document
def remove_words(text, words_to_remove=None):
    """Return the tokens of ``text`` that survive the word filters.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document. A token may itself contain several
        space-separated words.
    words_to_remove : collection of str, optional
        Tokens to drop. Defaults to the module-level ``remove_words_set``.

    Returns
    -------
    list of str
        Tokens not in ``words_to_remove``, with every space-separated part
        shorter than 3 characters stripped; tokens left empty are dropped.
    """
    if words_to_remove is None:
        words_to_remove = remove_words_set

    kept_tokens = []
    for token in text:
        if token in words_to_remove:
            continue
        #remove words with less than 3 letters
        trimmed = ' '.join(part for part in token.split(' ') if len(part) >= 3)
        # Skip now-empty tokens here instead of repeatedly scanning the list
        # with `while "" in text: text.remove("")`, which was quadratic.
        if trimmed:
            kept_tokens.append(trimmed)
    return kept_tokens
    
# Filter every document's token list with remove_words
df["text"] = df["text"].apply(remove_words)

In [6]:
#save
# Write the lemmatized and filtered DataFrame to a new pickle file.
df.to_pickle("processed_data_3.pkl")