# Must haves:
1. **"combined_data_no_duplis.csv"**. I sent it in the WhatsApp chat.
2. **"processor.py"**. Import this module. It was uploaded to Git by Wei Jie. I used it to expand contractions and remove accented characters.
3. **"stanfordcorenlp"**. Please run `pip install stanfordcorenlp`.
4. The folder **stanford-corenlp-full-2018-10-05**. Download **stanford-corenlp-full-2018-10-05.zip** and extract the whole folder to somewhere on your computer. Go to https://stanfordnlp.github.io/CoreNLP/download.html and click "Download CoreNLP 3.9.2"
5. Other packages that y'all definitely have already

In [1]:
import spacy
import re
import numpy as np
import pandas as pd
import nltk
import time

from lib.processor import * # Import "processor.py" based on its location in your computer.

In [2]:
# Load the dataset into a DataFrame; later cells read its `text` column.
# NOTE(review): the notes at the top of this notebook name the file
# "combined_data_no_duplis.csv" -- confirm "dataset.csv" is the same data.
data = pd.read_csv("dataset.csv") # Import dataset based on its location in your computer.

In [3]:
# Show (rows, columns) of the loaded dataset as a quick sanity check.
data.shape

(15679, 2)

# Splitting the dataset to run!
1. Jingyang: **[:3500]**
2. Guanhao: **[3500:7000]**
3. Joel: **[7000:10500]**
4. Weijie: **[10500:]**

**"Un-comment" the relevant line in the next cell!!**

In [4]:
# data = data.iloc[:3500,]
# data = data.iloc[3500:7000,]
# data = data.iloc[7000:10500,]
# data = data.iloc[10500:,]

In [5]:
import re

# Regex building blocks for split_into_sentences. Raw strings keep regex
# escapes such as \s from being read as (invalid) Python string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov)"


def split_into_sentences(text):
    """Split `text` into sentences with a rule-based heuristic.

    Periods that do not end a sentence (titles such as "Dr.", single-letter
    initials, acronyms, company suffixes, website endings, "Ph.D.") are
    temporarily replaced by a "<prd>" marker, every remaining ".", "?" and "!"
    is followed by a "<stop>" marker, and the text is split on "<stop>".

    Parameters
    ----------
    text : str
        The text to split. Newlines are treated as spaces.

    Returns
    -------
    numpy.ndarray
        The stripped, non-empty sentences in order. Text after the last
        terminator (or the whole text when it has no terminator at all) is
        returned as a final sentence rather than being dropped.
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    if "Ph.D" in text:
        text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    # An acronym or suffix directly followed by a sentence starter does end
    # the sentence, so an explicit <stop> is inserted there.
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move terminators outside closing quotes so the quote stays with its sentence.
    if "”" in text:
        text = text.replace(".”", "”.")
    if "\"" in text:
        text = text.replace(".\"", "\".")
    if "!" in text:
        text = text.replace("!\"", "\"!")
    if "?" in text:
        text = text.replace("?\"", "\"?")

    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Drop only empty fragments. Unconditionally discarding the last piece
    # would lose the final sentence whenever the text lacks a terminator.
    pieces = (piece.strip() for piece in text.split("<stop>"))
    return np.array([piece for piece in pieces if piece])

In [6]:
# Count the sentences in each row's text, then add two zero-initialised
# counter columns that a later cell fills in.
sentence_lists = data.text.apply(split_into_sentences)
data['total_sentences'] = sentence_lists.apply(len)
for counter_column in ('num_sentences_with_adj_in_phrase', 'num_sentences_with_adv_in_phrase'):
    data[counter_column] = 0

## Change path below according to your desktop!

In [23]:
import os  # not imported by the earlier cells of this notebook

# List the extracted CoreNLP folder to confirm the path is correct.
os.listdir('../../../Downloads/stanford-corenlp-full-2018-10-05')

['stanford-corenlp-full-2018-10-05']

In [24]:
from stanfordcorenlp import StanfordCoreNLP
from nltk import Tree

# CoreNLP wrapper pointed at the local distribution folder; later cells call
# `nlp2.parse(sentence)` and feed the result to `Tree.fromstring`.
# Adjust the path to wherever the folder was extracted on your machine.
nlp2 = StanfordCoreNLP('../../../Downloads/stanford-corenlp-full-2018-10-05/stanford-corenlp-full-2018-10-05')

## Method 1: Traditional

Function below to check presence of **Adjectives** and **Adverbs** respectively, in **NP/VP**.

In [25]:
def adj(x):
    """Return True if production string `x` puts an adjective inside an NP/VP.

    `x` is expected to be the string form of a parse-tree production as passed
    by the loop in this notebook (presumably "LHS -> RHS1 RHS2 ..." -- confirm
    against nltk). It is first normalised with `remove_accented_chars` and
    `expand_contractions`.

    The production qualifies when it
      * contains an adjective tag (JJ, JJR or JJS),
      * contains NP or VP, and
      * contains none of ADVP, WHADVP, WHAVP, WHNP.

    Symbols are compared as whole whitespace-separated tokens. Plain substring
    tests would also fire on unrelated tags (e.g. "NP" inside "NNP").

    Returns
    -------
    bool
        Always an explicit True/False (never None).
    """
    x = expand_contractions(remove_accented_chars(x))
    symbols = set(x.split())

    adjective_tags = {"JJ", "JJR", "JJS"}
    phrase_tags = {"NP", "VP"}
    # "WHAVP" is carried over from the original exclusion list; "WHADVP" was
    # previously excluded implicitly because it contains the substring "ADVP".
    excluded_tags = {"ADVP", "WHADVP", "WHAVP", "WHNP"}

    if not (symbols & adjective_tags):
        return False
    if not (symbols & phrase_tags):
        return False
    return not (symbols & excluded_tags)

In [26]:
def adv(x):
    """Return True if production string `x` puts an adverb inside an NP/VP.

    `x` is expected to be the string form of a parse-tree production as passed
    by the loop in this notebook (presumably "LHS -> RHS1 RHS2 ..." -- confirm
    against nltk). It is first normalised with `remove_accented_chars` and
    `expand_contractions`.

    The production qualifies when it
      * contains an adverb tag (RB, RBR or RBS),
      * contains NP or VP, and
      * contains none of ADVP, WHADVP, WHAVP, WHNP.

    Symbols are compared as whole whitespace-separated tokens. A plain
    substring test for "RB" also matches the bracket tags "-LRB-" and "-RRB-",
    which would count any parenthesised noun phrase as containing an adverb.

    Returns
    -------
    bool
        Always an explicit True/False (never None).
    """
    x = expand_contractions(remove_accented_chars(x))
    symbols = set(x.split())

    adverb_tags = {"RB", "RBR", "RBS"}
    phrase_tags = {"NP", "VP"}
    # "WHAVP" is carried over from the original exclusion list; "WHADVP" was
    # previously excluded implicitly because it contains the substring "ADVP".
    excluded_tags = {"ADVP", "WHADVP", "WHAVP", "WHNP"}

    if not (symbols & adverb_tags):
        return False
    if not (symbols & phrase_tags):
        return False
    return not (symbols & excluded_tags)

Loop to add the results into the dataframe.

In [None]:
# For every row, parse each sentence of its text with CoreNLP and count how
# many sentences contain at least one production accepted by adj() / adv().
start = time.time()

# Look the columns up by name instead of hard-coded positions (0, 3, 4), so
# the loop still reads and writes the right cells if the CSV has extra columns.
text_col = data.columns.get_loc("text")
adj_col = data.columns.get_loc("num_sentences_with_adj_in_phrase")
adv_col = data.columns.get_loc("num_sentences_with_adv_in_phrase")

# (row position, error) for every row that could not be processed; those rows
# keep their initial counter values.
failed_rows = []

for j in range(len(data)):
    adjcounter = 0
    advcounter = 0
    try:
        for k in split_into_sentences(data.iloc[j, text_col]):
            temptree = Tree.fromstring(nlp2.parse(k))
            productions = [str(p) for p in temptree.productions()]
            # A sentence is counted once, however many productions match.
            if any(adj(p) for p in productions):
                adjcounter += 1
            if any(adv(p) for p in productions):
                advcounter += 1
    except Exception as err:
        # Best effort: skip the row but remember why. Catching Exception
        # (not a bare except) lets Ctrl-C / kernel interrupt stop the loop.
        failed_rows.append((j, repr(err)))
        continue

    data.iloc[j, adj_col] = adjcounter
    data.iloc[j, adv_col] = advcounter

if failed_rows:
    print(len(failed_rows), "row(s) could not be processed; inspect failed_rows")

end = time.time()
end-start

# Save your table as e.g. "data_jingyang.csv"

**"Un-comment" the relevant line in the next cell!!**

In [None]:
# Close the CoreNLP wrapper when parsing is finished; according to the
# package's website, leaving it open wastes backend memory.
nlp2.close()
# Un-comment the line with your own name to save your slice of the results.
# data.to_csv("data_jingyang.csv")
# data.to_csv("data_guanhao.csv")
# data.to_csv("data_joel.csv")
# data.to_csv("data_weijie.csv")

<h1><center>Ignore below!!</center></h1>

## Method 2: No loops but slow

In [None]:
def func2(x):
    """Turn adj()'s verdict on str(x) into an integer flag: 1 if truthy, else 0."""
    return 1 if adj(str(x)) else 0


# Element-wise version of func2, applied to arrays of productions.
vectorize_output = np.vectorize(func2)

def funcon1(x):
    """Return 1 if any production in the parse of sentence `x` is flagged by func2, else 0."""
    parse_tree = Tree.fromstring(nlp2.parse(x))
    productions = np.asarray(parse_tree.productions())
    flagged = vectorize_output(productions) >= 1
    return 1 if sum(flagged) else 0
        
# Split every row's text into sentences first: both calls below read
# sentences_array, so it must exist before they run.
sentences_array = data.text.apply(split_into_sentences)

vectorized_funcon1 = np.vectorize(funcon1)
vectorized_eval = np.vectorize(lambda x: np.sum(vectorized_funcon1(x)))

# Number of flagged sentences in the first row. Positional .iloc is used
# because the index labels no longer start at 0 once `data` has been sliced.
np.sum(vectorized_funcon1(sentences_array.iloc[0]))

# Time the vectorised evaluation on the first ten rows.
start = time.time()
print(vectorized_eval(sentences_array.iloc[0:10]))

end = time.time()
print(end-start)