# Analyse false positives using linguistic methods

## Important constants

'out of date' is replaced by 'outdated' before the linguistic analysis, so it is covered by the 'outdated' keyword.

In [1]:
# Keywords whose presence marks a sentence for analysis
# (checked against sentence text and token text in analyse_sentence).
KEYWORDS = ['outdated', 'obsolete', 'deprecated', 'discouraged']
# Substrings that flag a subject/sentence as possibly irrelevant
# (used by check_subject_relevance).
IRREL_SUBS = ['data', 'disk', 'sensor', 'browser', 'actor']

## Methods about linguistic analysis

We use the 'spaCy' package. Run `python -m spacy download en_core_web_sm` beforehand to download the English language model.

In [2]:
import spacy
from spacy.tokens  import Token, Doc, Span
from typing import List, Dict, Optional

# nlp = spacy.load("en_core_web_sm")

# Re-declared here with the same values as in the constants cell,
# so this cell can be run on its own.
KEYWORDS = ['outdated', 'obsolete', 'deprecated', 'discouraged']

def analyse_sentence(sentence: spacy.tokens.Span) -> Optional[Dict]:
    """
    Analyse a sentence from a doc.

    Returns None when no entry of KEYWORDS occurs in the sentence text
    (plain substring check). Otherwise returns a dict with the keys:

    modified_noun: text of the token the keyword is found to describe,
        or "" when none of the handled patterns matched.
    punctuation: text of the closing token if it is punctuation
        (? . ! ...), otherwise "".
    negative_statement: True if any token has the dependency 'neg' (not).
    text: the sentence text.

    dep of keyword: ROOT, amod or acomp ('conj' is resolved via its children)
    https://spacy.io/usage/linguistic-features#pos-tagging
    """
    text = sentence.text
    if not any(kw in text for kw in KEYWORDS):
        return None

    # Scan every token of the sentence. A keyword may be the very last
    # token when the sentence has no closing punctuation.
    tokens = list(sentence)
    last_token = tokens[-1]
    # Store plain text rather than the Token itself, so the result does not
    # keep the whole parsed Doc referenced, and report "" when the sentence
    # does not actually end with punctuation.
    punctuation = last_token.text if last_token.is_punct else ""

    target = ""
    negation = False
    for token in tokens:
        t_dep = token.dep_
        if t_dep == 'neg':
            negation = True
        if token.text not in KEYWORDS:
            continue
        if t_dep == 'conj':
            # A conjunct keyword takes over the role of an amod/acomp child
            # (the last such child wins).
            for child in token.children:
                if child.dep_ in ('amod', 'acomp'):
                    t_dep = child.dep_
        if t_dep == 'ROOT':
            # Keyword is the root of the sentence: take its passive subject.
            for child in token.children:
                if child.dep_ == 'nsubjpass':
                    target = child.text
        elif t_dep == 'amod':
            for child in token.children:
                if child.dep_ == 'attr':
                    target = child.text
        elif t_dep == 'acomp':  # dep_ can be 'acomp'
            # Take the first nominal subject found in the sentence.
            for t in tokens:
                if t.dep_ == 'nsubj':
                    target = t.text
                    break
        else:
            # Unhandled dependency: dump details for manual inspection.
            print(f"{token.text}, {token.dep_}, {list(token.subtree)}\n{token.doc}\n")
    return {
        "modified_noun": target,
        "punctuation": punctuation,
        "negative_statement": negation,
        "text": sentence.text
    }

def analyse_content(text: str, nlp) -> Dict:
    """Analyse a text i.e. the comment.

    'out of date' is rewritten to 'outdated' before parsing. Every sentence
    of the parsed text is passed to analyse_sentence; sentences for which it
    returns None are ignored.

    Args:
        text: the raw comment text.
        nlp: callable that turns a string into a parsed document exposing
            `.sents` (e.g. a loaded spaCy pipeline).

    Returns:
        A dict with the keys:
        cnt_keywords: number of analysed sentences (0 when there are none).
        subject / punctuation / sentence: values of the first analysed
            sentence, "" when there is none.
        negative_statement: number of analysed sentences flagged as negated.
    """
    clean_text = text.replace('out of date', 'outdated')
    document: Doc = nlp(clean_text)
    analysis: List[dict] = []
    for sent in document.sents:
        result = analyse_sentence(sent)
        if result is not None:
            analysis.append(result)
    neg_cnt = sum(r.get('negative_statement', 0) for r in analysis)
    # Fall back to an empty record for the "first sentence" fields without
    # padding `analysis`, so the count below stays 0 when nothing matched.
    first = analysis[0] if analysis else {}
    output = {
        "cnt_keywords": len(analysis),
        "subject": first.get('modified_noun', ""),
        "punctuation": first.get('punctuation', ""),
        "negative_statement": neg_cnt,
        "sentence": first.get('text', '')
    }
    return output

def print_tokens(tokens: List[Token]):
    """Print one tab-separated row per token, preceded by a header row.

    Columns: index, text, dependency label, tag, part of speech, head,
    list of ancestors, list of children.
    """
    print("Idx\tText\tdep\ttag\tpos\thead\tancestor\tchildren")
    for tok in tokens:
        columns = (
            tok.i,
            tok.text,
            tok.dep_,
            tok.tag_,
            tok.pos_,
            tok.head,
            list(tok.ancestors),
            list(tok.children),
        )
        print("\t".join(str(col) for col in columns))


## Methods about data processing

In [3]:
import pandas as pd

def append_heuristic_features(raw: pd.DataFrame, nlp) -> pd.DataFrame:
    """Add columns about linguistic features.

    Runs analyse_content on every value of the 'Text' column, places the
    resulting feature columns next to the original ones, and returns only
    the selected original and feature columns, in a fixed order.
    """
    original_cols = ['Id', 'PostId', 'Score', 'Text', 'CreationDate',
                     'UserId', 'ContentLicense', 'of_answer']
    feature_cols = ['cnt_keywords', 'subject',
                    'punctuation', 'negative_statement', 'sentence']

    # One feature dict per comment.
    per_comment: pd.Series = raw['Text'].apply(
        lambda comment: analyse_content(comment, nlp)
    )
    features = pd.DataFrame(list(per_comment))

    # Reset the index so rows line up positionally with the feature frame.
    combined = pd.concat([raw.reset_index(), features], axis=1)
    return combined[original_cols + feature_cols]

def check_subject_relevance(raw: pd.DataFrame) -> pd.DataFrame:
    """Flag rows whose subject or sentence contains an IRREL_SUBS entry.

    Returns a copy of `raw` with two boolean columns added:
    'subj_irrel' (match in column 'subject') and
    'include_irrel' (match in column 'sentence').
    """
    def mentions_irrelevant(value) -> bool:
        # True as soon as one irrelevant substring is found.
        for word in IRREL_SUBS:
            if word in value:
                return True
        return False

    subject_flags = raw['subject'].apply(mentions_irrelevant)
    sentence_flags = raw['sentence'].apply(mentions_irrelevant)
    return raw.assign(subj_irrel=subject_flags, include_irrel=sentence_flags)


## Work on data

In [4]:
# Input CSV, read by the script cell below.
# NOTE(review): the paths use backslash separators (Windows style) - confirm
# whether this needs to run on other platforms.
PATH_filtered_comments = r"data\pipeline\outdated_comments_by_keywords.csv"
# Output CSV that receives the comments with the added feature columns.
PATH_comments_feated = r"data\pipeline\obsol_com_keywords_heuri.csv"

In [None]:
if __name__ == "__main__":
    # Load the input comments.
    data_comments = pd.read_csv(PATH_filtered_comments)
    # Load the spaCy English model (must be downloaded beforehand).
    nlp = spacy.load("en_core_web_sm")
    # Step 1: add the linguistic feature columns.
    comments_df1 = append_heuristic_features(data_comments, nlp)
    # Step 2: add the irrelevance flags based on subject / sentence.
    comments_df2 = check_subject_relevance(comments_df1)
    # Write the result without the DataFrame index.
    comments_df2.to_csv(PATH_comments_feated, index=False)