In [4]:
# !pip3 install readability-lxml
# !python3 -m spacy download en_core_web_sm
# import spacy

# nlp = spacy.load("en_core_web_sm")

In [5]:
# !pip3 install wordcloud
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import itertools
import pickle
import os
from spacy.lang.en import English
import pandas as pd
import numpy as np
import os
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
from readability import Document

# Classifier

In [9]:
# Part-of-speech tag strings.
PoS = ["NN", "PRP", "IN", "DT", "RB", "JJ", "VB", "CC", "NNS", "VBP"]

# Lower-cased first-person pronouns; compared against lower-cased token text.
FirstPersonPronouns = [
    "i", "me", "mine", "my", "myself",
    "our", "ours", "ourselves", "us", "we",
]

# Optional spaCy data location from the SPACY_DATA environment variable
# (None when the variable is not set).
data_dir = os.environ.get("SPACY_DATA")
print("Load EN from %s" % data_dir)

# NOTE(review): the feature functions below read doc.sents, token.tag_,
# token.pos_, token.head and doc.noun_chunks. Confirm that English(...) as
# constructed here provides a tagger/parser with the installed spaCy version;
# the commented-out cell above uses spacy.load("en_core_web_sm") instead.
nlp = English(data_dir=data_dir)


def getTextFromRecord(row):
    """Return the text of a record, or None when it was deleted.

    Uses ``row["body"]`` when it is not null and falls back to
    ``row["selftext"]`` otherwise. The literal marker ``"[deleted]"`` is
    mapped to ``None``; any other value is returned unchanged.
    """
    text = row["selftext"] if pd.isnull(row["body"]) else row["body"]
    return None if text == "[deleted]" else text


def addLexicalFeatures(df):
    """Add lexical feature columns to ``df`` (modified in place) and return it.

    Columns written:
      * ``text`` - result of ``getTextFromRecord`` for each row.
      * ``documentSimilarity`` - mean cosine similarity between the token
        counts of consecutive sentences (None for fewer than two sentences).
      * ``documentSimilarityNounsOrPronouns`` - same, restricted to tokens
        whose tag starts with "N" or "PR".
      * ``pronouns`` / ``pronounsNo`` / ``sentencesNo`` - Counter of tokens
        whose tag starts with "PRP", their total, and the sentence count.
      * ``definiteArticlesNo`` - number of "DT" tokens whose text is "the".
      * ``firstPersonPronouns`` / ``firstPersonPronounsNo`` - the subset of
        ``pronouns`` found in ``FirstPersonPronouns`` and its total.
      * ``firstPersonPronounsRatio`` - ``firstPersonPronounsNo / pronounsNo``.

    Each text is passed through ``str()`` before parsing, so a ``None`` text
    is parsed as the literal string "None".
    """
    import math
    from collections import Counter

    df["text"] = df.apply(getTextFromRecord, axis=1)

    def cosineSimilarity(sentence1, sentence2, NounsOrPronounsOnly=False):
        """Cosine similarity of the lower-cased token counts of two sentences."""

        def countTokens(sentence):
            # The tag is only inspected when the noun/pronoun filter is on.
            return Counter(
                token.text.lower()
                for token in sentence
                if not NounsOrPronounsOnly
                or token.tag_.startswith("N")
                or token.tag_.startswith("PR")
            )

        vector1 = countTokens(sentence1)
        vector2 = countTokens(sentence2)
        intersection = set(vector1.keys()) & set(vector2.keys())
        numerator = sum(vector1[x] * vector2[x] for x in intersection)
        sum1 = sum(count ** 2 for count in vector1.values())
        sum2 = sum(count ** 2 for count in vector2.values())
        denominator = math.sqrt(sum1) * math.sqrt(sum2)
        if not denominator:
            # At least one sentence has no (matching) tokens.
            return 0.0
        return float(numerator) / denominator

    def getDocumentSimilarity(text, NounsOrPronounsOnly=False):
        """Mean similarity of adjacent sentence pairs, or None if < 2 sentences."""
        doc = nlp(str(text))
        sentences = list(doc.sents)
        if len(sentences) < 2:
            return None
        rs = 0
        for i in range(0, len(sentences) - 1):
            rs += cosineSimilarity(sentences[i], sentences[i + 1], NounsOrPronounsOnly)
        return rs / float(len(sentences) - 1)

    def getPronounsCounter(text):
        """Count tokens tagged "PRP*" and the number of sentences in ``text``."""
        doc = nlp(str(text))
        sentences = list(doc.sents)

        pronouns = []
        for sentence in sentences:
            pronouns.extend(
                [token.text for token in sentence if token.tag_.startswith("PRP")]
            )

        pronouns = Counter(pronouns)
        pronounsNo = np.sum(list(pronouns.values()))
        return pd.Series(
            {
                "sentencesNo": len(sentences),
                "pronouns": pronouns,
                "pronounsNo": pronounsNo,
            }
        )

    def getDefiniteArticlesCounter(text):
        """Count tokens tagged "DT" whose lower-cased text is "the"."""
        doc = nlp(str(text))
        sentences = list(doc.sents)

        definiteArticles = []
        for sentence in sentences:
            definiteArticles.extend(
                [
                    token.text
                    for token in sentence
                    if (token.tag_ == "DT" and token.text.lower() == "the")
                ]
            )

        definiteArticles = Counter(definiteArticles)
        definiteArticlesNo = np.sum(list(definiteArticles.values()))
        return pd.Series({"definiteArticlesNo": definiteArticlesNo})

    def getFirstPersonPronounsCounter(counter):
        """Restrict a pronoun Counter to entries listed in FirstPersonPronouns."""
        firstPersonPronouns = Counter(
            {k: counter[k] for k in counter.keys() if k.lower() in FirstPersonPronouns}
        )
        firstPersonPronounsNo = np.sum(list(firstPersonPronouns.values()))
        return pd.Series(
            {
                "firstPersonPronouns": firstPersonPronouns,
                "firstPersonPronounsNo": firstPersonPronounsNo,
            }
        )

    df["documentSimilarity"] = df["text"].apply(getDocumentSimilarity)
    df["documentSimilarityNounsOrPronouns"] = df["text"].apply(
        getDocumentSimilarity, args=(True,)
    )

    # Assigning a DataFrame to a list of columns pairs them up by POSITION,
    # not by name. The helper Series are built in a different key order than
    # the target column list, so the result is re-selected by name first;
    # otherwise e.g. "pronouns" would receive the sentence counts.
    pronounColumns = ["pronouns", "pronounsNo", "sentencesNo"]
    df[pronounColumns] = df["text"].apply(getPronounsCounter)[pronounColumns]

    articleColumns = ["definiteArticlesNo"]
    df[articleColumns] = df["text"].apply(getDefiniteArticlesCounter)[articleColumns]

    firstPersonColumns = ["firstPersonPronouns", "firstPersonPronounsNo"]
    df[firstPersonColumns] = df["pronouns"].apply(getFirstPersonPronounsCounter)[
        firstPersonColumns
    ]

    df["firstPersonPronounsRatio"] = df["firstPersonPronounsNo"] / df[
        "pronounsNo"
    ].astype(float)
    return df


def getSyntacticFeatures(row):
    """Compute syntactic features for one record.

    Returns a ``pd.Series`` (dtype object) with:
      * ``maxHeight`` - the largest number of head links between any token
        and the token that is its own head, over all sentences.
      * ``noun_chunks`` - number of noun chunks in the document.
      * ``maxVerbPhraseLength`` - longest run of consecutive tokens whose
        ``pos_`` is "VERB", over all sentences.
      * ``subordinateConjuctions`` - average number of tokens tagged "IN"
        per sentence.

    All four values are NaN when the record has no text (see
    ``getTextFromRecord``) or when the parsed document has no sentences.
    """

    def emptyFeatures():
        return pd.Series(
            {
                "maxHeight": np.nan,
                "noun_chunks": np.nan,
                "maxVerbPhraseLength": np.nan,
                "subordinateConjuctions": np.nan,
            },
            dtype=object,
        )

    def getHeightToken(token):
        # Walk up the head chain until reaching the token that is its own head.
        height = 1
        while token != token.head:
            height += 1
            token = token.head
        return height

    def getVerbPhrasesLength(sentence):
        # Lengths of every maximal run of consecutive VERB tokens. The leading
        # 0 guarantees a non-empty result for sentences without verbs.
        rs = [0]
        runLength = 0
        for token in sentence:
            if token.pos_ == "VERB":
                runLength += 1
            elif runLength:
                rs.append(runLength)
                runLength = 0
        if runLength:
            # A run that reaches the end of the sentence must be recorded too.
            rs.append(runLength)
        return rs

    text = getTextFromRecord(row)
    if text is None:
        return emptyFeatures()

    doc = nlp(str(text))
    sentences = list(doc.sents)
    if not sentences:
        # Avoid dividing by zero / calling max() on an empty sequence below.
        return emptyFeatures()

    noun_chunks = len(list(doc.noun_chunks))

    subordinateConjuctions = sum(
        1 for sentence in sentences for token in sentence if token.tag_ == "IN"
    ) / float(len(sentences))

    maxHeight = 1
    for sentence in sentences:
        if sentence.end == 0:
            continue
        sentenceHeight = max(getHeightToken(token) for token in sentence)
        if maxHeight < sentenceHeight:
            maxHeight = sentenceHeight

    # Flatten before taking the maximum: max() over the per-sentence lists
    # would compare them lexicographically and could miss the longest run.
    maxVerbPhraseLength = max(
        length
        for sentence in sentences
        for length in getVerbPhrasesLength(sentence)
    )

    return pd.Series(
        {
            "maxHeight": maxHeight - 1,
            "noun_chunks": noun_chunks,
            "maxVerbPhraseLength": maxVerbPhraseLength,
            "subordinateConjuctions": subordinateConjuctions,
        },
        dtype=object,
    )


def getURLtoPostRatio(df):
    """Fraction of posts whose ``url`` does not contain their ``permalink``.

    A row counts as a post when the string form of its ``parent_id`` is
    "nan". Raises ZeroDivisionError when ``df`` contains no such rows.
    """
    is_post = df["parent_id"].astype(str) == "nan"
    posts = df[is_post]

    # True for posts whose url points somewhere other than their permalink.
    links_elsewhere = posts.apply(
        lambda row: row["permalink"] not in row["url"], axis=1
    )
    return len(posts[links_elsewhere]) / float(len(posts))


# def getData(df1, df2):
#     import readability
#     df1 = readability.prepare(df1)
#     df2 = readability.prepare(df2)

#     if "text" not in df1.columns:
#         df1["text"] = df1.apply(getTextFromRecord, axis=1)
#     if "text" not in df2.columns:
#         df2["text"] = df2.apply(getTextFromRecord, axis=1)
#     df1["class"] = "class1"
#     df2["class"] = "class2"

#     df1 = df1[["text", "class"]]
#     df2 = df2[["text", "class"]]
#     data = pd.concat([df1, df2])
#     data = data.reset_index()
#     del data["index"]
#     data = data.reindex(np.random.permutation(data.index))
#     return data


def preprocess_record(record):
    """Run ``record`` through readability's ``Document`` and return its summary.

    NOTE(review): ``Document.summary()`` appears to return HTML markup rather
    than plain text - confirm that markup is intended as classifier input.
    """
    return Document(record).summary()

def preprocess_dataframe(df):
    """Replace ``df["text"]`` with its preprocessed form and return ``df``.

    The frame is modified in place; the same object is returned.
    """
    cleaned_text = df['text'].apply(preprocess_record)
    df['text'] = cleaned_text
    return df

def getData(df1, df2):
    """Build a shuffled, labelled two-class dataset from two frames.

    Each input is copied (the caller's frames are left untouched), given a
    ``text`` column via ``getTextFromRecord`` if it lacks one, preprocessed
    with ``preprocess_dataframe`` and labelled "class1" (``df1``) or
    "class2" (``df2``).

    Returns a DataFrame with columns ``text`` and ``class``, rows in random
    order and a fresh 0..n-1 index.
    """

    def prepare(df, label):
        frame = df.copy()
        # The fallback must run BEFORE preprocessing, which reads frame["text"]
        # and would otherwise raise KeyError on frames without that column.
        if "text" not in frame.columns:
            frame["text"] = frame.apply(getTextFromRecord, axis=1)
        frame = preprocess_dataframe(frame)
        frame["class"] = label
        return frame[["text", "class"]]

    data = pd.concat([prepare(df1, "class1"), prepare(df2, "class2")])
    data = data.sample(frac=1).reset_index(drop=True)

    return data




def bagOfWords(ngram_range=(2, 2)):
    """Create an unfitted word n-gram count + multinomial naive Bayes pipeline.

    ``ngram_range`` is forwarded to ``CountVectorizer``. The pipeline steps
    are named "count_vectorizer" and "classifier".
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, analyzer="word")
    steps = [
        ("count_vectorizer", vectorizer),
        ("classifier", MultinomialNB()),
    ]
    return Pipeline(steps)


def evaluate(pipeline, data, getScore=False):
    """Cross-validate ``pipeline`` on ``data`` with 6 unshuffled folds.

    ``data`` must provide ``text`` and ``class`` columns, with classes
    "class1" and "class2"; "class1" is treated as the positive label.
    Prints the number of documents, the mean F1 / precision / recall over
    the folds and the summed confusion matrix (rows and columns ordered
    class1, class2).

    Returns the pipeline as fitted on the last fold's training split, or
    ``(pipeline, mean_f1)`` when ``getScore`` is true.
    """
    labels = ["class1", "class2"]
    k_fold = KFold(n_splits=6)
    scores = []
    confusion = np.array([[0, 0], [0, 0]])
    precision = 0
    recall = 0
    for train_indices, test_indices in k_fold.split(data):
        train = data.iloc[train_indices]
        test = data.iloc[test_indices]

        train_text = train["text"].values
        train_y = train["class"].values.astype(str)
        test_text = test["text"].values
        test_y = test["class"].values.astype(str)

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)

        # Pin the label order so every fold yields a 2x2 matrix. Without it,
        # a fold in which only one class occurs returns a smaller matrix that
        # would be silently broadcast into the accumulator.
        confusion += confusion_matrix(test_y, predictions, labels=labels)
        score = f1_score(test_y, predictions, pos_label="class1")
        precision += precision_score(test_y, predictions, pos_label="class1")
        recall += recall_score(test_y, predictions, pos_label="class1")
        scores.append(score)

    print("\nTotal documents classified:", len(data))
    print("\nScore:", sum(scores) / len(scores))
    print("\nPrecision:", precision / len(scores))
    print("\nRecall:", recall / len(scores))
    print("\nConfusion matrix:")
    print(confusion)
    if getScore:
        return pipeline, sum(scores) / float(len(scores))
    return pipeline


def readResults(deviations=False, path="./data/combinations-10fold.pkl"):
    """Load pickled per-combination score lists and reduce each to one number.

    Parameters
    ----------
    deviations : bool
        When true each cell becomes ``np.std`` of its scores, otherwise
        ``np.mean``.
    path : str
        Pickle file to read; defaults to the results file used by this
        notebook.

    Returns
    -------
    pandas.DataFrame built from the unpickled object, with every populated
    cell replaced by the aggregate. Cells that are None or NaN (combinations
    with no stored result) are left untouched.

    Note: unpickling can execute arbitrary code, so only read trusted files.
    """
    df = pd.DataFrame(pd.read_pickle(path))
    aggregate = np.std if deviations else np.mean
    for column in df.columns:
        for index in df.index:
            tmp = df.at[index, column]
            if tmp is None or (np.isscalar(tmp) and pd.isnull(tmp)):
                continue
            # .at writes into the frame itself; chained df[column][index]
            # assignment may write to a temporary copy instead.
            df.at[index, column] = aggregate(tmp)
    return df



#binaryClassification.py

subreddits = ["./data/suicidewatch-sample", "./data/depression-sample"]
subreddits.sort()


ngram_range = (1, 2)
fname = "./data/combinations-10fold.pkl"
combinations = list(itertools.combinations(subreddits, 2))


def _loadPosts(path):
    """Load ``path + ".pkl"``, shuffle it and keep only posts that have text."""
    df = pd.read_pickle(path + ".pkl")
    df = df.reset_index()
    df["text"] = df.apply(getTextFromRecord, axis=1)
    df = df.reindex(np.random.permutation(df.index))

    # keep only posts (rows whose parent_id stringifies to "nan") with text
    df = df[df["parent_id"].astype(str) == "nan"]
    return df.dropna(subset=["text"])


for combination in combinations:
    # Reload previously stored results so they are extended, not overwritten.
    # NOTE: pickle.load can execute arbitrary code; fname must be a trusted
    # local file.
    if os.path.isfile(fname):
        with open(fname, "rb") as f:
            rs = pickle.load(f)
    else:
        rs = {}

    # if combination[0] in rs.keys():
    #     if combination[1] in rs[combination[0]].keys():
    #         print(combination, "already exists, skipping...")
    #         continue

    print("doing", combination[0], "-", combination[1])
    df1 = _loadPosts(combination[0])
    df2 = _loadPosts(combination[1])

    results = []
    print("choosing min from", len(df1), len(df2))
    # Balance the classes by sampling the same number of posts from each.
    m = min(len(df1), len(df2))
    for i in range(0, 10):
        df1_min = df1.reindex(np.random.permutation(df1.index)).head(m)
        df2_min = df2.reindex(np.random.permutation(df2.index)).head(m)

        data = getData(df1_min, df2_min)
        print("got", len(data), "records...training")
        pipeline = bagOfWords(ngram_range=ngram_range)
        pipeline, score = evaluate(pipeline, data=data, getScore=True)
        results.append(score)

    # Store and persist inside the loop so every combination is saved, and
    # always record the scores, including when combination[0] already has an
    # entry in the loaded results.
    print("RESULTS:", combination, results)
    rs.setdefault(combination[0], {})[combination[1]] = results

    with open(fname, "wb") as f:
        pickle.dump(rs, f)
    print("finished", combination)

Load EN from None
doing ./data/depression-sample - ./data/suicidewatch-sample
choosing min from 857 555
got 1110 records...training

Total documents classified: 1110

Score: 0.6666033350411994

Precision: 0.656450902569578

Recall: 0.6815438751526721

Confusion matrix:
[[378 177]
 [200 355]]
got 1110 records...training

Total documents classified: 1110

Score: 0.6387828445939984

Precision: 0.7090247101204192

Recall: 0.6045402386023842

Confusion matrix:
[[331 224]
 [149 406]]
got 1110 records...training

Total documents classified: 1110

Score: 0.651782599662758

Precision: 0.6586455003331314

Recall: 0.6522876565584951

Confusion matrix:
[[361 194]
 [190 365]]
got 1110 records...training

Total documents classified: 1110

Score: 0.6468897765818583

Precision: 0.6856096438271077

Recall: 0.6251648795116357

Confusion matrix:
[[346 209]
 [168 387]]
got 1110 records...training

Total documents classified: 1110

Score: 0.6201117605077056

Precision: 0.6828630668835958

Recall: 0.5940381

With reference to BagOfWords:

> **Character n-gram Language Models.** Language models are frequently employed to estimate the likelihood of a given sequence of words. This is done often by examining a
> moving window of n words (n-gram). Traditionally each word is treated as a token, but
> previous work indicates that treating each character as a token creates classifiers that capture some of the creative language use and emoticons frequently found on social media and
> are afforded a modicum of robustness to misspellings [McNamee and Mayfield, 2004; Coppersmith et al., 2015a].

https://qntfy.com/static/papers/jsm2015.pdf


In this example we use words (rather than characters) as the tokens in the Pipeline: <br>
CountVectorizer - converts a collection of text documents to a matrix of token counts, using the specified `ngram_range` <br>
Analyzer - set to "word"