In [None]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
import pandas
import os
import itertools
import pickle

# import content

In [None]:
# One-time setup (run in a shell or uncomment): spacy.load() below needs this model installed.
# !python3 -m spacy download en_core_web_sm
import spacy


# Load the English pipeline (tokenizer, tagger, parser and NER).
# `nlp` is read as a global by the feature-extraction functions defined further down.
nlp = spacy.load("en_core_web_sm")

Content.py

In [None]:
# Part-of-speech tags that getLanguageFeatures keeps in its frequency report.
PoS = "NN PRP IN DT RB JJ VB CC NNS VBP".split()

# Lower-cased first-person pronouns used to filter the pronoun counters.
FirstPersonPronouns = "i me mine my myself our ours ourselves us we".split()

# An unset and an empty SPACY_DATA both mean "no custom data directory".
data_dir = os.environ.get("SPACY_DATA") or None
print("Load EN from %s" % data_dir)

# Earlier model-loading code, kept for reference only:
# from spacy.en import English
# nlp = English(data_dir=data_dir)


def getTextFromRecord(row):
    """Return the text of one record, or None when it was deleted.

    The "body" field is used when it is not null; otherwise the "selftext"
    field is used. A text equal to the literal "[deleted]" yields None.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a "body" key and,
            when "body" is null, a "selftext" key.

    Returns:
        The selected text, or None for deleted content.
    """
    text = row["selftext"] if pandas.isnull(row["body"]) else row["body"]
    return None if text == "[deleted]" else text


def addLexicalFeatures(df):
    """Add lexical features derived from each record's text to `df`.

    The frame is modified in place and also returned. A "text" column is
    (re)computed with getTextFromRecord, each text is parsed once with the
    global spaCy pipeline `nlp`, and these columns are written:

    - documentSimilarity: mean cosine similarity between the bag-of-words
      vectors of consecutive sentences (None for fewer than two sentences).
    - documentSimilarityNounsOrPronouns: the same, restricted to tokens whose
      tag starts with "N" or "PR".
    - pronouns / pronounsNo: Counter of tokens tagged "PRP*" and its total.
    - sentencesNo: number of sentences found by the parser.
    - definiteArticlesNo: number of "DT" tokens whose text is "the".
    - firstPersonPronouns / firstPersonPronounsNo: the subset of `pronouns`
      listed in FirstPersonPronouns (case-insensitive) and its total.
    - firstPersonPronounsRatio: firstPersonPronounsNo / pronounsNo.

    Args:
        df: DataFrame whose rows are accepted by getTextFromRecord.

    Returns:
        The same DataFrame with the columns above added.
    """
    import math
    from collections import Counter

    df["text"] = df.apply(getTextFromRecord, axis=1)

    def tokenCounts(sentence, NounsOrPronounesOnly):
        """Count lower-cased token texts, optionally keeping only N*/PR* tags."""
        return Counter(
            token.text.lower()
            for token in sentence
            if not NounsOrPronounesOnly or token.tag_.startswith(("N", "PR"))
        )

    def cosineSimilarity(sentence1, sentence2, NounsOrPronounesOnly=False):
        """Cosine similarity of two sentences' token-count vectors (0.0 if either is empty)."""
        vector1 = tokenCounts(sentence1, NounsOrPronounesOnly)
        vector2 = tokenCounts(sentence2, NounsOrPronounesOnly)
        shared = set(vector1) & set(vector2)
        numerator = sum(vector1[word] * vector2[word] for word in shared)
        norm1 = math.sqrt(sum(count ** 2 for count in vector1.values()))
        norm2 = math.sqrt(sum(count ** 2 for count in vector2.values()))
        denominator = norm1 * norm2
        if not denominator:
            return 0.0
        return float(numerator) / denominator

    def getDocumentSimilarity(sentences, NounsOrPronounesOnly=False):
        """Mean similarity of each sentence with the one that follows it."""
        if len(sentences) < 2:
            return None
        total = sum(
            cosineSimilarity(current, following, NounsOrPronounesOnly)
            for current, following in zip(sentences, sentences[1:])
        )
        return total / float(len(sentences) - 1)

    def getRecordFeatures(text):
        """Parse one text once and derive every per-record feature from that parse."""
        # str() replaces the Python 2-only unicode() builtin.
        # NOTE(review): a missing text (None/NaN) is still parsed as the literal
        # word "None"/"nan", exactly as before - confirm whether such rows should
        # be filtered out by the caller instead.
        sentences = list(nlp(str(text)).sents)
        tokens = [token for sentence in sentences for token in sentence]

        pronouns = Counter(
            token.text for token in tokens if token.tag_.startswith("PRP")
        )
        firstPersonPronouns = Counter(
            {
                word: count
                for word, count in pronouns.items()
                if word.lower() in FirstPersonPronouns
            }
        )
        definiteArticlesNo = sum(
            1
            for token in tokens
            if token.tag_ == "DT" and token.text.lower() == "the"
        )
        return {
            "documentSimilarity": getDocumentSimilarity(sentences),
            "documentSimilarityNounsOrPronouns": getDocumentSimilarity(
                sentences, True
            ),
            "pronouns": pronouns,
            # Built-in sum(): np.sum() does not add up a dict view on Python 3.
            "pronounsNo": sum(pronouns.values()),
            "sentencesNo": len(sentences),
            "definiteArticlesNo": definiteArticlesNo,
            "firstPersonPronouns": firstPersonPronouns,
            "firstPersonPronounsNo": sum(firstPersonPronouns.values()),
        }

    featureColumns = [
        "documentSimilarity",
        "documentSimilarityNounsOrPronouns",
        "pronouns",
        "pronounsNo",
        "sentencesNo",
        "definiteArticlesNo",
        "firstPersonPronouns",
        "firstPersonPronounsNo",
    ]
    features = pandas.DataFrame(
        [getRecordFeatures(text) for text in df["text"]],
        index=df.index,
        columns=featureColumns,
    )
    # Copy column by column, by NAME: `df[[a, b, c]] = frame` assigns
    # positionally, which silently mismatched columns once dict key order
    # stopped being alphabetical.
    for column in featureColumns:
        df[column] = features[column]

    df["firstPersonPronounsRatio"] = df["firstPersonPronounsNo"] / df[
        "pronounsNo"
    ].astype(float)
    return df


# def getIndications(df):
# 	indications = pd.read_csv("meddra_all_label_indications.tsv", delimiter='\t', header=None)
# 	indications = set(indications[8].tolist())
# 	indications = " ".join(indications)
# 	def textsOverlap(text, indications):
# 		rs = False
# 		tokens=  text.text.split(" ")
# 		for token in tokens:
# 			if (" " + token + " ") in indications:
# 				return True
# 		return False

# 	from collections import Counter
# 	df['text'] = df.apply(getTextFromRecord,axis=1)
# 	rs = Counter()
# 	for row in df[['text']].itertuples():
# 		text = row[1]
# 		doc = nlp(unicode(text))
# 		for sentence in list(doc.sents):
# 			nouns = [t for t in sentence  if t.pos_=='NOUN']
# 			rs += Counter([t.text.lower() for t in nouns if textsOverlap(t, indications)])
# 		# rs += Counter([t.text.lower() for t in list(doc.noun_chunks) if textsOverlap(t, indications)])
# 	return rs


def getSyntacticFeatures(row):
    """Compute syntactic features for one record.

    The text is taken from getTextFromRecord and parsed with the global spaCy
    pipeline `nlp`.

    Args:
        row: Record accepted by getTextFromRecord.

    Returns:
        pandas.Series (dtype object) with, in this order:
        - maxHeight: largest number of head links between any token and the
          root of its sentence.
        - noun_chunks: number of noun chunks in the document.
        - maxVerbPhraseLength: longest run of consecutive tokens whose pos_
          is "VERB" (0 when there is none).
        - subordinateConjuctions: tokens tagged "IN" per sentence.
        All four values are NaN when the text is missing or has no sentences.
    """
    # http://stackoverflow.com/questions/33289820/noun-phrases-with-spacy

    def missingFeatures():
        return pandas.Series(
            {
                "maxHeight": np.nan,
                "noun_chunks": np.nan,
                "maxVerbPhraseLength": np.nan,
                "subordinateConjuctions": np.nan,
            },
            dtype=object,
        )

    def getHeightToken(token):
        """Number of tokens on the path from `token` up to its root, inclusive."""
        height = 1
        # The loop stops at the root, which is the token that is its own head.
        while token != token.head:
            height += 1
            token = token.head
        return height

    def getVerbPhrasesLength(sentence):
        """Lengths of all runs of consecutive VERB tokens, preceded by a 0."""
        lengths = [0]
        run = 0
        for token in sentence:
            if token.pos_ == "VERB":
                run += 1
            elif run:
                lengths.append(run)
                run = 0
        # A run that reaches the end of the sentence used to be dropped.
        if run:
            lengths.append(run)
        return lengths

    text = getTextFromRecord(row)
    # isnull() covers both None (deleted) and NaN (no body and no selftext).
    if pandas.isnull(text):
        return missingFeatures()

    # str() replaces the Python 2-only unicode() builtin.
    doc = nlp(str(text))
    sentences = list(doc.sents)
    if not sentences:
        # e.g. empty text: avoids dividing by zero / max() of nothing below.
        return missingFeatures()

    noun_chunks = len(list(doc.noun_chunks))

    subordinateConjuctions = sum(
        1 for sentence in sentences for token in sentence if token.tag_ == "IN"
    ) / float(len(sentences))

    maxHeight = max(
        (getHeightToken(token) for sentence in sentences for token in sentence),
        default=1,
    )

    # Flatten before taking the maximum: max() over the per-sentence lists
    # compares them lexicographically and can miss the longest run.
    maxVerbPhraseLength = max(
        length
        for sentence in sentences
        for length in getVerbPhrasesLength(sentence)
    )

    return pandas.Series(
        {
            "maxHeight": maxHeight - 1,
            "noun_chunks": noun_chunks,
            "maxVerbPhraseLength": maxVerbPhraseLength,
            "subordinateConjuctions": subordinateConjuctions,
        },
        dtype=object,
    )


def getLanguageFeatures(df):
    """Summarise part-of-speech usage and the mean "ll" value of a frame.

    Comments (rows whose parent_id is not NaN) contribute their "body", posts
    (parent_id NaN) their "selftext". Texts are tagged with textblob and the
    tag counts are turned into percentages of all tags seen.

    Args:
        df: DataFrame with "parent_id", "body" and "selftext" columns.

    Returns:
        dict with one "PoS-<tag>" entry (percentage, ordered by decreasing
        frequency) for every observed tag listed in the module-level `PoS`,
        plus "ll": the mean of the "ll" column produced by
        readability.updateVocabularyFeatures.
    """
    from collections import Counter
    import textblob
    import readability  # NOTE(review): presumably the project's own module (prepare / updateVocabularyFeatures) - confirm

    isPost = df["parent_id"].astype(str) == "nan"
    comments = df[~isPost]
    posts = df[isPost]

    tagCounts = Counter()
    for texts in (comments["body"], posts["selftext"]):
        for text in texts:
            if pandas.isnull(text):
                # Nothing to tag; TextBlob only accepts strings.
                continue
            # update() accumulates in place instead of rebuilding the Counter
            # for every record.
            tagCounts.update(tag for _, tag in textblob.TextBlob(text).pos_tags)

    # Built-in sum(): np.sum() does not add up a dict view on Python 3.
    total = float(sum(tagCounts.values()))
    ranked = sorted(
        ((tag, (100 * count) / total) for tag, count in tagCounts.items()),
        key=lambda item: item[1],
        reverse=True,
    )
    d1 = {"PoS-" + tag: frequency for tag, frequency in ranked if tag in PoS}

    df = readability.prepare(df)
    df = readability.updateVocabularyFeatures(df)
    # Was `df1["ll"] = ...`: `df1` does not exist in this function (NameError).
    d1["ll"] = df["ll"].mean()
    return d1


def addSyntacticFeatures(df):
    """Run getSyntacticFeatures on every row and store the results as columns.

    The frame is modified in place and also returned.
    """
    syntacticColumns = [
        "maxHeight",
        "noun_chunks",
        "maxVerbPhraseLength",
        "subordinateConjuctions",
    ]
    perRowFeatures = df.apply(getSyntacticFeatures, axis=1)
    df[syntacticColumns] = perRowFeatures
    return df


def getURLtoPostRatio(df):
    """Return the fraction of posts whose "url" does not contain their "permalink".

    Posts are the rows whose parent_id is NaN (its string form is "nan").

    Args:
        df: DataFrame with "parent_id", "permalink" and "url" columns.

    Returns:
        float in [0, 1], or NaN when `df` contains no posts (this used to
        raise ZeroDivisionError).
    """
    posts = df[df["parent_id"].astype(str) == "nan"]
    postCount = len(posts)
    if postCount == 0:
        return float("nan")
    linksElsewhere = sum(
        1
        for permalink, url in zip(posts["permalink"], posts["url"])
        if permalink not in url
    )
    return linksElsewhere / float(postCount)

ml.py

In [None]:
def getData(df1, df2):
    """Build a shuffled two-class data set from two frames.

    Both frames go through readability.prepare first. Rows of `df1` are
    labelled "class1" and rows of `df2` "class2". An existing "text" column is
    reused; otherwise the text is computed with getTextFromRecord.

    Args:
        df1: Records of the first class.
        df2: Records of the second class.

    Returns:
        DataFrame with the columns ["text", "class"], df1's rows followed by
        df2's, renumbered 0..n-1 and then put in random order with
        np.random.permutation (the renumbered labels are kept, so `.iloc`
        sees the shuffled order).
    """
    import readability  # NOTE(review): presumably the project's own module, not the PyPI package - confirm

    labelled = []
    for frame, label in ((df1, "class1"), (df2, "class2")):
        frame = readability.prepare(frame)
        if "text" in frame.columns:
            text = frame["text"]
        else:
            text = frame.apply(getTextFromRecord, axis=1)
        # A fresh two-column frame: nothing is written into the prepared frame.
        labelled.append(pandas.DataFrame({"text": text, "class": label}))

    # ignore_index=True renumbers the rows directly; the former
    # reset_index() + `del data["index"]` raised KeyError for a named index.
    data = pandas.concat(labelled, ignore_index=True)
    data = data.reindex(np.random.permutation(data.index))
    return data


def bagOfWords(ngram_range=(2, 2)):
    """Create an unfitted word n-gram count -> multinomial naive Bayes pipeline.

    Args:
        ngram_range: (min_n, max_n) forwarded to CountVectorizer.

    Returns:
        sklearn Pipeline with the steps "count_vectorizer" and "classifier".
    """
    vectorizer = CountVectorizer(ngram_range=ngram_range, analyzer="word")
    classifier = MultinomialNB()
    steps = [("count_vectorizer", vectorizer), ("classifier", classifier)]
    return Pipeline(steps)


def evaluate(pipeline, data=None, getScore=False):
    """Cross-validate `pipeline` on `data` with 6 unshuffled folds and print a report.

    For every fold the pipeline is refitted on the training part and scored on
    the held-out part. F1, precision and recall use "class1" as the positive
    label and are averaged over the folds; the confusion matrices are summed.

    Args:
        pipeline: Estimator with fit/predict (e.g. from bagOfWords).
        data: DataFrame with "text" and "class" columns (e.g. from getData).
        getScore: When True, also return the mean F1 score.

    Returns:
        The pipeline as fitted on the last fold, or (pipeline, mean F1) when
        `getScore` is True.

    Raises:
        ValueError: If `data` is None.
    """
    if data is None:
        raise ValueError(
            "evaluate() needs `data`: a DataFrame with 'text' and 'class' columns"
        )

    # Fix the label set up front so each fold's confusion matrix has the same
    # shape even when a fold happens to contain a single class.
    labels = np.unique(data["class"].values.astype(str))
    confusion = np.zeros((len(labels), len(labels)), dtype=np.int64)

    # sklearn.model_selection.KFold takes n_splits and yields the index pairs
    # from .split(); the old KFold(n=..., n_folds=...) iterable was removed.
    k_fold = KFold(n_splits=6)
    scores = []
    precision = 0
    recall = 0
    for train_indices, test_indices in k_fold.split(data):
        train_text = data.iloc[train_indices]["text"].values
        train_y = data.iloc[train_indices]["class"].values.astype(str)

        test_text = data.iloc[test_indices]["text"].values
        test_y = data.iloc[test_indices]["class"].values.astype(str)

        pipeline.fit(train_text, train_y)
        predictions = pipeline.predict(test_text)

        confusion += confusion_matrix(test_y, predictions, labels=labels)
        scores.append(f1_score(test_y, predictions, pos_label="class1"))
        precision += precision_score(test_y, predictions, pos_label="class1")
        recall += recall_score(test_y, predictions, pos_label="class1")

    meanScore = sum(scores) / float(len(scores))
    print("Total documents classified:", len(data))
    print("Score:", meanScore)
    print("Precision:", precision / len(scores))
    print("Recall:", recall / len(scores))
    print("Confusion matrix:")
    print(confusion)
    if getScore:
        return pipeline, meanScore
    return pipeline

In [None]:
# USAGE:
# 1. Get `data` as a DataFrame with the columns ['text', 'class'] (e.g. from getData).
# 2. pipeline = bagOfWords()
# 3. pipeline = evaluate(pipeline, data=data)

binaryClassification.py

In [None]:
# AD: https://github.com/imankulov/pickle-compat is an example of a compatibility layer.
# On the command line:
# python -m pickle_compat.patch()
# pickle.load suicidewatch-sample.pickle > data.tmp
# python -m pickle.dump data.tmp > new_suicidewatch-sample.pickle -p pickle.HIGHEST_PROTOCOL
# Select the lowest common protocols from py2 to py3: https://docs.python.org/3/library/pickle.html#data-stream-format

subreddits = ["suicidewatch-sample", "depression-sample"]
subreddits.sort()

# SECURITY: pickle.load() can execute arbitrary code - only open trusted files.
with open("suicidewatch-sample.pkl", "rb") as f:
    # Load the object from the file. It is no longer bound to the name
    # `object`, which shadowed the Python builtin for the rest of the notebook.
    suicidewatch_sample = pickle.load(f)

suicidewatch_sample

In [11]:
fname = "combinations-10fold.pickle"
# `ngram_range` was never defined in this notebook; (2, 2) is bagOfWords' own default.
ngram_range = (2, 2)
combinations = list(itertools.combinations(subreddits, 2))


def _loadPosts(subreddit):
    """Load `<subreddit>.pickle`, shuffle it and keep only posts that have text."""
    # NOTE(review): the sample-loading cell above opens "suicidewatch-sample.pkl"
    # while this reads "<name>.pickle" - confirm which extension the data files
    # really use (the saved output shows FileNotFoundError for ".pickle").
    # SECURITY: read_pickle() unpickles the file - only load trusted data.
    frame = pandas.read_pickle(subreddit + ".pickle")
    frame = frame.reset_index()
    frame["text"] = frame.apply(getTextFromRecord, axis=1)
    frame = frame.reindex(np.random.permutation(frame.index))
    # keep only posts, and only those whose text survived getTextFromRecord
    frame = frame[frame["parent_id"].astype(str) == "nan"]
    return frame.dropna(subset=["text"])


for combination in combinations:
    # Re-read the results on every iteration so finished pairs are skipped.
    if os.path.isfile(fname):
        # Pickle data is binary: text mode ("r"/"w") fails on Python 3.
        with open(fname, "rb") as f:
            rs = pickle.load(f)
    else:
        rs = {}

    if combination[1] in rs.get(combination[0], {}):
        print(combination, "already exists, skipping...")
        continue

    print("doing", combination[0], "-", combination[1])
    df1 = _loadPosts(combination[0])
    df2 = _loadPosts(combination[1])

    results = []
    print("choosing min from", len(df1), len(df2))
    # Balance the classes: sample as many posts from each side as the smaller has.
    m = min(len(df1), len(df2))
    for i in range(0, 10):
        df1_min = df1.reindex(np.random.permutation(df1.index)).head(m)
        df2_min = df2.reindex(np.random.permutation(df2.index)).head(m)

        # getData is defined in this notebook; there is no `ml` module in scope.
        data = getData(df1_min, df2_min)
        print("got", len(data), "records...training")
        pipeline = bagOfWords(ngram_range=ngram_range)
        pipeline, score = evaluate(pipeline, data=data, getScore=True)
        results.append(score)

    print("RESULTS:", combination, results)
    rs.setdefault(combination[0], {})[combination[1]] = results

    with open(fname, "wb") as f:
        pickle.dump(rs, f)
    print("finished", combination)


def readResults(deviations=False, fname="combinations-10fold.pickle"):
    """Load the stored cross-validation scores and aggregate each score list.

    The pickle is expected to hold the nested dict written by the experiment
    loop: {first subreddit: {second subreddit: [scores, ...]}}.

    Args:
        deviations: When True aggregate with np.std, otherwise with np.mean.
        fname: Path of the results pickle.

    Returns:
        DataFrame with one column per outer key and one row per inner key;
        every cell holds the aggregate of its score list. Cells that are None
        are left untouched.
    """
    aggregate = np.std if deviations else np.mean
    df = pandas.DataFrame(pandas.read_pickle(fname))
    for column in df.columns:
        for index in df.index:
            scores = df.at[index, column]
            if scores is None:
                continue
            # .at writes into the frame itself; the chained form
            # df[column][index] = ... may only update a temporary copy.
            df.at[index, column] = aggregate(scores)
    return df

doing depression-sample - suicidewatch-sample


FileNotFoundError: [Errno 2] No such file or directory: 'depression-sample.pickle'