### Building a Classifier

In [1]:
from sklearn.model_selection import KFold
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from collections import Counter

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [3]:
import numpy as np

In [4]:
with open("lemmas_children.txt", 'r', encoding = "utf-8-sig") as f:
    children = f.read()
with open("lemmas_lay.txt", 'r', encoding = "utf-8-sig") as f:
    lay = f.read()
with open("lemmas_prof.txt", 'r', encoding = "utf-8-sig") as f:
    prof = f.read() 

In [12]:
child_texts = []
child_hundred = []
child_labels = []
for ind,w in enumerate(children.split()):
    if ind % 100 == 0 and ind != 0:
        child_texts.append(' '.join(child_hundred))
        child_labels.append('child')
        child_hundred = []
    else:
        child_hundred.append(w)

In [13]:
child_texts[:2000]

['me friend hang together go shop get ice cream oreo money happy mother day mum love so much miss new zealand be great be really good place be re get country ve get find house get lot poisonous spider want get bite rat be amaze t see be as easy see be be love chase bee rat be amaze t see rat be as easy a be aspire b be believe c be confidence d be dream e be enjoy f be family g be greatness h be happy be inspire j be joy k be kindness l be love',
 'be mystery n be never o be p be precious q be question r be rely s be special t be try u be unstoppable v be vacate w be worship x be x-ray y be yolo z be zone drink-sucker enviromental-dislike pointless-plastic ocean-destroyer hi m piece paper write draw turn aeroplane be not too old play dress be not too old have teddy be not too old believe magic be not too old play fairy be not too old like unicorn be not too old chase rainbow be not too old sometimes get discourage m so small always',
 'fingerprint furniture wall hi be chair sit down re

In [14]:
lay_texts = []
lay_hundred = []
lay_labels = []
for ind,w in enumerate(lay.split()):
    if ind % 100 == 0 and ind != 0:
        lay_texts.append(' '.join(lay_hundred))
        lay_labels.append('lay')
        lay_hundred = []
    else:
        lay_hundred.append(w)

In [15]:
prof_texts = []
prof_hundred = []
prof_labels = []
for ind,w in enumerate(prof.split()):
    if ind % 100 == 0 and ind != 0:
        prof_texts.append(' '.join(prof_hundred))
        prof_labels.append('prof')
        prof_hundred = []
    else:
        prof_hundred.append(w)

In [16]:
all_labels = child_labels + lay_labels + prof_labels
all_texts = child_texts + lay_texts + prof_texts
all_texts = np.asarray(all_texts)
all_labels = np.asarray(all_labels)

In [10]:
kf = KFold(n_splits=3, shuffle = True)
kf.get_n_splits(all_texts)
print(kf)

KFold(n_splits=3, random_state=None, shuffle=True)


In [20]:
for train_index, test_index in kf.split(all_texts):
    all_texts_train, all_texts_test = all_texts[train_index], all_texts[test_index]
    all_labels_train, all_labels_test = all_labels[train_index], all_labels[test_index]
    vect = TfidfVectorizer(min_df=8, max_df=0.6, stop_words = 'english')
    tfidf_train = vect.fit_transform(all_texts_train)
    tfidf_test = vect.transform(all_texts_test)
    classifier = MultinomialNB()
    classifier.fit(tfidf_train, all_labels_train)
    predicted_labels = classifier.predict(tfidf_test)
    classif_report = classification_report(all_labels_test, predicted_labels)
    print(classif_report)



             precision    recall  f1-score   support

      child       0.90      0.91      0.91       138
        lay       0.86      0.80      0.83       124
       prof       0.83      0.87      0.85       127

avg / total       0.86      0.86      0.86       389

             precision    recall  f1-score   support

      child       0.91      0.93      0.92       148
        lay       0.82      0.78      0.80       106
       prof       0.82      0.84      0.83       135

avg / total       0.86      0.86      0.86       389

             precision    recall  f1-score   support

      child       0.92      0.96      0.94       136
        lay       0.85      0.82      0.84       114
       prof       0.87      0.86      0.86       139

avg / total       0.88      0.88      0.88       389

