In [1]:
import re
import sys
import string
import numpy as np
import pandas as pd
import seaborn as sns
from itertools import cycle
from nltk import word_tokenize
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [42]:
path = 'hp-with-text.csv'

data = pd.read_csv(path, sep='\t')
data.head()

Unnamed: 0,name,house,text
0,Tom Riddle,Slytherin,"Voldemort was considered by many to be ""the mo..."
1,Harry Potter,Gryffindor,"Harry was an extremely brave, loyal, and selfl..."
2,Dolores Umbridge,Slytherin,Dolores Umbridge was nothing short of a sociop...
3,Horace Slughorn,Slytherin,Horace Slughorn was described as having a bumb...
4,Albus Dumbledore,Gryffindor,Considered to be the most powerful wizard of h...


In [43]:
data['text'][0]

'Voldemort was considered by many to be "the most evil wizard in hundreds and hundreds of years". His nature far exceeded that of any common evil doer and Dumbledore stated that he "went beyond normal evil" in the extent of his crimes. Hagrid claimed that while all Dark Wizards "go bad", Voldemort went "worse than worse". Indeed, Voldemort speedily developed into a power-obsessed megalomaniac of the worst kind and the worst of any known Dark Wizard, even Gellert Grindelwald, his predecessor. He was highly intelligent, as evidenced by his top performance at Hogwarts and his tremendous magical achievements, but his interests were narrowly focused on the usefulness of people, objects, and powers to his goals. His inability to see the larger picture and inattention to events, powers and human traits that were not immediately useful to him was a serious flaw that led to most of his setbacks and ultimately his downfall. After his first defeat, many (namely Hagrid, Dumbledore, Neville\'s gran

In [44]:
print(data[['name', 'house']].groupby('house').describe())

                                   name
house                                  
Gryffindor count                     42
           unique                    42
           top           Lily J. Potter
           freq                       1
Hufflepuff count                     11
           unique                    11
           top           Cedric Diggory
           freq                       1
Ravenclaw  count                     15
           unique                    15
           top        Anthony Goldstein
           freq                       1
Slytherin  count                     27
           unique                    27
           top     Avery (Marauder-era)
           freq                       1


Ага, классы несбалансированы, тогда возьмём классификатор с class probability (напр. байес).

И надо посчитать эту probability

In [59]:
houses = sorted(set(data['house']))
probs = []

for h in houses:
    pr = len(data.loc[data['house']==h])/len(data)
    probs.append(round(pr, 3))

probs

[0.437, 0.113, 0.155, 0.296]

Теперь надо искать фичи.

Курс на сейчас: искать сущ/прил после X was a(n), сделать матрицу, обучить байеса

In [46]:
# делим на трэйн-тест, дальше только с трэйн
data, test = train_test_split(data)

In [48]:
chars = []
# регулярка для поиска характеристик:
#                             -обороты        длинные перечисления до конца предл  характеристики
charfinder = re.compile(' was(?:, [\\w ]+,)? ((?:(?:(?:\\w+, )+and [\\w\-]+\.)|(?:"?\\w+"?,? ?){,5}))')
# выражения типа in his idealistic nature
naturefinder = re.compile(' (h(?:is|er) (?:\\w+,? ?){,5} nature)')
print('start!')
for txt in data['text']:
    char = re.findall(charfinder, txt)
    nature = re.findall(naturefinder, txt)
    chars.append(char+nature)

chars[:3]

start!


[['outgoing, talkative, cheerful, and friendly.',
  'also among the first to ',
  'the Heir of Slytherin ',
  'a Parselmouth'],
 ['enthusiastic, bubbly, and easily able ',
  'eager and would often want ',
  'unhappy ',
  'rejected by Remus Lupin',
  'not good at household spells ',
  'also notoriously clumsy, smashing a ',
  'not only an excellent witch, ',
  'brave, dependable and selfless, having '],
 ['forced to repeat his final ']]

Нуу нормально.

Теперь можно убрать то, что мы знаем, что интенсификаторы и стоп-слова

In [75]:
intense = 'extremely highly very more phenomenally tremendously exceptionally especially presumably also \
somewhat mostly easily one many people often only able someone'.split()

stop = set(stopwords.words('english'))
stop = stop.difference(set(['no', 'not']))

# stops = re.compile('(?:\\W|^)(?:({}))\\W'.format(')|('.join(list(stop)+intense)))
stops = list(stop)+intense

chars = pd.Series(chars)
clean_chars = chars.map(lambda x: list(map(lambda y: ' '.join([i for i in y.lower().split() if i.strip() not in stops]), x)))
clean_chars[7:12]

7     [considered reckless people,, fountain fair, r...
8     [emotional fellow, seemed, taken chamber, show...
9     [described, tolerant accepting, forward thinki...
10               [prejudiced muggles, muggle, arrogant]
11    [convinced ernie, animagus, belief, flustered ...
dtype: object

Ну почему бы и нет. Можно поробовать так. Теперь сплит на всё что здесь есть, в словарь векторайзера, вот это всё

In [76]:
charwds = []
for ch in chars:
    charwds += list(map(lambda x: x.strip(string.punctuation), ' '.join(ch).split()))

print(len(charwds))
charwds = list(set(charwds))
print(len(charwds))

2882
984


In [77]:
# простая векторизация всего текста
cv = CountVectorizer()
clf = MultinomialNB()
X_train, y_train = cv.fit_transform(data['text']), data['house']
clf.fit(X_train, y_train)
X_test, y_test = cv.transform(test['text']), test['house']
praed = clf.predict(X_test)
print(classification_report(y_test, praed))

             precision    recall  f1-score   support

 Gryffindor       0.61      1.00      0.76        11
 Hufflepuff       0.00      0.00      0.00         3
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       0.83      0.83      0.83         6

avg / total       0.49      0.67      0.56        24



  'precision', 'predicted', average, warn_for)


In [78]:
# векторизация всего текста + вес классов
cv = CountVectorizer()
clf = MultinomialNB(class_prior=probs)
X_train, y_train = cv.fit_transform(data['text']), data['house']
clf.fit(X_train, y_train)
X_test, y_test = cv.transform(test['text']), test['house']
praed = clf.predict(X_test)
print(classification_report(y_test, praed))

             precision    recall  f1-score   support

 Gryffindor       0.61      1.00      0.76        11
 Hufflepuff       0.00      0.00      0.00         3
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       0.83      0.83      0.83         6

avg / total       0.49      0.67      0.56        24



  'precision', 'predicted', average, warn_for)


In [82]:
# векторизация по избранным словам + вес
cv = CountVectorizer(vocabulary=charwds)
clf = MultinomialNB(class_prior=probs)
X_train, y_train = cv.fit_transform(data['text']), data['house']
clf.fit(X_train, y_train)
X_test, y_test = cv.transform(test['text']), test['house']
praed = clf.predict(X_test)
print(classification_report(y_test, praed))
print(confusion_matrix(y_test, praed))

             precision    recall  f1-score   support

 Gryffindor       0.50      0.64      0.56        11
 Hufflepuff       0.00      0.00      0.00         3
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       0.40      0.67      0.50         6

avg / total       0.33      0.46      0.38        24

[[7 0 0 4]
 [1 0 0 2]
 [4 0 0 0]
 [2 0 0 4]]


  'precision', 'predicted', average, warn_for)


Не очень, конечно (и class_prior ничего не делает). Обидно.

In [81]:
cv = CountVectorizer(vocabulary=charwds)
forest = RandomForestClassifier(random_state=34)
params = {'n_estimators': range(10, 101, 10)}
clf = GridSearchCV(forest, params)
X_train, y_train = cv.fit_transform(data['text']), data['house']
clf.fit(X_train, y_train)
X_test, y_test = cv.transform(test['text']), test['house']
print('Best param set: {}'.format(clf.best_params_))
praed = clf.predict(X_test)
print(classification_report(y_test, praed))
print(confusion_matrix(y_test, praed))

Best param set: {'n_estimators': 70}
             precision    recall  f1-score   support

 Gryffindor       0.50      1.00      0.67        11
 Hufflepuff       0.00      0.00      0.00         3
  Ravenclaw       0.00      0.00      0.00         4
  Slytherin       1.00      0.33      0.50         6

avg / total       0.48      0.54      0.43        24

[[11  0  0  0]
 [ 3  0  0  0]
 [ 4  0  0  0]
 [ 4  0  0  2]]


  'precision', 'predicted', average, warn_for)


In [83]:
cv = CountVectorizer(vocabulary=charwds)
forest = DecisionTreeClassifier(random_state=34)
params = {'max_depth': range(10, 101, 10)}
clf = GridSearchCV(forest, params)
X_train, y_train = cv.fit_transform(data['text']), data['house']
clf.fit(X_train, y_train)
X_test, y_test = cv.transform(test['text']), test['house']
print('Best param set: {}'.format(clf.best_params_))
praed = clf.predict(X_test)
print(classification_report(y_test, praed))
print(confusion_matrix(y_test, praed))

Best param set: {'max_depth': 10}
             precision    recall  f1-score   support

 Gryffindor       0.47      0.73      0.57        11
 Hufflepuff       0.00      0.00      0.00         3
  Ravenclaw       0.50      0.50      0.50         4
  Slytherin       1.00      0.17      0.29         6

avg / total       0.55      0.46      0.42        24

[[8 1 2 0]
 [3 0 0 0]
 [2 0 2 0]
 [4 1 0 1]]


В целом от леса можно добиться более высокой ф-меры, и гриффиндорцев он определяет отлично (правда, и всех остальных тоже определяет как гриффиндорцев). Дерево тоже где-то рядом, и оно даже справляется с другими факультетами, ура! Но это всё равно меньше, чем даёт векторизация всего текста =/

In [84]:
import gensim

In [None]:
quasif = data['text']

In [88]:
wvec = gensim.models.word2vec.LineSentence(data['text'])

In [87]:
model = gensim.models.Word2Vec(data, size=500, window=10, min_count=2, sg=0)