# Who Said the Line in Simpson's TV Series

In [1]:
import numpy as np
import pandas as pd
from time import time

from sklearn import metrics
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.extmath import density

import matplotlib.pyplot as plt

# Import TC algorithms
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
script_df = pd.read_csv('simpsons_script_lines.csv', error_bad_lines=False)
char_df = pd.read_csv('simpsons_characters.csv')

b'Skipping line 8084: expected 13 fields, saw 20\nSkipping line 52607: expected 13 fields, saw 21\nSkipping line 59910: expected 13 fields, saw 21\n'
b'Skipping line 71801: expected 13 fields, saw 20\nSkipping line 73539: expected 13 fields, saw 21\nSkipping line 77230: expected 13 fields, saw 21\nSkipping line 78953: expected 13 fields, saw 21\nSkipping line 81138: expected 13 fields, saw 20\nSkipping line 86746: expected 13 fields, saw 22\nSkipping line 101154: expected 13 fields, saw 21\nSkipping line 115438: expected 13 fields, saw 20\nSkipping line 117573: expected 13 fields, saw 22\nSkipping line 130610: expected 13 fields, saw 22\n'
b'Skipping line 152970: expected 13 fields, saw 22\nSkipping line 153017: expected 13 fields, saw 20\nSkipping line 153018: expected 13 fields, saw 30\nSkipping line 154080: expected 13 fields, saw 20\nSkipping line 154082: expected 13 fields, saw 20\nSkipping line 154084: expected 13 fields, saw 20\nSkipping line 154086: expected 13 fields, saw 20\n

# 1. Simple Data Preprocessing

In [3]:
script_df.head()

Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31.0
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3.0
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22.0
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5.0
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33.0


In [4]:
char_df.head()

Unnamed: 0,id,name,normalized_name,gender
0,7,Children,children,
1,12,Mechanical Santa,mechanical santa,
2,13,Tattoo Man,tattoo man,
3,16,DOCTOR ZITSOFSKY,doctor zitsofsky,
4,20,Students,students,


### We just need dialogue and character to do text classification

In [3]:
dial_df = (
    script_df.dropna()
    .loc[script_df['speaking_line'] == True]
    [['character_id', 'normalized_text']]
)
dial_df.head()

Unnamed: 0,character_id,normalized_text
0,464,no actually it was a little of both sometimes ...
1,9,wheres mr bergstrom
2,464,i dont know although id sure like to talk to h...
3,9,that life is worth living
4,40,the polls will be open from now until the end ...


### We wanna know the top 14 characters who speaked the most among all the series

In [4]:
count = dial_df['character_id'].value_counts()
count = count.to_frame()
count = count.reset_index()
count.rename(columns={'index':'id', 'character_id':'occurrence'}, inplace=True)
#count['id'] = count['id'].apply(int)
count.head(14)

Unnamed: 0,id,occurrence
0,2.0,23011
1,1.0,10750
2,8.0,10591
3,9.0,9078
4,15.0,2498
5,17.0,2342
6,3.0,2044
7,11.0,1793
8,71.0,1622
9,25.0,1480


### Given the character list in the DS & ML class, we create our own one to compare with it.
Here is the list below:
1. Homer Simpson
2. Marge Simpson
3. Bart Simpson
4. Lisa Simpson
5. C. Montgomery Burns
6. Moe Szyslak
7. Seymour Skinner
8. Ned Flanders
9. Grampa Simpson
10. Chief Wiggum
11. Milhouse Van Houten
12. Krusty the Clown
13. Nelson Muntz
14. Lenny Leonard

In [5]:
result = pd.merge(char_df[['id', 'name']], count, on='id')
result = result.sort_values('occurrence', ascending=False)
result.head(14)

Unnamed: 0,id,name,occurrence
5300,2,Homer Simpson,23011
27,1,Marge Simpson,10750
28,8,Bart Simpson,10591
29,9,Lisa Simpson,9078
5301,15,C. Montgomery Burns,2498
30,17,Moe Szyslak,2342
31,3,Seymour Skinner,2044
32,11,Ned Flanders,1793
34,71,Chief Wiggum,1622
35,25,Milhouse Van Houten,1480


In [24]:
def corpus_creator(cid):
    line = '' 
    for i in dial_df['normalized_text'][dial_df['character_id']==cid]:
        line = line + i
    return line

corpus_df = pd.DataFrame()
corpus_df['character_id'] = list(dial_df['character_id'].value_counts().index)

temp = []
for i in corpus_df['character_id']:
    temp.append(corpus_creator(i))

corpus_df['normalized_text'] = temp

corpus_df

Unnamed: 0,character_id,normalized_text
0,2.0,never thrown a party what about that big bash ...
1,1.0,lisa tell your fatherhomer you are not allowed...
2,8.0,victory party under the slidehey thanks for yo...
3,9.0,wheres mr bergstromthat life is worth livingmr...
4,15.0,must turn over got to greet dignitariesabsolut...
5,17.0,college boymoes tavern where the elite meet to...
6,3.0,dont worry bart well find something fun for yo...
7,11.0,hey anybody mind if i serve as bartender you k...
8,71.0,hellowell its about time somebody reach out to...
9,25.0,uh ohwhat about you bart didnt you votebarts j...


# 2. Feature Extraction

## 2.1. Select first 4 characters
> choose either 4 or 14 characters

In [None]:
dial_df = dial_df.loc[dial_df['character_id'].isin(count['id'][:4])]
target_names = result['name'][:4]
dial_df

## 2.2. Select first 14 characters

In [6]:
# Subset first 14 characters from the original script_df
dial_df = dial_df.loc[dial_df['character_id'].isin(count['id'][:14])]
target_names = result['name'][:14]
dial_df.head()

Unnamed: 0,character_id,normalized_text
1,9,wheres mr bergstrom
3,9,that life is worth living
7,8,victory party under the slide
9,9,mr bergstrom mr bergstrom
11,9,do you know where i could find him


### Convert raw text to TF-IDF vecter space

In [7]:
X = dial_df["normalized_text"]
vectorizer = TfidfVectorizer(ngram_range=(1, 2)).fit(X)
print(len(vectorizer.vocabulary_))
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)
y = dial_df['character_id'].astype(int)
X = vectorizer.transform(X)

254873


In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

## 3. Define Benchmark function and training process

In [9]:
def benchmark(clf):
    print('_' * 80)
    print("Training: ")
    print(clf)
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)

    t0 = time()
    pred = clf.predict(X_test)
    test_time = time() - t0
    print("test time:  %0.3fs" % test_time)
    print('_' * 80)
    
    if hasattr(clf, 'coef_'):
        print("dimensionality: %d" % clf.coef_.shape[1])
        print("density: %f" % density(clf.coef_))

        if feature_names is not None:
            print("top 10 keywords per class:")
            for i, label in enumerate(target_names):
                top10 = np.argsort(clf.coef_[i])[-10:]
                print("%s: %s" % (label, " ".join(feature_names[top10])))
        print()
    
    print('_' * 80)
    print(clf)
    print("classification report:")
    score = metrics.accuracy_score(y_test, pred)
    print("accuracy:   %0.3f" % score)
    print(metrics.classification_report(y_test, pred, target_names=target_names))

### Direct Prediction, too bad

In [25]:
benchmark(LogisticRegression())

________________________________________________________________________________
Training: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
train time: 6.523s
test time:  0.009s
________________________________________________________________________________
dimensionality: 101949
density: 1.000000
top 10 keywords per class:
Homer Simpson: deflate your de corn cheese cubes doodletown either sign imagination can dokes you chosen you charlie bluhdorn checkin
Marge Simpson: hug me crony answers answers globe cheese cubes important of booze ill gateway game cheap they deed goes
Bart Simpson: as witch glass into be female go thank friend flicka at bed for crime hostile do de believe every
Lisa Simpson: as christian civility frito lays bart cheated death get back pains cowell wo

In [60]:
benchmark(SGDClassifier())

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)




train time: 0.911s
test time:  0.026s
________________________________________________________________________________
dimensionality: 218705
density: 0.215976
top 10 keywords per class:
Homer Simpson: homer did shoo homer stop oh homie artie homer dont oh homer husband homer homie
Marge Simpson: yello moe sweetie lenny stupid woo honey flanders son marge
Bart Simpson: good lord adeleine nam johnny detention students edna chalmers superintendent chalmers superintendent
Lisa Simpson: bart simpson aye carumba ay carumba hey lis im bart hey dad cool carumba milhouse lis
C. Montgomery Burns: bart thats dad dad did oh dad malibu dad im da ad dad thats yayyy mom
Moe Szyslak: okily okily dokily doodle hidilly howdilly doodily neighbor maude diddily diddly
Seymour Skinner: youre fired ahoy humbug hounds ahoy hoy bobo hoy huzzah excellent smithers
Ned Flanders: hey ya harv matter homer barn oh hey tab tavern sorry homer midge aint
Chief Wiggum: glasses ravenous playmates playdude playmates umpi

### Feature Selection Technique[optional]

In [90]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [12]:
ch2 = SelectPercentile(chi2, percentile=40)
X_train = ch2.fit_transform(X_train, y_train)
X_test = ch2.transform(X_test)

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [35]:
benchmark(LogisticRegression())

________________________________________________________________________________
Training: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
train time: 5.296s
test time:  0.010s
________________________________________________________________________________
dimensionality: 27557
density: 0.844831
top 10 keywords per class:
Homer Simpson: sweetie kids dear ned fathers honey artie husband homer homie
Marge Simpson: suckers wife mmmm yello lousy stupid flanders son woo marge
Bart Simpson: simpson detention student children school willie superintendent students mother edna
Lisa Simpson: awesome man dad bike carumba cool sister krusty milhouse lis
C. Montgomery Burns: pony snowball ad yayyy dads bart buddhist malibu dad mom
Moe Szyslak: folks christian boys ho diddly lord reve

### NLTK preprocessing[optional]

In [102]:
import nltk
from nltk.stem import SnowballStemmer

nltk.download('stopwords')

stemmer = SnowballStemmer(language='english', ignore_stopwords=True)
dial_nltk_df = dial_df.copy()  # prevent objects changing from each other
dial_nltk_df['normalized_text'] = dial_df['normalized_text'].apply(stemmer.stem)
dial_nltk_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\h164654156465\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,character_id,normalized_text
1,9,wheres mr bergstrom
3,9,that life is worth liv
7,8,victory party under the slid
9,9,mr bergstrom mr bergstrom
11,9,do you know where i could find him


In [103]:
y = dial_nltk_df['character_id'].astype(int)
X = dial_nltk_df['normalized_text']
X.shape

(70062,)

In [104]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 2)).fit(X)
print(len(vectorizer.vocabulary_))
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)
X = vectorizer.transform(X)

224616


In [96]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [56]:
benchmark(SGDClassifier())

________________________________________________________________________________
Training: 
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)




train time: 0.300s
test time:  0.010s
________________________________________________________________________________
dimensionality: 167096
density: 0.568823
top 10 keywords per class:
Homer Simpson: marge simpson hom homer dont oh homie oh dear artie husband homi homer homie
Marge Simpson: honey wife moe eh stupid flanders boy marg son marge
Bart Simpson: radioactive man im bart bart simpson carumba awesom cool whoa milhous milhouse lis
Lisa Simpson: dad think buddhist dad did da ad oh dad yay dad mom yayyy dad thats

________________________________________________________________________________
SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
classification report:
accuracy:   0.509
               precision   

In [55]:
benchmark(LogisticRegression())

________________________________________________________________________________
Training: 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
train time: 2.607s
test time:  0.004s
________________________________________________________________________________
dimensionality: 167096
density: 0.743985
top 10 keywords per class:
Homer Simpson: house hom ned father going marge simpson husband homi homer homie
Marge Simpson: boy moe stupid woo wife flanders homer simpson marg son marge
Bart Simpson: man carumba im bart milhous cool krusty whoa bart simpson milhouse lis
Lisa Simpson: dad im buddhist bart dad dont mr flanders im lisa yayyy dad thats dad mom

________________________________________________________________________________
LogisticRegression(C=1.0, class_weight=None

### Feel so bad, it's a disaster. 
But wait, we have this ->
- https://www.kaggle.com/ambarish/fun-in-text-mining-with-simpsons
- https://www.kaggle.com/thebrownviking20/who-said-this-line-eda-classification-keras-ann
- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

### Try XGBoost

In [17]:
xgb = XGBClassifier(objective="multi:softmax", tree_method="gpu_exact", num_class=14, max_depth=5, predictor="cpu_predictor")
#random_search = RandomizedSearchCV(xgb, param_distributions=params)

In [19]:
print(xgb)
t0 = time()
xgb.fit(X_train, y_train)
train_time = time() - t0
print("train time: %0.3fs" % train_time)

train time: 590.449s


In [20]:
print(xgb)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=5, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, num_class=14, objective='multi:softprob',
       predictor='cpu_predictor', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
       subsample=1, tree_method='gpu_exact')


In [23]:
pred = xgb.predict(X_test)
print("classification report:")
score = metrics.accuracy_score(y_test, pred)
print("accuracy:   %0.3f" % score)
print(metrics.classification_report(y_test, pred, target_names=target_names))

classification report:
accuracy:   0.400
                     precision    recall  f1-score   support

      Homer Simpson       0.63      0.18      0.28      3201
      Marge Simpson       0.37      0.95      0.53      6917
       Bart Simpson       0.56      0.09      0.15       606
       Lisa Simpson       0.58      0.12      0.20      3181
C. Montgomery Burns       0.49      0.17      0.26      2706
        Moe Szyslak       0.67      0.08      0.14       511
    Seymour Skinner       0.77      0.15      0.25       778
       Ned Flanders       0.50      0.07      0.12       728
       Chief Wiggum       0.53      0.02      0.04       452
Milhouse Van Houten       0.50      0.00      0.01       428
   Krusty the Clown       0.66      0.10      0.18       441
     Grampa Simpson       0.89      0.10      0.17       321
       Nelson Muntz       0.50      0.04      0.07       456
      Lenny Leonard       0.44      0.05      0.09       293

        avg / total       0.51      0.40  

  if diff:


### Try simple neural network
Check if we have gpu support

In [None]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

In [7]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [8]:
dial_df.head()

Unnamed: 0,character_id,normalized_text
1,9,wheres mr bergstrom
3,9,that life is worth living
7,8,victory party under the slide
9,9,mr bergstrom mr bergstrom
11,9,do you know where i could find him


In [91]:
y = dial_df['character_id'].astype(int)
X = dial_df['normalized_text']

In [92]:
vectorizer = TfidfVectorizer().fit(X)
print(len(vectorizer.vocabulary_))
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)
X = vectorizer.transform(X)

27557


In [93]:
# Encoding categorical data using label encoding and one-hot encoding 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)
y = y.reshape(-1, 1)

onehotencoder = OneHotEncoder(categorical_features = [0])
y = onehotencoder.fit_transform(y).toarray()
y.shape

(70062, 14)

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [95]:
max_words = 27557
num_classes = 14

model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(128, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [97]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [98]:
model.fit(X_train, y_train, epochs=10, batch_size=1024)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1d76600d2b0>

In [102]:
pred = model.predict_classes(X_test)

### Convert features back to each class

In [104]:
y_test = np.argmax(y_test, axis=1)
y_test

array([1, 1, 4, ..., 3, 1, 7], dtype=int64)

In [105]:
print("classification report:")
print("accuracy:   %0.3f" % metrics.accuracy_score(y_test, pred))
print(metrics.classification_report(y_test, pred, target_names=target_names))

classification report:
accuracy:   0.349
                     precision    recall  f1-score   support

      Homer Simpson       0.36      0.38      0.37      3200
      Marge Simpson       0.47      0.54      0.50      7009
       Bart Simpson       0.11      0.17      0.13       633
       Lisa Simpson       0.29      0.30      0.30      3136
C. Montgomery Burns       0.31      0.29      0.30      2637
        Moe Szyslak       0.23      0.21      0.22       543
    Seymour Skinner       0.34      0.27      0.30       734
       Ned Flanders       0.19      0.17      0.18       720
       Chief Wiggum       0.07      0.02      0.03       477
Milhouse Van Houten       0.07      0.03      0.04       428
   Krusty the Clown       0.16      0.07      0.10       494
     Grampa Simpson       0.23      0.15      0.18       284
       Nelson Muntz       0.06      0.07      0.06       449
      Lenny Leonard       0.00      0.00      0.00       275

        avg / total       0.33      0.35  

  'precision', 'predicted', average, warn_for)


### Dealing with imbalanced data

In [147]:
import nltk
from nltk.stem import SnowballStemmer

nltk.download('stopwords')

stemmer = SnowballStemmer(language='english', ignore_stopwords=True)
dial_nltk_df = dial_df.copy()  # prevent objects changing from each other
dial_nltk_df['normalized_text'] = dial_df['normalized_text'].apply(stemmer.stem)
dial_nltk_df.head()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\h164654156465\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,character_id,normalized_text
1,9,wheres mr bergstrom
3,9,that life is worth liv
7,8,victory party under the slid
9,9,mr bergstrom mr bergstrom
11,9,do you know where i could find him


In [148]:
y = dial_nltk_df['character_id'].astype(int)
X = dial_nltk_df['normalized_text']
X.shape

(70062,)

In [149]:
vectorizer = TfidfVectorizer(stop_words='english', min_df=2).fit(X)
print(len(vectorizer.vocabulary_))
feature_names = vectorizer.get_feature_names()
if feature_names:
    feature_names = np.asarray(feature_names)
X = vectorizer.transform(X)

15046


In [150]:
class_weights = class_weight.compute_class_weight('balanced', np.unique(y), y)
class_weights

array([0.46552824, 0.21747984, 2.44835057, 0.4725171 , 0.55126995,
       2.79109234, 2.00337413, 2.13681835, 3.38137066, 3.54170458,
       3.08534437, 4.89670115, 3.38594626, 5.32386018])

In [151]:
# Encoding categorical data using label encoding and one-hot encoding 
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

labelencoder_X_1 = LabelEncoder()
y = labelencoder_X_1.fit_transform(y)
y = y.reshape(-1, 1)

onehotencoder = OneHotEncoder(categorical_features = [0])
y = onehotencoder.fit_transform(y).toarray()
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [153]:
max_words = 15046
num_classes = 14

model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(256, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.6))
model.add(Dense(256, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.3))
model.add(Dense(128, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(128, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.4))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

In [154]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [155]:
model.fit(X_train, y_train, epochs=20, batch_size=1024, class_weight=class_weights)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1d76cbf6d68>

In [156]:
pred = model.predict_classes(X_test)

In [157]:
y_test = np.argmax(y_test, axis=1)
y_test

array([5, 4, 0, ..., 4, 4, 0], dtype=int64)

In [158]:
print("classification report:")
print("accuracy:   %0.3f" % metrics.accuracy_score(y_test, pred))
print(metrics.classification_report(y_test, pred, target_names=target_names))

classification report:
accuracy:   0.311
                     precision    recall  f1-score   support

      Homer Simpson       0.35      0.36      0.35      3268
      Marge Simpson       0.45      0.51      0.48      6871
       Bart Simpson       0.04      0.03      0.03       611
       Lisa Simpson       0.30      0.24      0.27      3163
C. Montgomery Burns       0.29      0.27      0.28      2762
        Moe Szyslak       0.22      0.00      0.01       545
    Seymour Skinner       0.08      0.41      0.13       769
       Ned Flanders       0.04      0.02      0.02       664
       Chief Wiggum       0.00      0.00      0.00       453
Milhouse Van Houten       0.00      0.00      0.00       417
   Krusty the Clown       0.00      0.00      0.00       501
     Grampa Simpson       0.00      0.00      0.00       291
       Nelson Muntz       0.00      0.00      0.00       404
      Lenny Leonard       0.00      0.00      0.00       300

        avg / total       0.30      0.31  

  'precision', 'predicted', average, warn_for)
