### I. Necessary imports and functions to be used, Loading the data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
import sklearn.model_selection
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

import nltk
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')


from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D

#word2vec
# from gensim.models import Word2Vec  
# import gensim
# import string

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
# Directory prefix prepended to the CSV file names when loading the data.
# Left empty so files are read from the working directory; adjust for local use.
PATH = ""

Loading Data

In [4]:
# Load the sentence-by-sentence dialogue of each series.
simpsons = pd.read_csv(PATH + 'simpson_family_sentence_by_sentence.csv')
south_park = pd.read_csv(PATH + 'south_park_sentence_by_sentence.csv')

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and, like append, keeps each frame's
# original row labels.
merged_series = pd.concat([simpsons, south_park])

# Number of lines per character across both series.
merged_series.groupby('Character').count()

Unnamed: 0_level_0,Line
Character,Unnamed: 1_level_1
Bart Simpson,19486
Cartman,20593
Homer Simpson,46333
Kyle,11094
Lisa Simpson,15873
Marge Simpson,19475
Stan,11561


### Pre-processing steps ###

Define functions that apply stemming, lemmatization, and stop-word removal to the data set

In [5]:
# The English stop-word list is a separate NLTK corpus from WordNet. Fetch it if
# it is missing so stopwords.words() does not raise LookupError on a fresh
# environment; the call is a no-op when the corpus is already installed.
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('english'))

# Shared stemmer / lemmatizer instances used by the helper functions below.
ps = PorterStemmer()
lem = WordNetLemmatizer()

def remove_stop_words(line):
    """Lower-case ``line`` and drop every token that appears in ``stop_words``.

    Tokens are obtained by splitting on whitespace. Each surviving token is
    emitted with a single leading space, so a non-empty result starts with a
    space and an input with no surviving tokens yields the empty string.
    """
    kept_tokens = [token for token in line.lower().split() if token not in stop_words]
    return ''.join(" " + token for token in kept_tokens)

def stemm_words(line):
    """Return ``line`` with every whitespace-separated token passed through ``ps.stem``.

    Each stemmed token is emitted with a single leading space, so a non-empty
    result starts with a space; an empty or whitespace-only input yields ''.
    """
    return ''.join(" " + ps.stem(token) for token in line.split())

def lemmatize_words(line):
    """Return ``line`` with every whitespace-separated token passed through ``lem.lemmatize``.

    Each lemmatized token is emitted with a single leading space, so a
    non-empty result starts with a space; an empty or whitespace-only input
    yields ''.
    """
    return ''.join(' ' + lem.lemmatize(token) for token in line.split())


Work on copies of the original data sets so that the originals remain available as a reference

In [6]:
# Independent copies of the loaded frames: every pre-processing step operates
# on these, leaving south_park, simpsons and merged_series unmodified.
south_park_processed, simpsons_processed, merged_processed = (
    frame.copy() for frame in (south_park, simpsons, merged_series)
)

Applying all pre-processing steps to the newly copied data sets

In [7]:
# Run the same three-step text pipeline on the Line column of every copied frame.
# Order matters: stop words are removed (and text lower-cased) first, then the
# remaining tokens are stemmed, and finally lemmatized.
preprocessing_steps = (remove_stop_words, stemm_words, lemmatize_words)

for frame in (south_park_processed, simpsons_processed, merged_processed):
    for step in preprocessing_steps:
        frame.Line = frame.Line.apply(step)


### Simple models for classification ###

Splitting data for training and test sets for each data set

In [8]:
# Pre-processed text (features) for each corpus.
lines_south = south_park_processed['Line'].values
lines_simpsons = simpsons_processed['Line'].values
lines_mer = merged_processed['Line'].values

# Speaking character (target label) for each corpus.
y_sp = south_park_processed['Character'].values
y_sim = simpsons_processed['Character'].values
y_mer = merged_processed['Character'].values

# Identical split settings for all three corpora: 25% held out, fixed seed.
# Each split is stratified on its own labels to keep the class proportions.
split_options = {'test_size': 0.25, 'random_state': 21}

lines_train_sp, lines_test_sp, y_train_sp, y_test_sp = train_test_split(
    lines_south, y_sp, stratify=y_sp, **split_options
)
lines_train_sim, lines_test_sim, y_train_sim, y_test_sim = train_test_split(
    lines_simpsons, y_sim, stratify=y_sim, **split_options
)
lines_train_mer, lines_test_mer, y_train_mer, y_test_mer = train_test_split(
    lines_mer, y_mer, stratify=y_mer, **split_options
)

# Bag-of-words features. Every vectorizer learns its vocabulary from the
# training lines only and is then applied unchanged to the test lines.
vectorizer_sp = CountVectorizer()
X_train_sp = vectorizer_sp.fit_transform(lines_train_sp)
X_test_sp = vectorizer_sp.transform(lines_test_sp)

vectorizer_sim = CountVectorizer()
X_train_sim = vectorizer_sim.fit_transform(lines_train_sim)
X_test_sim = vectorizer_sim.transform(lines_test_sim)

vectorizer_mer = CountVectorizer()
X_train_mer = vectorizer_mer.fit_transform(lines_train_mer)
X_test_mer = vectorizer_mer.transform(lines_test_mer)

Defining simple models.

Logistic Regression on the unbalanced data set (the classifier uses `class_weight = 'balanced'` to compensate for the class imbalance).

In [9]:
# One-vs-rest logistic regression with class weights set to 'balanced'.
clf = LogisticRegression(solver='lbfgs', max_iter = 1000,class_weight = 'balanced', multi_class ='ovr')

# (title, training features, training labels, test features, test labels)
# The same estimator object is refitted from scratch on each data set in turn,
# so after the loop `clf` and `predicted` refer to the merged-series run.
logreg_experiments = [
    ('South Park 3-class classifier', X_train_sp, y_train_sp, X_test_sp, y_test_sp),
    ('Simpsons 4-class classifier', X_train_sim, y_train_sim, X_test_sim, y_test_sim),
    ('Both Series characters', X_train_mer, y_train_mer, X_test_mer, y_test_mer),
]

for title, X_fit, y_fit, X_eval, y_eval in logreg_experiments:
    clf.fit(X_fit, y_fit)
    predicted = clf.predict(X_eval)
    print(title)
    # metrics.confusion_matrix(y_eval, predicted) can be printed here as well.
    print(metrics.classification_report(y_eval, predicted))

South Park 3-class classifier
              precision    recall  f1-score   support

     Cartman       0.64      0.55      0.60      5148
        Kyle       0.35      0.42      0.38      2774
        Stan       0.38      0.40      0.39      2890

   micro avg       0.48      0.48      0.48     10812
   macro avg       0.46      0.46      0.46     10812
weighted avg       0.50      0.48      0.49     10812

Simpsons 4-class classifier
               precision    recall  f1-score   support

 Bart Simpson       0.34      0.34      0.34      4872
Homer Simpson       0.59      0.50      0.54     11583
 Lisa Simpson       0.30      0.36      0.33      3968
Marge Simpson       0.37      0.44      0.40      4869

    micro avg       0.44      0.44      0.44     25292
    macro avg       0.40      0.41      0.40     25292
 weighted avg       0.45      0.44      0.44     25292

Both Series characters
               precision    recall  f1-score   support

 Bart Simpson       0.29      0.28     

In [12]:
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE

# Oversample the merged training set with SMOTE; the test set is left as is.
smote_sampler = SMOTE(random_state = 42)
X_train_mer_bal, y_train_mer_bal = smote_sampler.fit_resample(X_train_mer,y_train_mer)

# Baseline: multinomial Naive Bayes trained on the original class distribution.
mnb_mer = MultinomialNB().fit(X_train_mer, y_train_mer)
predicted_unbalanced = mnb_mer.predict(X_test_mer)
print("Unbalanced Dataset")
print(metrics.classification_report(y_test_mer, predicted_unbalanced))

# Same model trained on the SMOTE-resampled training data, evaluated on the
# same untouched test set.
mnb_bal = MultinomialNB().fit(X_train_mer_bal, y_train_mer_bal)
predicted_balanced = mnb_bal.predict(X_test_mer)
print("Balanced Dataset")
print(metrics.classification_report(y_test_mer, predicted_balanced))


Unbalanced Dataset
               precision    recall  f1-score   support

 Bart Simpson       0.41      0.17      0.24      4872
      Cartman       0.50      0.40      0.44      5148
Homer Simpson       0.40      0.86      0.55     11583
         Kyle       0.51      0.13      0.20      2774
 Lisa Simpson       0.40      0.16      0.23      3968
Marge Simpson       0.50      0.21      0.30      4869
         Stan       0.43      0.13      0.20      2890

    micro avg       0.42      0.42      0.42     36104
    macro avg       0.45      0.29      0.31     36104
 weighted avg       0.44      0.42      0.37     36104

Balanced Dataset
               precision    recall  f1-score   support

 Bart Simpson       0.28      0.27      0.28      4872
      Cartman       0.41      0.41      0.41      5148
Homer Simpson       0.51      0.40      0.45     11583
         Kyle       0.23      0.24      0.24      2774
 Lisa Simpson       0.30      0.29      0.29      3968
Marge Simpson       0.34 

In [64]:
# Predicted class probabilities for every test line; the columns are ordered
# like mnb_bal.classes_.
score = mnb_bal.predict_proba(X_test_mer)
print(score[300])
print(mnb_bal.predict(X_test_mer[300]))

# Share of each character among the (unbalanced) merged training labels,
# computed in one pass instead of seven hand-written counters.
class_shares = pd.Series(y_train_mer).value_counts(normalize=True)

# Keep the original per-character names; a character absent from the training
# labels gets a share of 0.0, as the counting loop would have produced.
bart_counter = class_shares.get('Bart Simpson', 0.0)
cartman_counter = class_shares.get('Cartman', 0.0)
homer_counter = class_shares.get('Homer Simpson', 0.0)
kyle_counter = class_shares.get('Kyle', 0.0)
lisa_counter = class_shares.get('Lisa Simpson', 0.0)
marge_counter = class_shares.get('Marge Simpson', 0.0)
stan_counter = class_shares.get('Stan', 0.0)

# Look the Cartman column up by label rather than hard-coding index 1, so the
# ratio stays correct if the set or order of classes ever changes.
cartman_column = list(mnb_bal.classes_).index('Cartman')

# For every test line actually spoken by Cartman, print the predicted Cartman
# probability divided by Cartman's share of the training labels.
is_cartman = np.asarray(y_test_mer) == "Cartman"
for val in score[is_cartman]:
    s = val[cartman_column]/cartman_counter
    print(s)

[0.07810389 0.01897682 0.09972423 0.06752739 0.18804391 0.51176869
 0.03585506]
['Marge Simpson']
6.199755124161013
6.395281263039314
3.617874722371299
1.2926806295447255
0.6446026270526524
6.177696811176893
1.3002094461423506
5.422277678890653
1.8088952926381572
4.575459523572675
5.130998513294693
1.448821278876849
0.9948819192123212
0.35049464464288044
0.8799598022170354
0.9663310485890972
1.203420451112502
0.8865296449717828
6.189387016195716
0.6759125421684657
0.8638191322247096
3.0248080547693106
3.4882120122931686
0.9663396743668149
0.9421532964833317
3.277962042096176
0.7144579714188919
1.5423891713598197
6.462825446125378
0.6318892620454182
4.007037477300294
1.2777088372661423
6.087581591212726
1.4676462701823774
0.888921450792904
0.7127362198583047
1.0965645463585691
5.100204258659138
4.4424144470427205
0.8175757149809556
0.2511959740106423
1.8580841647585413
1.2138442738838169
1.0018128844286178
6.528325827942723
1.230581665189563
0.32554149860261516
1.3978266789362317
1.6181

Observe that unbalanced data creates unbalanced predictions.
<br>Using the imbalanced-learn library to make use of oversampling

In [14]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

# NOTE: from here on `clf` is the LogisticRegression class itself, not a fitted
# estimator; a fresh instance is created for every pipeline.
clf = LogisticRegression


def _smote_logreg_pipeline():
    """Build a pipeline that SMOTE-oversamples the data, then fits one-vs-rest logistic regression."""
    return make_pipeline(SMOTE(random_state = 42), clf(solver='lbfgs', max_iter = 1000,random_state = 42,multi_class = 'ovr'))


def _print_evaluation(model, X_eval, y_eval):
    """Predict on ``X_eval``, print the confusion matrix and classification report, return the predictions."""
    y_hat = model.predict(X_eval)
    print(metrics.confusion_matrix(y_eval, y_hat))
    print(metrics.classification_report(y_eval, y_hat))
    return y_hat


# South Park characters
pipeline_sp = _smote_logreg_pipeline()
model_sp = pipeline_sp.fit(X_train_sp,y_train_sp)
predicted = _print_evaluation(model_sp, X_test_sp, y_test_sp)

# Simpsons characters
pipeline_sim = _smote_logreg_pipeline()
model_sim = pipeline_sim.fit(X_train_sim,y_train_sim)
predicted = _print_evaluation(model_sim, X_test_sim, y_test_sim)

# Characters of both series
pipeline_mer = _smote_logreg_pipeline()
model_mer = pipeline_mer.fit(X_train_mer,y_train_mer)
predicted = _print_evaluation(model_mer, X_test_mer, y_test_mer)

[[2878 1175 1095]
 [ 740 1152  882]
 [ 854  898 1138]]
              precision    recall  f1-score   support

     Cartman       0.64      0.56      0.60      5148
        Kyle       0.36      0.42      0.38      2774
        Stan       0.37      0.39      0.38      2890

   micro avg       0.48      0.48      0.48     10812
   macro avg       0.46      0.46      0.45     10812
weighted avg       0.50      0.48      0.48     10812

[[1612 1450 1023  787]
 [1961 5508 1924 2190]
 [ 733 1055 1442  738]
 [ 533 1469  824 2043]]
               precision    recall  f1-score   support

 Bart Simpson       0.33      0.33      0.33      4872
Homer Simpson       0.58      0.48      0.52     11583
 Lisa Simpson       0.28      0.36      0.31      3968
Marge Simpson       0.35      0.42      0.38      4869

    micro avg       0.42      0.42      0.42     25292
    macro avg       0.39      0.40      0.39     25292
 weighted avg       0.44      0.42      0.43     25292

[[1313  403 1159  268  742  