# Machine Learning Modeling
For our purposes, we want to try a few different models to see which one works the best with our data. The first one we will try is Support Vector Machines. Before that, though, we'll need to split our data into training and testing.

In [25]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import joblib


data = pd.read_csv('data/moviedata_tokens.csv')

In [18]:
data.head()

Unnamed: 0,movie,character_name,line_num,line,unigram_tokens,bigram_unigram_tokens
0,American Psycho,Bateman,0,"we're sitting in pastels, this nouvelle northe...","['we', ""'re"", 'sitting', 'in', ',', 'this', 'p...","['we', ""'re"", 'sitting', 'in', ',', 'this', 'p..."
1,American Psycho,Bateman,1,you'll notice that my friends and i all look a...,"['you', ""'ll"", 'that', 'my', 'friends', 'and',...","['you', ""'ll"", 'that', 'my', 'friends', 'and',..."
2,American Psycho,Bateman,2,or can it be worn with a suit?,"['or', 'can', 'it', 'be', 'with', 'a', 'suit',...","['or', 'can', 'it', 'be', 'with', 'a', 'suit',..."
3,American Psycho,Bateman,3,with discreet pinstripes you should wear a sub...,"['with', 'you', 'should', 'wear', 'a', 'blue',...","['with', 'you', 'should', 'wear', 'a', 'blue',..."
4,American Psycho,Bateman,4,van patten looks puffy. has he stopped working...,"['van', 'patten', 'looks', 'puffy', '.', 'has'...","['van', 'patten', 'looks', 'puffy', '.', 'has'..."


## Unigrams Only

In [19]:
# Splitting data into training and testing
X = data['unigram_tokens']
y = data['character_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=222)

### Support Vector Machine

In [20]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,1))),
    ('clf', SVC())
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [1000, 2000, 5000],
    'tfidf__max_df': [0.05, 0.1, 0.2,],
    'clf__C': [1, 10],
    'clf__kernel': ['linear', 'poly', 'rbf']
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
uni_svm_grid = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
uni_svm_grid.fit(X_train, y_train)

print("Best parameters: ", uni_svm_grid.best_params_)
print("Best score: ", uni_svm_grid.best_score_)

Best parameters:  {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__max_df': 0.1, 'tfidf__max_features': 2000, 'tfidf__stop_words': 'english'}
Best score:  0.4603833425261996


In [28]:
joblib.dump(uni_svm_grid, 'models/uni_svm_grid.joblib')

['models/uni_svm_grid.joblib']

0.460

### Multi-Layer Perceptron (MLP)

In [21]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,1))),
    ('clf', MLPClassifier(max_iter=1000, activation = 'relu', solver='adam'))
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [3000, 5000, 10000],
    'tfidf__max_df': [0.05, 0.1, 0.2,],
    'clf__hidden_layer_sizes': [(10,), (20,), (10,10)]
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
uni_mlp_grid = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
uni_mlp_grid.fit(X_train, y_train)

print("Best parameters: ", uni_mlp_grid.best_params_)
print("Best score: ", uni_mlp_grid.best_score_)

Best parameters:  {'clf__hidden_layer_sizes': (10,), 'tfidf__max_df': 0.2, 'tfidf__max_features': 10000, 'tfidf__stop_words': 'english'}
Best score:  0.4746093031807318


In [27]:
joblib.dump(uni_mlp_grid, 'models/uni_mlp_grid.joblib')

['models/uni_mlp_grid.joblib']

0.469

## Unigrams and Bigrams

In [22]:
# Splitting data into training and testing
X = data['bigram_unigram_tokens']
y = data['character_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=222)

### Support Vector Machine
Here, we will create a pipeline for support vector machines and then tune it to find the best parameters. We will use **k-fold cross validation** to tune, and then find what the best parameters are and what the score is. We will try this for unigrams and unigrams + bigrams.

In [23]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('clf', SVC())
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [10000, 20000, 30000],
    'tfidf__max_df': [0.05, 0.1, 0.15],
    'clf__C': [1, 10, 100],
    'clf__kernel': ['linear']
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
bi_svm_grid = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
bi_svm_grid.fit(X_train, y_train)

print("Best parameters: ", bi_svm_grid.best_params_)
print("Best score: ", bi_svm_grid.best_score_)

Best parameters:  {'clf__C': 10, 'clf__kernel': 'linear', 'tfidf__max_df': 0.15, 'tfidf__max_features': 30000, 'tfidf__stop_words': 'english'}
Best score:  0.45971226328369186


In [29]:
joblib.dump(bi_svm_grid, 'models/bi_svm_grid.joblib')

['models/bi_svm_grid.joblib']

0.460

### Multi-Layer Perceptron (MLP)

In [24]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2))),
    ('clf', MLPClassifier(max_iter=1000, activation = 'relu', solver='adam'))
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [10000, 20000, 30000],
    'tfidf__max_df': [0.05, 0.1, 0.2,],
    'clf__hidden_layer_sizes': [(10,), (20,), (10,10)]
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
bi_mlp_grid = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
bi_mlp_grid.fit(X_train, y_train)

print("Best parameters: ", bi_mlp_grid.best_params_)
print("Best score: ", bi_mlp_grid.best_score_)

KeyboardInterrupt: 

0.434

## Sequence Vectors

### Support Vector Machine

In [None]:
# Splitting data into training and testing
X = data['line']
y = data['character_name']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, test_size=0.2, random_state=222)

In [None]:
pipeline3 = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', SVC())
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [10000, 15000, 20000],
    'tfidf__max_df': [0.1, 0.5, 0.9],
    'clf__C': [1, 10, 100],
    'clf__kernel': ['linear']
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
seq_svm_grid = GridSearchCV(pipeline3, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
seq_svm_grid.fit(X_train, y_train)

print("Best parameters: ", seq_svm_grid.best_params_)
print("Best score: ", seq_svm_grid.best_score_)

Best parameters:  {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__max_df': 0.1, 'tfidf__max_features': 20000, 'tfidf__stop_words': 'english'}
Best score:  0.4590319911748484


0.459

### Multi-Layer Perceptron (MLP)

In [None]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MLPClassifier(max_iter=1000, activation = 'relu', solver='adam'))
])

# Define parameter grid to search over
param_grid = {
    'tfidf__stop_words': ['english', None],
    'tfidf__max_features': [10000, 15000, 20000],
    'tfidf__max_df': [0.05, 0.1, 0.2,],
    'clf__hidden_layer_sizes': [(10,), (20,), (10,10)]
}

# Creating K-Fold cross-validation model
kf = KFold(n_splits=10, shuffle=True, random_state=222)

# Define GridSearchCV
seq_mlp_grid = GridSearchCV(pipeline, param_grid, cv=kf, n_jobs=-1)

# Fit the GridSearch CV object to the training data
seq_mlp_grid.fit(X_train, y_train)

print("Best parameters: ", seq_mlp_grid.best_params_)
print("Best score: ", seq_mlp_grid.best_score_)

Best parameters:  {'clf__hidden_layer_sizes': (20,), 'tfidf__max_df': 0.1, 'tfidf__max_features': 5000, 'tfidf__stop_words': 'english'}
Best score:  0.46444658944658956


0.464