# Create Feature Set Using a Variety of NLP techniques

1. Attempted Random Forest & gradient boost classification models on entire congress dataset to predict the speakers political party.
    1. Results - accuracy was not great. The range of topics and individual speakers may be too dificult a challenge
    2. Note - Pipelines for this section are actually on the bottom. Since classification worked better with just 1 debate topic I kept those pipelines near the top

2. Classification models on top debate (debate num.132 which has >350 rep and 350 dem speech segments)
    1. Results - accuracy greatly improved from general models
    2. Successfully extracted a features set using word2vec Note: could not figure out normalize function 
    3. Didn't see a huge difference in accuracy between NLP feature sets
    4. Question: is there an efficient way to check how accurate a word2vec model is besides checking how it scores the similarity between words I consider similar?
    
3. Attempting Clustering on 14 speakers with the most speech segments - haven't done this yet so here are the steps I'm trying to complete before Tuesday session:
    1. Set up a pipeline that will Build a TFIDF feature set on 80% of sample speeches
    2. Try out a method of clustering on sample speeches - see if there are any obvious patterns
    3. Next step would be based on if obvious pattern pops up

In [29]:
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import time
import os
%matplotlib inline

from sklearn.svm import SVC
from sklearn import ensemble
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict 
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
nlp = spacy.load('en')

from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from gensim import utils, models
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec


In [5]:
from gensim.test.utils import datapath
from gensim.models import Word2Vec
from gensim.models import word2vec

In [6]:
# create a list of all the file names in the development set
def pywalker(path):
    file_list = []
    for root, dirs, files in os.walk(path):
        for file_ in files:
            file_list.append( os.path.join(root, file_) )
    return file_list
 
if __name__ == '__main__':
    pywalker('/path/to/some/folder')

def import_files(file_list, path):
    debate_list = []
    speaker_list = []
    page_list = []
    page_position_list = []
    party_list = []
    mention_list = []
    vote_list = []
    file_text = []

    for file in file_list:
        # Read the speech segment and add to 
        myfile = open(file)
        file_text.append(myfile.read())
        myfile.close()
        
        file = file.replace((str(path)+'/'),'').replace('.txt','')
        debate_list.append(file[0:3])
        speaker_list.append(file[4:10])
        page_list.append(file[11:15])
        page_position_list.append(file[15:18])
        party_list.append(file[-3])
        mention_list.append(file[-2])
        vote_list.append(file[-1])


    columns = ['file_name', 'debate', 'speaker', 'page', 'page_position', 
               'party', 'mention', 'vote', 'speech']

    data = {'file_name':file_list, 'debate':debate_list, 
            'speaker':speaker_list, 'page':page_list, 
            'page_position':page_position_list,
            'party':party_list, 'mention':mention_list, 
            'vote':vote_list, 'speech':file_text}

    text_details = pd.DataFrame(data=data, columns = columns)
    
    return text_details


def random_sort(table):
    table['random'] = np.random.rand(len(table.index),1)
    table = table.sort_values(by='random')
    table = table.reset_index(drop=True).drop(columns='random', axis=1)
    return table

# This function just clears out know features that are not needed
def text_cleaner(text):
    text = re.sub(r'xz[0-9]{7}','', text)
    text = ' '.join(text.split())
    return text

I have a few things I want try
1. See if I am able to create a better classifier if I work with just 1 debate
    * bow
    * tfidf
    * word2vec
2. Cluster on speaker (5 dems & 5 republicans with the most speech segments)

In [43]:
# Set baseline for the accuracy score
dev_text['party'].value_counts()/len(dev_text['party'])

R    0.523538
D    0.476462
Name: party, dtype: float64

In [7]:
dev_path = 'senate/data_stage_one/development_set'
train_path = 'senate/data_stage_one/training_set'
test_path = 'senate/data_stage_one/test_set'
dev_text = import_files(pywalker(dev_path), dev_path)
train_text = import_files(pywalker(train_path), train_path)
test_text = import_files(pywalker(test_path), test_path)

# I want to be able to easily see what original data source they are in (I have a feeling won't be that imp)
dev_text['source'] = 'dev'
train_text['source'] = 'train'
test_text['source'] = 'test'

# Put all speech segments in one doc
full_text = pd.concat([dev_text, train_text, test_text])

### Create 1 dataset of the debate with most speeches & 1 dataset with the top 14 speakers

In [44]:
# dev_text['speech_clean'] = dev_text['speech'].apply(lambda x: text_cleaner(x))

In [8]:
# Who are the top 5 dem and top 5 rep speakers
top_14 = full_text.groupby(by=['speaker','party']).count().sort_values(by='source', ascending=False).iloc[:15,:]
top_speaker = top_14.reset_index()['speaker']

# Builds a dataset of just the speeches give by my top 14 speakers
top_speaker = full_text[full_text['speaker'].isin(top_speaker)].reset_index()

In [9]:
speech_count = full_text.groupby(by=['debate', 'source', 'party']).count()['file_name']
speech_count.sort_values(ascending=False).head(5)

debate  source  party
132     train   R        383
                D        369
031     train   R        346
088     train   D        340
031     train   D        332
Name: file_name, dtype: int64

In [10]:
# Filter to the top debate
top_debate = full_text[full_text['debate']=='132'].reset_index(drop=True)

# Clean top_debate for BOW & tfidf
top_debate['speech_clean'] = top_debate['speech'].apply(lambda x: text_cleaner(x))
top_debate = random_sort(top_debate)

____ 
### TOP DEBATE
Lets create a pipeline to test out classification on top_debate

In [22]:
# Bag of Words using tfidf vectorizer
X = top_debate['speech_clean']
y = top_debate['party']


# Training the BOW vectorizor
bow_vectorizer = TfidfVectorizer(
    max_features=2000,      # if an integer vectorizer returns with most used words  
    use_idf=False,          # Use IDF
    norm=u'l2',             # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,          
                                 )
pipeline = Pipeline([
    ('vect', bow_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

parameters = {
    'svd__n_components': (50,70,),       #50
    'clf__criterion': ('entropy',),
    'clf__max_depth': (4,7,9, None),           #9
    'clf__n_estimators': (70, 150,), #70
    'vect__max_df': (0.5,),             # Drop words that appear in > x % of the paragraphs
    'vect__max_features': (2000,),     # return top X used words
}

grid_search = GridSearchCV(pipeline, parameters, refit=True)
grid_search.fit(X, y)

print('Best score: %.3f'%grid_search.best_score_)
print('Cross Val: \n{}\n'.format(cross_val_score(grid_search.best_estimator_, X, y, cv=3)))

print('Best parameters:')
grid_search.best_params_    

Best score: 0.641
Cross Val: 
[0.60159363 0.61752988 0.628     ]

Best parameters: 



{'clf__criterion': 'entropy',
 'clf__max_depth': 7,
 'clf__n_estimators': 70,
 'svd__n_components': 70,
 'vect__max_df': 0.5,
 'vect__max_features': 2000}

### See if gradient boost does a little more for me without adding overfitting

In [14]:
# Bag of Words using tfidf vectorizer
X = top_debate['speech_clean']
y = top_debate['party']

# Training the BOW vectorizor
bow_vectorizer = TfidfVectorizer(
    max_features=2000,      # if an integer vectorizer returns with most used words  
    use_idf=False,          # Use IDF
    norm=u'l2',             # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,          
                                 )
pipeline = Pipeline([
    ('vect', bow_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', GradientBoostingClassifier())
])

parameters = {
    'clf__learning_rate': (0.2, 0.5, 0.8,),
    'clf__max_depth': (4,7,None,),           #9
    'clf__n_estimators': (50, 70, 100,), #70
    'clf__subsample': (0.3, 0.5, 0.7),
    'vect__max_df': (0.5,),             # Drop words that appear in > x % of the paragraphs
    'vect__max_features': (2000,2500),     # return top X used words
    'svd__n_components': (50,70,),       #50
}

grid_search = GridSearchCV(pipeline, parameters, refit=True)
grid_search.fit(X, y)

print('Best score: %.3f'%grid_search.best_score_)
print('Cross Val: \n{}\n'.format(cross_val_score(grid_search.best_estimator_, X, y, cv=3)))
print('Best parameters: \n')
grid_search.best_params_    

Best score: 0.641
Cross Val: 
[0.62151394 0.6374502  0.62      ]

Best parameters: 



{'clf__learning_rate': 0.2,
 'clf__max_depth': 7,
 'clf__n_estimators': 70,
 'clf__subsample': 0.5,
 'svd__n_components': 50,
 'vect__max_df': 0.5,
 'vect__max_features': 2000}

Comparing Random forest and grandient boost they seem to have very similar effectiveness with accuracy.

____ 
### TOP DEBATES
### TFIDF

In [25]:
#TFIDF Optimizing th TFIDF features
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,            # drop words that occur in more than half the paragraphs
    min_df=2,              # Use words that appear >= 2x     
    use_idf=True,          # Use IDF
    norm=u'l2',            # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,        
    )

pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

parameters = {
    'clf__criterion': ('entropy',),
    'clf__max_depth': (4,7,),
    'clf__n_estimators': (70,130,),   
    'svd__n_components': (50,70,),    
    'tfidf__max_df': (0.5,0.8),
    'tfidf__min_df': (2,5,9,),   
}

grid_search_tfidf = GridSearchCV(pipeline, parameters, refit=True)
grid_search_tfidf.fit(X,y)

print('Best score: %.3f'%grid_search_tfidf.best_score_)
print('Cross Val: \n{}\n'.format(cross_val_score(grid_search_tfidf.best_estimator_, X, y, cv=3)))
print('Best parameters:')
grid_search_tfidf.best_params_    

Best score: 0.660
Cross Val: 
[0.63346614 0.62948207 0.696     ]

Best parameters:


{'clf__criterion': 'entropy',
 'clf__max_depth': 4,
 'clf__n_estimators': 70,
 'svd__n_components': 50,
 'tfidf__max_df': 0.8,
 'tfidf__min_df': 9}

____
### TOP DEBATES
### Word2Vec

In [31]:
from sklearn.preprocessing import normalize
X = top_debate['speech_clean']
y = top_debate['party']
top_debate['speech_processed'] = top_debate['speech_clean'].apply(lambda x: utils.simple_preprocess(x))

In [37]:
# Create document which is a list of all the speeches
processed_text = list(top_debate['speech_processed'])

In [139]:
# build vocabulary and train model
model = models.Word2Vec(
        processed_text,
        workers=4,     # Number of threads to run in parallel
        min_count=10,  # Minimum word count threshold.
        window=12,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=300,      # Word vector length.
        hs=1           # Use hierarchical softmax.
)
model.train(processed_text, total_examples=len(processed_text), epochs=10)

def text_vector(doc):    
    output_vec = np.zeros(model.vector_size)
    for word in doc:
        if word in model.wv.vocab:
            output_vec += model[word]
##### tried to use normalize but its doesn't like the input output_vec += normalize(model[word], axis=0)
    return output_vec
            
top_debate['doc_vector'] = top_debate['speech_processed'].apply(lambda x: text_vector(x))



In [153]:
# See if model is acting logically
# Similarity is calculated using the cosine, so again 1 is total
# similarity and 0 is no similarity.

model = models.Word2Vec(
        processed_text,
        workers=4,     # Number of threads to run in parallel
        min_count=10,  # Minimum word count threshold.
        window=6,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=300,      # Word vector length.
        hs=1           # Use hierarchical softmax.
)
model.train(processed_text, total_examples=len(processed_text), epochs=10)
          
print(model.wv.similarity('environment', 'air'))
print(model.wv.similarity('law', 'bill'))

0.403232
0.22371931


  if np.issubdtype(vec.dtype, np.int):


Testing out a bunch of combinations for the word2vec model builder. I didn't see a huge optimization through fiddling.
the same parameter set really varied by each run

|Description|Best Score|CV Var|
|:---|:---:|:---:|
|wordvec len 300|0.649|0.021|
|wordvec len 200|0.642|0.043|
|wordvec len 100|0.641|0.033|
|min count 50|0.630|0.037|
|min count 10|0.649|0.039|
|min count 5 |0.633|0.022|
|min count 7 |0.640|0.015|
|window 6    |0.638|0.036|
|window 12   |0.630|0.015|
|window 30   |0.637|0.018|
|window 15   |0.622|0.038|

In [139]:
# This frame used to test out different parameters in the word2vec model
# Run this cell prior to rerunning pipelines below
model = models.Word2Vec(
        processed_text,
        workers=4,     # Number of threads to run in parallel
        min_count=10,  # Minimum word count threshold.
        window=12,      # Number of words around target word to consider.
        sg=0,          # Use CBOW because our corpus is small.
        sample=1e-3 ,  # Penalize frequent words.
        size=300,      # Word vector length.
        hs=1           # Use hierarchical softmax.
)
model.train(processed_text, total_examples=len(processed_text), epochs=10)

def text_vector(doc):    
    output_vec = np.zeros(model.vector_size)
    for word in doc:
        if word in model.wv.vocab:
            output_vec += model[word]
##### tried to use normalize but its doesn't like the input output_vec += normalize(model[word], axis=0)
    return output_vec
            
top_debate['doc_vector'] = top_debate['speech_processed'].apply(lambda x: text_vector(x))



In [142]:
y = top_debate['party']
X = pd.DataFrame(np.array(list(top_debate['doc_vector'])))
pipeline = Pipeline([
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

parameters = {
    'clf__criterion': ('entropy','gini'),
    'clf__max_depth': (4,5,9,None),
    'clf__n_estimators': (50,70,150),
    'svd__n_components': (5,10,30,50, 70),    
    #'tfidf__max_df': (0.5,0.8),
    #'tfidf__min_df': (2,5,9,),   
}

grid_search_word2 = GridSearchCV(pipeline, parameters, refit=True)
grid_search_word2.fit(X,y)

print('Best score: %.3f'%grid_search_word2.best_score_)
print('Cross Val: %.3f'%(np.std(cross_val_score(grid_search_word2.best_estimator_, X, y, cv=3))))
print('Best parameters:')
grid_search_word2.best_params_   



Best score: 0.656
Cross Val: 0.034
Best parameters:


{'clf__criterion': 'entropy',
 'clf__max_depth': 9,
 'clf__n_estimators': 150,
 'svd__n_components': 30}

In [144]:
# Would liket to see Gradient Boost too
y = top_debate['party']
X = pd.DataFrame(np.array(list(top_debate['doc_vector'])))

pipeline = Pipeline([
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', GradientBoostingClassifier())
])

parameters = {
    'clf__learning_rate': (0.2, 0.5,),
    'clf__max_depth': (4,7,9, None,),           
    'clf__n_estimators': (50, 70, 100,), 
    'clf__subsample': (0.3, 0.5, 0.7),
    'svd__n_components': (30, 50,),       
}
grid_search_word2 = GridSearchCV(pipeline, parameters, refit=True)
grid_search_word2.fit(X,y)

print('Best score: %.3f'%grid_search_word2.best_score_)
print('Cross Val: %.3f'%(np.std(cross_val_score(grid_search_word2.best_estimator_, X, y, cv=3))))
print('Best parameters:')
grid_search_word2.best_params_   

Best score: 0.654
Cross Val: 0.012
Best parameters:


{'clf__learning_rate': 0.2,
 'clf__max_depth': 4,
 'clf__n_estimators': 100,
 'clf__subsample': 0.5,
 'svd__n_components': 30}

____
### ALL DEBATES
### BOW Pipeline Using all Debates

In [56]:
# Bag of Words using tfidf vectorizer
# Training the BOW vectorizor
# It looks like it wants to use very few features that appear in most of the documents - this seems like a good way to overfit
bow_vectorizer = TfidfVectorizer(
    max_features=2000,      # if an integer vectorizer returns with most used words  
    use_idf=False,          # Use IDF
    norm=u'l2',             # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,          
                                 )
pipeline = Pipeline([
    ('vect', bow_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

parameters = {
    'vect__max_features': (1000,),     # return top X used words
    'vect__max_df': (0.5,),           # Drop words that appear in > x % of the paragraphs
    'svd__n_components': (50,70,),   #50
    'clf__max_depth': (4,7,9),           #9
    'clf__n_estimators': (50, 70, 100,), #70
    'clf__criterion': ('entropy',),
}

grid_search = GridSearchCV(pipeline, parameters, refit=True)
grid_search.fit(dev_text['speech_clean'],dev_text['party'])

print('Best score: %.3f'%grid_search.best_score_)
print('Best parameters: \n{}\n'.format(grid_search.best_params_))

Best score: 0.579
Best parameters: {'clf__criterion': 'entropy', 'clf__max_depth': 9, 'clf__n_estimators': 70, 'svd__n_components': 50, 'vect__max_df': 0.5, 'vect__max_features': 1000}


The BOW model has improved from some of my previous iterations and is leveraging a larger feature set. Using the max_df threshold is making it less of a traditional bag of most common words. Random Forest is prone to overfitting so lets see how much variance there is between the folds of the winning model

In [57]:
# When I look at the St. Dev of the winning model its clear this is model is overfit
cross_val_score(grid_search.best_estimator_, dev_text['speech'],dev_text['party'] )

array([0.54893617, 0.50643777, 0.6223176 ])

Grandient Boost has a built in sampler to help with overfitting. Lets try this approach. To reduce fitting time I'm going to stick with 1000 features and 0.5 as the cutoff for super common words.

In [60]:
# BOW vectorizer with Gradient Boost
bow_vectorizer = TfidfVectorizer(
    max_features=2000,      # if an integer vectorizer returns with most used words  
    use_idf=False,          # Use IDF
    norm=u'l2',             # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,          
                                 )
pipeline = Pipeline([
    ('vect', bow_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', GradientBoostingClassifier())
])

parameters = {
    'vect__max_features': (1000,),     
    'vect__max_df': (0.5,),           # Drop words that appear in > x % of the paragraphs
    'svd__n_components': (50,100),
    'clf__max_depth': (4,7,9),
    'clf__n_estimators': (10, 50, 100),
    'clf__learning_rate': (0.8, 0.2),
    'clf__subsample': (0.5, 0.3,)
}

grid_search_boost = GridSearchCV(pipeline, parameters, refit=True)
grid_search_boost.fit(dev_text['speech_clean'],dev_text['party'])

print('Best score: %.3f'%grid_search.best_score_)
print('Best parameters:')
grid_search_boost.best_params_

Best score: 0.579
Best parameters:


{'clf__learning_rate': 0.2,
 'clf__max_depth': 9,
 'clf__n_estimators': 100,
 'clf__subsample': 0.3,
 'svd__n_components': 50,
 'vect__max_df': 0.5,
 'vect__max_features': 1000}

In [83]:
# When I look at the St. Dev of the winning model its clear this is model is overfit
cross_val_score(grid_search_boost.best_estimator_, dev_text['speech'],dev_text['party'] )

array([0.5106383 , 0.527897  , 0.56652361])

Gradient Boost Seems a tad less overfit the random forest

# TFIDF - repeat the same process to create a tfidf vectorizor

**In the model below I'm really uncomfortable taht it is giing sam number of components for svd as for number of estimators**

In [81]:
#TFIDF Optimizing th TFIDF features
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,            # drop words that occur in more than half the paragraphs
    min_df=2,              # Use words that appear >= 2x     
    use_idf=True,          # Use IDF
    norm=u'l2',            # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,        
    )

pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False)),
    ('clf', RandomForestClassifier(n_jobs=-1))
])

parameters = {
    'tfidf__max_df': (0.5,),
    'tfidf__min_df': (2,),
    'svd__n_components': (50,100, 150),
    'clf__max_depth': (4,7,10, None),
    'clf__n_estimators': (10,40,100),
    'clf__criterion': ('entropy',),
}

grid_search_tfidf = GridSearchCV(pipeline, parameters, refit=True)
grid_search_tfidf.fit(dev_text['speech_clean'],dev_text['party'])

print( 'Best score: %.3f'%grid_search_tfidf.best_score_)
print('Best parameters:')
grid_search_tfidf.best_params_

Best score: 0.562
Best parameters:


{'clf__criterion': 'entropy',
 'clf__max_depth': 4,
 'clf__n_estimators': 100,
 'svd__n_components': 50,
 'tfidf__max_df': 0.5,
 'tfidf__min_df': 2}

In [82]:
# Variance here also not great
# Removing the Normalizer step made the accuracy much worse
# Maybe I should give myself space to fiddle with TFIDF parameters with constant Tree and then p
cross_val_score(grid_search_tfidf.best_estimator_, dev_text['speech'],dev_text['party'] )

array([0.52765957, 0.53218884, 0.58369099])

In [87]:
#TFIDF 
tfidf_vectorizer = TfidfVectorizer(
    max_df=0.5,            # drop words that occur in more than half the paragraphs
    min_df=2,              # Use words that appear >= 2x     
    use_idf=True,          # Use IDF
    norm=u'l2',            # Correction factor to treat long and short paragraphs equally
    smooth_idf=True,        # Prevents divide-by-zero errors by adding one to all features
    stop_words='english',  
    lowercase=True,        
    )

pipeline = Pipeline([
    ('tfidf', tfidf_vectorizer),
    ('svd', TruncatedSVD()),
    ('norm', Normalizer(copy=False,)),
    ('clf', GradientBoostingClassifier())
])

parameters = {
    'tfidf__max_df': (0.5,),
    'tfidf__min_df': (2,),
    'tfidf__norm': ('l2',),
    'svd__n_components': (70,100,200),
    'clf__max_depth': (4,7,9),
    'clf__n_estimators': (10, 50, 100),
    'clf__learning_rate': (0.8, 0.2),
    'clf__subsample': (0.5,)
}

grid_search_tfidf_boost = GridSearchCV(pipeline, parameters, refit=True)
grid_search_tfidf_boost.fit(dev_text['speech_clean'],dev_text['party'])

print( 'Best score: %.3f'%grid_search_tfidf_boost.best_score_)
print('Best parameters:/n')
grid_search_tfidf_boost.best_params_

Best score: 0.586
Best parameters:/n


{'clf__learning_rate': 0.2,
 'clf__max_depth': 4,
 'clf__n_estimators': 50,
 'clf__subsample': 0.5,
 'svd__n_components': 100,
 'tfidf__max_df': 0.5,
 'tfidf__min_df': 2,
 'tfidf__norm': 'l2'}

In [88]:
# When I look at the St. Dev of the winning model its clear this is model is overfit
cross_val_score(grid_search_tfidf_boost.best_estimator_, dev_text['speech'],dev_text['party'] )

array([0.52765957, 0.527897  , 0.58369099])