In [31]:
import os
import pickle
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import pipeline
from sklearn.grid_search import GridSearchCV
from prettytable import PrettyTable
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#set data path
LOCAL_DATA_PATH = 'C:\Users\JoAnna\political_history\processed_data'
os.chdir(LOCAL_DATA_PATH)

#### Read in data, perform train/test split

In [2]:
#import data
labels = pickle.load(open('bow_labels.pkl', "r"))
text = pickle.load(open('bow_processed_text.pkl', "r"))

In [3]:
#train/test split of data (randomized)
text_train, text_test, labels_train, labels_test = cross_validation.train_test_split(text, labels, test_size=0.2, random_state=42)

TO DO: see how paragraphs cluster with party affiliation after TF-IDF  
http://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph

Set-up evaluation metrics table

1) test classifiers w/bow using select percentile, PCA and Feature union

2) test classifiers adding n-gram range on tf-idf

3) test classifiers adding n-gram range and frequency cut-offs

4) word2vec instead of tf-idf using cbow, and above steps

5) Explore features and parameters of best classifier 

EDA:

1) clustering by year, tf-idf, word2vec

2) seperating clustering

3) most frequent words - histograms

#### Test Classifier

In [43]:
#tfidf vectorizer and numpy array
vectorizer = TfidfVectorizer(sublinear_tf=True)
text_train_transformed = vectorizer.fit_transform(text_train).toarray()
text_test_transformed  = vectorizer.transform(text_test).toarray()

#test vectorizer
#print len(vectorizer.get_feature_names())
feature_names = vectorizer.get_feature_names()
#feature_names[5000:5020]

In [44]:
#build classifier pipeline
select = SelectPercentile(f_classif)
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})
clfNB = GaussianNB()

steps1 = [('feature_selection', feature_selection),
        ('naive_bayes', clf)]

pipeline1 = sklearn.pipeline.Pipeline(steps1)

#search for best parameters
parameters1 = dict(feature_selection__select__percentile=[.05, .1, .25], 
              feature_selection__pca__n_components=[10, 50, 100])

cv = sklearn.grid_search.GridSearchCV(pipeline1, param_grid=parameters1)

#because tf-idf vectorizer isn't in this pipeline, fit/predict on transformed data
cv.fit(text_train_transformed, labels_train)
pred = cv.predict(text_test_transformed)

print cv.best_params_

#pipeline.fit(features_train, labels_train)
#pred = pipeline.predict(features_test)
report = sklearn.metrics.classification_report(labels_test, pred)
print report


{'feature_selection__select__percentile': 0.25, 'feature_selection__pca__n_components': 100}
             precision    recall  f1-score   support

          0       0.77      0.32      0.45       600
          1       0.56      0.90      0.69       572

avg / total       0.67      0.60      0.57      1172



In [45]:
accuracy = sklearn.metrics.accuracy_score(labels_test, pred)
print accuracy

0.60409556314


#### Building the steps of the classifier

In [32]:
#set up scoring function and table
scoring_table = PrettyTable(['pipeline_name', 'accuracy', 'precision', 'recall', 'auc'])

def scoring_function(pipeline_name, test_labels, prediction):
    """
    runs evaluation metrics on prediction from classifier
    Args:
        labels from the test data set, prediction from classifier     
    Returns:
        prints scoring functions, appends scores to scoring dataframe
    """
    accuracy = sklearn.metrics.accuracy_score(labels, prediction)
    precision = sklearn.metrics.precision_score(labels, prediction)
    recall = sklearn.metrics.recall_score(labels, prediction)
    auc = sklearn.metrics.roc_auc_score(labels, prediction)
    print "Validation Metrics for %s: accuracy: %s, precision: %s, recall: %s, auc: %s"%(pipeline_name, accuracy, precision, recall, auc)
    
    scoring_table.add_row([pipeline_name, accuracy, precision, recall, auc])
    return scoring_table

In [46]:
#test scoring function using test classifier above
scoring_function('test1', labels_test, pred)
print scoring_table

Validation Metrics for test1: accuracy: 0.60409556314, precision: 0.558441558442, recall: 0.902097902098, auc: 0.611048951049
+---------------+---------------+----------------+----------------+----------------+
| pipeline_name |    accuracy   |   precision    |     recall     |      auc       |
+---------------+---------------+----------------+----------------+----------------+
|     test1     | 0.60409556314 | 0.558441558442 | 0.902097902098 | 0.611048951049 |
|     test1     | 0.60409556314 | 0.558441558442 | 0.902097902098 | 0.611048951049 |
|     test1     | 0.60409556314 | 0.558441558442 | 0.902097902098 | 0.611048951049 |
+---------------+---------------+----------------+----------------+----------------+


In [35]:
# set-up generic grid-search cv function
def gridsearch_pipeline(pipeline_name, train_data, train_labels, test_data, test_labels, pipeline_steps, parameters):
    """
    generic function to run gridsearchcv on an input dataset, pipeline, and parameters
    Args:
        data separated into features/labels and train/test
        steps of the pipeline
        parameters for gridsearchcv
    Returns:
        best parameters from gridsearch, validation metrics (accuracy, precision, recall, auc)
    """
    #pipeline
    pipe = sklearn.pipeline.Pipeline(pipeline_steps)
    
    #gridsearch
    cv = sklearn.grid_search.GridSearchCV(pipe, param_grid=parameters)
    cv.fit(train_data, train_labels)
    pred = cv.predict(test_data)
    print cv.best_params_
    
    # validation metrics
    scoring_function(pipeline_name, test_labels, pred)


In [42]:
#Put together pieces of classifier

#tf-idf vectorizer
vectorizer1 = TfidfVectorizer(sublinear_tf=True)
vectorizer2 = TfidfVectorizer(max_df = 0.8, min_df = 0.2, sublinear_tf=True)
vectorizer3 = TfidfVectorizer(ngram_range = (1,3), sublinear_tf=True)
vectorizer4 = TfidfVectorizer(max_df = 0.8, min_df = 0.2, ngram_range = (1,3), sublinear_tf=True)

#feature selection
select = SelectPercentile(f_classif)
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})

#classifier
clfNB = GaussianNB()
clfAdaBoost = AdaBoostClassifier(random_state = 42)
clfLR = LogisticRegression(random_state=42, solver='sag')

#### Testing Classifiers

In [50]:
#test2
steps = [('vectorizer', vectorizer1),
         ('feature_pick', pca),
         ('classifier', clfNB)]

params = dict(feature_pick__select__n_components=[10, 50, 100],
              classifier__priors=[None])

gridsearch_pipeline('test2', text_train, labels_train, text_test, labels_test, steps, params)

ValueError: Invalid parameter priors for estimator GaussianNB. Check the list of available parameters with `estimator.get_params().keys()`.

NameError: name 'estimator' is not defined