In [49]:
import os
import pickle
import pandas as pd
import sklearn
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn import pipeline
from sklearn.grid_search import GridSearchCV
from prettytable import PrettyTable
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

#set data path
LOCAL_DATA_PATH = 'C:\Users\JoAnna\political_history\processed_data'
SAVE_PATH = 'C:\Users\JoAnna\political_history\shibboleth\pkl_objects'
os.chdir(LOCAL_DATA_PATH)

#### Read in data, perform train/test split

In [21]:
#import data
labels = pickle.load(open('bow_labels.pkl', "r"))
text = pickle.load(open('bow_processed_text.pkl', "r"))

In [22]:
#train/test split of data (randomized)
text_train, text_test, labels_train, labels_test = cross_validation.train_test_split(text, labels, test_size=0.2, random_state=42)

TO DO: see how paragraphs cluster with party affiliation after TF-IDF  
http://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph

EDA:

1) clustering by year, tf-idf, word2vec

2) seperating clustering

3) most frequent words - histograms

#### Test Classifier

In [23]:
#tfidf vectorizer and numpy array
vectorizer = TfidfVectorizer(sublinear_tf=True)
text_train_transformed = vectorizer.fit_transform(text_train).toarray()
text_test_transformed  = vectorizer.transform(text_test).toarray()

#test vectorizer
#print len(vectorizer.get_feature_names())
#feature_names = vectorizer.get_feature_names()
#feature_names[5000:5020]

In [24]:
#build classifier pipeline
select = SelectPercentile(f_classif)
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})
clfNB = GaussianNB()

steps1 = [('feature_selection', feature_selection),
        ('naive_bayes', clfNB)]

pipeline1 = sklearn.pipeline.Pipeline(steps1)

#search for best parameters
parameters1 = dict(feature_selection__select__percentile=[.05, .1, .25], 
              feature_selection__pca__n_components=[10, 50, 100])

cv = sklearn.grid_search.GridSearchCV(pipeline1, param_grid=parameters1)

#because tf-idf vectorizer isn't in this pipeline, fit/predict on transformed data
cv.fit(text_train_transformed, labels_train)
pred = cv.predict(text_test_transformed)

print cv.best_params_

#pipeline.fit(features_train, labels_train)
#pred = pipeline.predict(features_test)
report = sklearn.metrics.classification_report(labels_test, pred)
print report


   80   85   87   88   90   96   97   99  109  112  113  114  118  121  124
  125  126  131  135  136  146  161  171  189  190  201  203  212  218  229
  239  244  245  246  255  274  276  307  313  324  355  366  371  379  383
  389  394  398  410  411  414  416  431  436  438  446  447  448  452  464
  470  490  495  502  505  515  518  528  529  530  536  537  547  558  561
  564  573  575  578  583  587  589  594  595  596  597  607  612  616  650
  654  664  673  675  678  681  685  690  696  699  700  720  733  734  753
  758  766  768  777  779  786  791  794  795  796  806  809  813  815  817
  843  846  848  850  862  865  866  875  876  880  888  890  895  903  907
  916  917  927  931  932  933  937  959  966  975  983  984 1011 1019 1025
 1033 1044 1049 1051 1067 1083 1085 1088 1100 1106 1108 1111 1116 1128 1129
 1164 1174 1180 1184 1185 1207 1209 1212 1228 1230 1238 1260 1265 1303 1324
 1325 1334 1335 1339 1344 1345 1361 1364 1365 1370 1392 1398 1401 1404 1409
 1419 1429 1

{'feature_selection__select__percentile': 0.25, 'feature_selection__pca__n_components': 100}
             precision    recall  f1-score   support

          0       0.77      0.32      0.45       600
          1       0.56      0.90      0.69       572

avg / total       0.67      0.60      0.57      1172



In [25]:
accuracy = sklearn.metrics.accuracy_score(labels_test, pred)
print accuracy

0.60409556314


#### Building the steps of the classifier

In [26]:
#set up scoring function and table
scoring_table = PrettyTable(['pipeline_name', 'accuracy', 'precision', 'recall', 'auc'])

def scoring_function(pipeline_name, test_labels, prediction):
    """
    runs evaluation metrics on prediction from classifier
    Args:
        labels from the test data set, prediction from classifier     
    Returns:
        prints scoring functions, appends scores to scoring dataframe
    """
    accuracy = sklearn.metrics.accuracy_score(test_labels, prediction)
    precision = sklearn.metrics.precision_score(test_labels, prediction)
    recall = sklearn.metrics.recall_score(test_labels, prediction)
    auc = sklearn.metrics.roc_auc_score(test_labels, prediction)
    print "Validation Metrics for %s: accuracy: %s, precision: %s, recall: %s, auc: %s"%(pipeline_name, accuracy, precision, recall, auc)
    
    scoring_table.add_row([pipeline_name, accuracy, precision, recall, auc])
    return scoring_table

In [27]:
#test scoring function using test classifier above
scoring_function('test1', labels_test, pred)
print scoring_table

Validation Metrics for test1: accuracy: 0.60409556314, precision: 0.558441558442, recall: 0.902097902098, auc: 0.611048951049
+---------------+---------------+----------------+----------------+----------------+
| pipeline_name |    accuracy   |   precision    |     recall     |      auc       |
+---------------+---------------+----------------+----------------+----------------+
|     test1     | 0.60409556314 | 0.558441558442 | 0.902097902098 | 0.611048951049 |
+---------------+---------------+----------------+----------------+----------------+


In [28]:
# set-up generic grid-search cv function
def gridsearch_pipeline(pipeline_name, train_data, train_labels, test_data, pipeline_steps, parameters):
    """
    generic function to run gridsearchcv on an input dataset, pipeline, and parameters
    Args:
        data separated into features/labels and train/test
        steps of the pipeline
        parameters for gridsearchcv
    Returns:
        best parameters from gridsearch, prediction for test features
    """
    #pipeline
    pipe = sklearn.pipeline.Pipeline(pipeline_steps)
    
    #gridsearch
    cv = sklearn.grid_search.GridSearchCV(pipe, param_grid=parameters)
    cv.fit(train_data, train_labels)
    pred = cv.predict(test_data)
    print cv.best_params_
    return pred

In [29]:
#Put together pieces of classifier

#tf-idf vectorizer
vectorizer1 = TfidfVectorizer(sublinear_tf=True)
vectorizer2 = TfidfVectorizer(max_df = 1, min_df = 0, sublinear_tf=True)
vectorizer3 = TfidfVectorizer(ngram_range = (1,3), sublinear_tf=True)
vectorizer4 = TfidfVectorizer(max_df = 0.8, min_df = 0.2, ngram_range = (1,3), sublinear_tf=True)

#feature selection
select = SelectPercentile(f_classif)
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                    transformer_weights={'pca': 10})

#classifier
clfNB = GaussianNB()
clfAdaBoost = AdaBoostClassifier(random_state = 42)
clfLR = LogisticRegression(random_state=42, solver='sag')
clfSVM = SGDClassifier(loss='modified_huber', penalty='l2', n_iter=200, random_state=42)

#### Testing Classifiers - unigrams only

In [30]:
#test2 - GaussianNB, simple vectorizer, PCA
steps = [
         ('feature_pick', pca),
         ('classifier', clfNB)]

params = dict(feature_pick__n_components=[100, 200, 500])

prediction = gridsearch_pipeline('test2', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test2', labels_test, prediction)
print scoring_table

{'feature_pick__n_components': 500}
Validation Metrics for test2: accuracy: 0.621160409556, precision: 0.591690544413, recall: 0.722027972028, auc: 0.623513986014
+---------------+----------------+----------------+----------------+----------------+
| pipeline_name |    accuracy    |   precision    |     recall     |      auc       |
+---------------+----------------+----------------+----------------+----------------+
|     test1     | 0.60409556314  | 0.558441558442 | 0.902097902098 | 0.611048951049 |
|     test2     | 0.621160409556 | 0.591690544413 | 0.722027972028 | 0.623513986014 |
+---------------+----------------+----------------+----------------+----------------+


In [31]:
#test3 - GaussianNB, simple vectorizer, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfNB)]

params = dict(feature_pick__percentile=[7, 10, 15])

prediction = gridsearch_pipeline('test3', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test3', labels_test, prediction)

{'feature_pick__percentile': 10}
Validation Metrics for test3: accuracy: 0.695392491468, precision: 0.681895093063, recall: 0.704545454545, auc: 0.695606060606


<prettytable.PrettyTable at 0x1f7db9b0>

In [32]:
#test4 - GaussianNB, simple vectorizer, Feature Union
steps = [
         ('feature_selection', feature_selection),
         ('classifier', clfNB)]

params = dict(feature_selection__select__percentile=[5, 10, 15], 
              feature_selection__pca__n_components=[50, 100, 200])

prediction = gridsearch_pipeline('test4', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test4', labels_test, prediction)

{'feature_selection__select__percentile': 10, 'feature_selection__pca__n_components': 50}
Validation Metrics for test4: accuracy: 0.698805460751, precision: 0.678629690049, recall: 0.727272727273, auc: 0.69946969697


<prettytable.PrettyTable at 0x1f7db9b0>

In [33]:
#test5 - AdaBoost, simple vectorizer, PCA
steps = [
         ('feature_pick', pca),
         ('classifier', clfAdaBoost)]

params = dict(feature_pick__n_components=[100, 200, 500],
              classifier__n_estimators=[10, 20, 50],
              classifier__learning_rate=[0.1, 1, 1.5])

prediction = gridsearch_pipeline('test5', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test5', labels_test, prediction)

{'classifier__learning_rate': 1, 'classifier__n_estimators': 50, 'feature_pick__n_components': 200}
Validation Metrics for test5: accuracy: 0.602389078498, precision: 0.589225589226, recall: 0.611888111888, auc: 0.602610722611


<prettytable.PrettyTable at 0x1f7db9b0>

In [34]:
#tes6 - AdaBoost, simple vectorizer, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfAdaBoost)]

params = dict(feature_pick__percentile=[5, 10, 20],
              classifier__n_estimators=[10, 20, 50],
              classifier__learning_rate=[0.1, 1, 1.5])

prediction = gridsearch_pipeline('test6', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test6', labels_test, prediction)

{'classifier__learning_rate': 1, 'classifier__n_estimators': 50, 'feature_pick__percentile': 5}
Validation Metrics for test6: accuracy: 0.65614334471, precision: 0.606289308176, recall: 0.842657342657, auc: 0.660495337995


<prettytable.PrettyTable at 0x1f7db9b0>

In [35]:
#test7 - adaboost, simple vectorizer, Feature Union
steps = [
         ('feature_selection', feature_selection),
         ('classifier', clfAdaBoost)]

params = dict(feature_selection__select__percentile=[5, 10, 15], 
              feature_selection__pca__n_components=[50, 100, 200],
              classifier__n_estimators=[10, 20, 50],
              classifier__learning_rate=[0.1, 1, 1.5])

prediction = gridsearch_pipeline('test7', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test7', labels_test, prediction)

{'classifier__learning_rate': 1, 'classifier__n_estimators': 50, 'feature_selection__pca__n_components': 100, 'feature_selection__select__percentile': 10}
Validation Metrics for test7: accuracy: 0.662969283276, precision: 0.635114503817, recall: 0.727272727273, auc: 0.66446969697


<prettytable.PrettyTable at 0x1f7db9b0>

In [36]:
#test8 - svm, simple vectorizer, PCA
steps = [
         ('feature_pick', pca),
         ('classifier', clfSVM)]

params = dict(feature_pick__n_components=[100, 200, 500],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test8', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test8', labels_test, prediction)

{'classifier__alpha': 0.001, 'feature_pick__n_components': 500}
Validation Metrics for test8: accuracy: 0.692832764505, precision: 0.68661971831, recall: 0.681818181818, auc: 0.692575757576


<prettytable.PrettyTable at 0x1f7db9b0>

In [37]:
#test9 - svm, simple vectorizer, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfSVM)]

params = dict(feature_pick__percentile=[5, 10, 15],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test9', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test9', labels_test, prediction)

{'feature_pick__percentile': 15, 'classifier__alpha': 0.001}
Validation Metrics for test9: accuracy: 0.704778156997, precision: 0.692176870748, recall: 0.711538461538, auc: 0.704935897436


<prettytable.PrettyTable at 0x1f7db9b0>

In [38]:
#test10 - svm, simple vectorizer, Feature Union
steps = [
         ('feature_selection', feature_selection),
         ('classifier', clfSVM)]

params = dict(feature_selection__select__percentile=[5, 10, 15], 
              feature_selection__pca__n_components=[100, 200, 500],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test10', text_train_transformed, labels_train, text_test_transformed, steps, params)
scoring_function('test10', labels_test, prediction)

{'feature_selection__pca__n_components': 200, 'classifier__alpha': 0.001, 'feature_selection__select__percentile': 10}
Validation Metrics for test10: accuracy: 0.684300341297, precision: 0.672945205479, recall: 0.687062937063, auc: 0.684364801865


<prettytable.PrettyTable at 0x1f7db9b0>

GaussianNB with select percentile and SVM give the highest accuracy and auc scores. I'll optimize these with tfidf vectorizer

In [51]:
#new vectorizer
text_train_transformed2 = vectorizer2.fit_transform(text_train).toarray()
text_test_transformed2  = vectorizer2.transform(text_test).toarray()
print len(text_train_transformed2)

4688


In [52]:
#test11 - GaussianNB, vectorizer with frequency cutoffs, Feature Union
steps = [
         ('feature_pick', select),
         ('classifier', clfNB)]

params = dict(feature_pick__percentile=[7, 10, 15])

prediction = gridsearch_pipeline('test11', text_train_transformed2, labels_train, text_test_transformed2, steps, params)
scoring_function('test11', labels_test, prediction)

   39   41   42   44   46   47   50   51   52   53   54   56   59   60   61
   62   64   65   70   74   79   87   88   90   92   98  100  105  107  108
  112  116  118  119  120  123  134  135  136  137  138  139  141  146  147
  149  156  158  160  162  164  167  172  174  177  179  180  183  184  185
  189  190  191  196  200  201  204  206  207  208  209  215  216  218  228
  231  235  238  239  240  241  244  246  249  255  256  263  267  269  271
  272  273  275  276  277  278  279  285  287  289  291  292  299  300  302
  306  308  309  310  314  315  317  319  320  321  322  324  325  326  327
  329  337  338  342  343  346  349  352  356  359  363  369  371  376  378
  379  383  387  388  399  400  401  409  410  413  417  420  425  426  436
  440  441  446  447  449  451  452  458  459  462  467  468  470  471  477
  482  484  487  488  492  500  501  503  507  509  511  515  519  520  527
  528  534  539  541  544  547  549  553  556  557  558  560  561  563  564
  569  572  

{'feature_pick__percentile': 7}
Validation Metrics for test11: accuracy: 0.48976109215, precision: 0.488715277778, recall: 0.984265734266, auc: 0.5012995338


<prettytable.PrettyTable at 0x1f7db9b0>

In [55]:
#test12 - svm, vectorizer with frequency cutoffs, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfSVM)]

params = dict(feature_pick__percentile=[5, 10, 15],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test12', text_train_transformed2, labels_train, text_test_transformed2, steps, params)
scoring_function('test12', labels_test, prediction)

{'feature_pick__percentile': 10, 'classifier__alpha': 0.001}
Validation Metrics for test12: accuracy: 0.511945392491, precision: 0.0, recall: 0.0, auc: 0.5


<prettytable.PrettyTable at 0x1f7db9b0>

In [54]:
#test13 - svm, vectorizer with frequency cutoffs, Feature Union
steps = [
         ('feature_selection', feature_selection),
         ('classifier', clfSVM)]

params = dict(feature_selection__select__percentile=[5, 10, 15], 
              feature_selection__pca__n_components=[100, 200, 500],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test13', text_train_transformed2, labels_train, text_test_transformed2, steps, params)
scoring_function('test13', labels_test, prediction)

{'feature_selection__pca__n_components': 100, 'classifier__alpha': 0.001, 'feature_selection__select__percentile': 5}
Validation Metrics for test13: accuracy: 0.501706484642, precision: 0.494252873563, recall: 0.902097902098, auc: 0.511048951049


<prettytable.PrettyTable at 0x1f7db9b0>

Adding frequency cut-off decreased classifier performance. This may be because stop words were already removed from the data in pre-processing. Now try n-grams with stop words included.

#### Testing classifier n-grams

In [43]:
#import data
text_nostop = pickle.load(open("bow_processed_text_nostop.pkl", "r"))

#train/test split of data (randomized)
text_train_nostop, text_test_nostop, labels_train_nostop, labels_test_nostop = cross_validation.train_test_split(text_nostop, labels, test_size=0.2, random_state=42)

In [44]:
#vectorizer with uni-, bi-, and tri-grams
text_train_transformed_nostop = vectorizer3.fit_transform(text_train_nostop).toarray()
text_test_transformed_nostop  = vectorizer3.transform(text_test_nostop).toarray()
print len(text_train_transformed_nostop)

4688


In [45]:
#test14 - GaussianNB, vectorizer with ngrams, Feature Union
steps = [
         ('feature_pick', select),
         ('classifier', clfNB)]

params = dict(feature_pick__percentile=[7, 10, 15])

prediction = gridsearch_pipeline('test14', text_train_transformed_nostop, labels_train, text_test_transformed_nostop, steps, params)
scoring_function('test14', labels_test, prediction)



{'feature_pick__percentile': 7}
Validation Metrics for test15: accuracy: 0.700511945392, precision: 0.693520140105, recall: 0.692307692308, auc: 0.700320512821


<prettytable.PrettyTable at 0x1f7db9b0>

In [10]:
#test15 - svm, vectorizer with ngrams, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfSVM)]

params = dict(feature_pick__percentile=[12, 15, 17],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test15', text_train_transformed_nostop, labels_train_nostop, text_test_transformed_nostop, steps, params)
scoring_function('test15', labels_test_nostop, prediction)



{'feature_pick__percentile': 15, 'classifier__alpha': 0.0001}
Validation Metrics for test16: accuracy: 0.767918088737, precision: 0.776752767528, recall: 0.736013986014, auc: 0.767173659674


<prettytable.PrettyTable at 0xb14d4e0>

In [12]:
#test16 - svm, vectorizer with ngrams, Feature Union
steps = [
         ('feature_selection', feature_selection),
         ('classifier', clfSVM)]

params = dict(feature_selection__select__percentile=[10, 15], 
              feature_selection__pca__n_components=[100, 200, 500],
              classifier__alpha=[0.0001, 0.001])

prediction = gridsearch_pipeline('test16', text_train_transformed_nostop, labels_train_nostop, text_test_transformed_nostop, steps, params)
scoring_function('test16', labels_test_nostop, prediction)

{'feature_selection__pca__n_components': 100, 'classifier__alpha': 0.0001, 'feature_selection__select__percentile': 10}
Validation Metrics for test17: accuracy: 0.756825938567, precision: 0.784158415842, recall: 0.692307692308, auc: 0.755320512821


<prettytable.PrettyTable at 0xb0fd898>

In [46]:
#try reducing dimensionality of matrix
text_train_transformed_nostop2 = vectorizer4.fit_transform(text_train_nostop).toarray()
text_test_transformed_nostop2  = vectorizer4.transform(text_test_nostop).toarray()

In [47]:
#test17 - svm, vectorizer with ngrams, selectPercentile
steps = [
         ('feature_pick', select),
         ('classifier', clfSVM)]

params = dict(feature_pick__percentile=[12, 15, 17],
              classifier__alpha=[0.0001, 0.00001, 0.001])

prediction = gridsearch_pipeline('test17', text_train_transformed_nostop2, labels_train, text_test_transformed_nostop2, steps, params)
scoring_function('test17', labels_test, prediction)

{'feature_pick__percentile': 15, 'classifier__alpha': 0.001}
Validation Metrics for test18: accuracy: 0.548634812287, precision: 0.539305301645, recall: 0.515734265734, auc: 0.547867132867


<prettytable.PrettyTable at 0x1f7db9b0>

In [50]:
#export table
data = scoring_table.get_string()

with open('scoring_table.txt', 'wb') as f:
    f.write(data)

#### Export classifier 

In [60]:
os.chdir(SAVE_PATH)

from sklearn.pipeline import Pipeline

#export test 15
model = Pipeline([ 
    ('vectorize', TfidfVectorizer(ngram_range = (1,3), sublinear_tf=True)), 
    ('select', SelectPercentile(f_classif, percentile=15)), 
    ('classify', SGDClassifier(loss='modified_huber', penalty='l2', n_iter=200, random_state=42, alpha=0.0001)), 
])

# train the pipeline (note this calls fit_transform on all transformers and fit on the final estimator) 
model.fit(text_train_nostop, labels_train) 

# save the entire model 
with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [61]:
#load pickled model
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

In [63]:
#test model on test data
prediction = model.predict(text_test_nostop)

report = sklearn.metrics.classification_report(labels_test, prediction)
print report

             precision    recall  f1-score   support

          0       0.80      0.76      0.78       600
          1       0.76      0.80      0.78       572

avg / total       0.78      0.78      0.78      1172

