In [35]:
import os
import pickle
import sklearn
from sklearn import cross_validation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif, SelectKBest
from sklearn.decomposition import PCA
from sklearn.pipeline import FeatureUnion
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

#set data path
# Raw string literal: the backslashes in a Windows path must not be read as
# escape sequences ('\U' is a syntax error in a Python 3 string literal and
# only happens to survive in a Python 2 byte string).
# NOTE(review): this is a machine-specific absolute path; every relative file
# name used later in the notebook is resolved against it via os.chdir.
LOCAL_DATA_PATH = r'C:\Users\JoAnna\political_history\processed_data'
os.chdir(LOCAL_DATA_PATH)

In [20]:
#import data
# Load the pickled labels and the pickled pre-processed text from the current
# working directory. The `with` blocks make sure each file handle is closed
# once its pickle has been read (the bare open() calls never closed them).
# NOTE(review): mode "r" is kept as in the original because it must match how
# the files were written; if they were dumped in binary mode, switch to "rb".
# SECURITY: pickle.load can execute arbitrary code -- only load files you
# produced yourself or otherwise trust.
with open('bow_labels.pkl', "r") as pkl_file:
    labels = pickle.load(pkl_file)
with open('bow_processed_text.pkl', "r") as pkl_file:
    text = pickle.load(pkl_file)

In [27]:
# Randomized train/test split: test_size=0.2 holds out 20% of the samples for
# testing, and the fixed random_state makes the shuffle repeatable.
(text_train, text_test,
 labels_train, labels_test) = cross_validation.train_test_split(
    text,
    labels,
    test_size=0.2,
    random_state=42,
)

4688
1172
4688
let talk import aspect issu we chang cultur america those us proud pro life understand and got courag compass show young woman face terribl difficult decis


In [32]:
# TF-IDF features: the vectorizer is fitted on the training text only and the
# same fitted vectorizer is then applied to the test text.
vectorizer = TfidfVectorizer(sublinear_tf=True)
train_tfidf = vectorizer.fit_transform(text_train)
test_tfidf = vectorizer.transform(text_test)

# Expand the vectorizer output into dense numpy arrays for the later cells.
text_train_transformed = train_tfidf.toarray()
text_test_transformed = test_tfidf.toarray()

# Terms of the fitted vocabulary, as reported by the vectorizer.
feature_names = vectorizer.get_feature_names()

# Optional sanity checks on the vocabulary (left disabled):
#print len(vectorizer.get_feature_names())
#feature_names[5000:5020]

5516


[u'torah',
 u'torn',
 u'tort',
 u'tortur',
 u'total',
 u'touch',
 u'tough',
 u'toughen',
 u'tougher',
 u'toughest',
 u'tour',
 u'tout',
 u'toward',
 u'tower',
 u'town',
 u'toxic',
 u'tpp',
 u'track',
 u'trade',
 u'trader']

TODO: Visualize how the paragraphs cluster by party affiliation after the TF-IDF transform (e.g., project the TF-IDF vectors to 2D and plot them).  
Reference: http://stackoverflow.com/questions/28160335/plot-a-document-tfidf-2d-graph

In [36]:
#build classifier pipeline
# Feature stage: a FeatureUnion of univariate selection (SelectKBest) and PCA,
# with the PCA output scaled by a transformer weight of 10.
select = SelectKBest()
pca = PCA()
feature_selection = FeatureUnion([('select', select), ('pca', pca)],
                                 transformer_weights={'pca': 10})
clf = GaussianNB()

steps = [('feature_selection', feature_selection),
         ('naive_bayes', clf)]

# NOTE: this rebinds the name `pipeline`, which the import cell bound to the
# sklearn.pipeline module. The name is kept so that any later cell using
# `pipeline` still works; because of the rebinding, the Pipeline class has to
# be reached through the fully qualified `sklearn.pipeline` here.
pipeline = sklearn.pipeline.Pipeline(steps)

#search for best parameters
# Keys follow the <step>__<sub-step>__<parameter> naming used by the pipeline:
# number of selected features (k) and number of PCA components.
parameters = dict(feature_selection__select__k=[5, 50, 100, 500, 1000],
                  feature_selection__pca__n_components=[10, 50, 100])

# Use the directly imported GridSearchCV, consistent with the import cell.
cv = GridSearchCV(pipeline, param_grid=parameters)

# Grid search on the training features, then predict the held-out test set.
cv.fit(text_train_transformed, labels_train)
pred = cv.predict(text_test_transformed)

# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(cv.best_params_)

# Per-class precision / recall / f1 on the test set.
report = classification_report(labels_test, pred)
print(report)


   80   85   87   88   90   96   97   99  109  112  113  114  118  121  124
  125  126  131  135  136  146  161  171  189  190  201  203  212  218  229
  239  244  245  246  255  274  276  307  313  324  355  366  371  379  383
  389  394  398  410  411  414  416  431  436  438  446  447  448  452  464
  470  490  495  502  505  515  518  528  529  530  536  537  547  558  561
  564  573  575  578  583  587  589  594  595  596  597  607  612  616  650
  654  664  673  675  678  681  685  690  696  699  700  720  733  734  753
  758  766  768  777  779  786  791  794  795  796  806  809  813  815  817
  843  846  848  850  862  865  866  875  876  880  888  890  895  903  907
  916  917  927  931  932  933  937  959  966  975  983  984 1011 1019 1025
 1033 1044 1049 1051 1067 1083 1085 1088 1100 1106 1108 1111 1116 1128 1129
 1164 1174 1180 1184 1185 1207 1209 1212 1228 1230 1238 1260 1265 1303 1324
 1325 1334 1335 1339 1344 1345 1361 1364 1365 1370 1392 1398 1401 1404 1409
 1419 1429 1

{'feature_selection__select__k': 100, 'feature_selection__pca__n_components': 10}
             precision    recall  f1-score   support

          0       0.71      0.49      0.58       600
          1       0.60      0.79      0.68       572

avg / total       0.65      0.64      0.63      1172



In [37]:
# Accuracy of the grid-search predictions against the true test labels.
# Uses the directly imported accuracy_score, consistent with the import cell;
# print(...) with a single argument works on both Python 2 and Python 3.
accuracy = accuracy_score(labels_test, pred)
print(accuracy)

0.63566552901
