In [22]:
reset -f -s

In [123]:
whos

u'/Users/Alexander'

In [132]:
from __future__ import division
import numpy as np
import pandas as pd
import WordNet_Lexicon as wn
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wnet
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import load_files
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from nltk import RegexpTokenizer
from sklearn import cross_validation
from time import time
from sklearn.svm import SVC
%matplotlib inline

#FLOW
##Preprocessing Data
    - Clearning data
    - Spliting into Training and Test Sets
    - KFold
    
##FIT MODELS
    - Logistic Regression
    - Multinomial Navie Bayes
    - SVM
    
##Ensemble
    - Tune Hyperparameters
    - Resutls
    
##Word Sentiment
    - WordNet
    - Sentiment Lexicon
    
##Appendix
    -Functions
    -Calculations

#Preprocessing Data
    - Clearning data
    - Spliting into Training and Test Sets
    - KFold

In [83]:
# LOAD MOVIE REVIEWS
sentiment = load_files('/Users/Alexander/Downloads/review_polarity/txt_sentoken', random_state=41)

# FILTER OUT EMTPY REVIEWS 
X = []
y = []
for label,data in zip(sentiment.target, sentiment.data):
    if data:
        X.append(data)
        y.append(label)
X  = np.array(X)
y = np.array(y)

# COUNTVECTORIZER/ UNIGRAM DATA
vectorizer = CountVectorizer(stop_words = "english")
vectorizer.fit_transform(X)
X_count = vectorizer.transform(X)

## TF-IDF/ UNIGRAM DATA
n_features = 5000
tfidf = TfidfVectorizer(max_features=n_features,
                        stop_words='english')
X_tfidf = tfidf.fit_transform(X)

## SPLIT CountVect DATA INTO TRAIN AND TEST SETS 
split = 1400
## TRAIN DATA 0.80
Xu1c = X_count[:split]
Xu2c = X_count[split:]

## SPLIT TFIDF DATA INTO TRAIN AND TEST SETS 
## TRAIN DATA 0.80
Xu1tf = X_tfidf[:split]
Xu2tf= X_tfidf[split:]

## TEST DATA 0.20
yu1 = y[:split]
yu2 = y[split:]

#FIT MODELS
    - Logistic Regression
    - Multinomial Navie Bayes
    - SVM


#Hypothesis One:
##TFIDF Vectorizer DOES NOT lead to good model perfomance

$$ TFIDF = ( 1 + log(TF_{t,d}) )~~log( \frac{N}{DF_{t}} )$$

$TF_{t,d} = $ number of occurances of term (t) in a given doc (d)

$DF_{t} =$  number of documents (d) that term (t) appears in 

$N = $ total number of documents in corpus 


In [46]:
# CREATE DATAFRAME TO DISPLAYS METRIC RESULTS
# Acc_count, Acc_TF, F1_count,F1_TF
lr_test  = [0.838, 0.811,0.815,0.759]
mnb_test = [0.794, 0.760, 0.756,0.667]
svc_test = [0.818,0.55205811138,0.787535410765,"error"]
names = ["Logistgic Regression","Multinomial Navie Bayes", "SVC"]
data = [lr_test,mnb_test,svc_test]
table = pd.DataFrame(data, index = names)
table.columns = ["Acc_count","Acc_TF","F1_count","F1_TF"]

In [47]:
#TOKENIZE REVIEWS WITH REGEX IN ORDER TO PRESERVE CONTRACTIONS
tokenizer = RegexpTokenizer(r"[\w']+")
X_tokens = [tokenizer.tokenize(review) for review in X]

count1 = 0
for doc in X_tokens:
    if 'character' in doc:
        count1 +=1
print "'character' appears in {} movie reviews".format(count1)
count2 = 0
for doc in X_tokens:
    if 'dialogue' in doc:
        count2 +=1
print "'dialogue'  appears in {} movie reviews".format(count2)
count3 = 0
for doc in X_tokens:
    if 'annoying' in doc:
        count3 +=1
print "'annoying'  appears in {} movie reviews".format(count3)

'character' appears in 922 movie reviews
'dialogue'  appears in 369 movie reviews
'annoying'  appears in 148 movie reviews


##Conclusion One:

In movie reviews, the most common words tend to be the most descriptive of the reviewer's sentiment, words like 'character', 'annoying, 'dialogue'. Yet, TFIDF will down weight their importance because they tend to have a high document frequency. 

    1 CountVectorizers leads to better predictions than TF-IDF. 
    2 Use CountVectorizer for this data set.

In [48]:
# COMPARING RESULTS FROM COUNT VECTORIZATION AND TF-IDF VECTORIZATION
table

Unnamed: 0,Acc_count,Acc_TF,F1_count,F1_TF
Logistgic Regression,0.838,0.811,0.815,0.759
Multinomial Navie Bayes,0.794,0.76,0.756,0.667
SVC,0.818,0.552058,0.787535,error


#Hypothesis Two:
###Bigrams lead to better model performance because these common words need context to better signal sentiment (I.e. Great Dialouge, Flat Characters)

In [35]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), 
                                    token_pattern=r'\b\w+\b', 
                                    min_df=1)
X_2 = bigram_vectorizer.fit_transform(X).toarray()

In [40]:
## SPLIT CountVect DATA INTO TRAIN AND TEST SETS 
split = 1400
## TRAIN DATA 0.80
Xub1 = X_2[:split]
Xub2= X_2[split:]
## TEST DATA 0.20
yub1 = y[:split]
yub2 = y[split:]

logit = LogisticRegression()
acc_score, pre_score,rec_score = fit_predict_model2(logit,Xub1,yub1)
print logit.__class__.__name__ + "  Training Data"
print("Accuracy: %0.2f (+/- %0.2f) " % (acc_score[0], acc_score[1]))
print("Precison: %0.2f (+/- %0.2f) " % (pre_score[0], pre_score[1]))
print("Recall:   %0.2f (+/- %0.2f)" % (rec_score[0], rec_score[1]))

yhat = logit.predict(Xub2)
print logit.__class__.__name__ + "  Testing Data"
print "Accuracy:  ", accuracy_score(yub2,yhat)
print "F1-Score:  ", f1_score(yub2,yhat)

LogisticRegression  Training Data
Accuracy: 0.81 (+/- 0.01) 
Precison: 0.83 (+/- 0.03) 
Recall:   0.74 (+/- 0.03)
LogisticRegression  Testing Data
Accuracy:   0.794188861985
F1-Score:   0.752186588921


In [49]:
nb = MultinomialNB()
acc_score1, pre_score1,rec_score1 = fit_predict_model2(nb,Xub1,yub1)
print nb.__class__.__name__ + "  Training Data"
print("Accuracy: %0.2f (+/- %0.2f) " % (acc_score1[0], acc_score1[1]))
print("Precison: %0.2f (+/- %0.2f) " % (pre_score1[0], pre_score1[1]))
print("Recall:   %0.2f (+/- %0.2f)" % (rec_score1[0], rec_score1[1]))

yhat1 = nb.predict(Xub2)
print nb.__class__.__name__ + "  Testing Data"
print "Accuracy:  ", accuracy_score(yub2,yhat1)
print "F1-Score:  ", f1_score(yub2,yhat1)

MultinomialNB  Training Data
Accuracy: 0.80 (+/- 0.02) 
Precison: 0.85 (+/- 0.06) 
Recall:   0.69 (+/- 0.08)
MultinomialNB  Testing Data
Accuracy:   0.769975786925
F1-Score:   0.677966101695


#Conclusion Two:
    1 Bigrams DO NOT lead to better predictions, in fact the predictions are slightly worse. 
    2 Word Sentiment may not be captured in Bigrams 
    3 N-grams greater than N = 1 may be overfitting the model

#Consider...

##Models have instrinsic bias based on the underlying assumptions 

$$ Bias[\hat{f}(x)] = E[\hat{f}(x)]~-~f(x) $$

###Logistic Regression
    - Exponentiated beta's provide odds of a data point belongs to a certain class
$$odds = \frac{P(Y = 1)}{1 - P(Y = 1)} = e^{\beta_0 + \beta_1 X_1 + \cdots + \beta_p X_p}$$

###SVM
    - Data can be linearly seperated by a decision boundary
$$ |\beta_0 + \textbf{$\beta^{T}$}\textbf{x}| = 1 $$


###Multinomial Navie Bayes
    - Assumes that word occurances are independent of each other
$$P_{\theta}(C=c_{i}|x_{1}, x_{2}, ... , x_{n}) \propto P_{\theta}(x_{1} | c) P_{\theta}(c) \ldots P_{\theta}(x_{d} | c) P_{\theta}(c)$$
###Discriminative vs. Generative
    - Discriminative assume that classification can best be done by conditioning predictions prob on data: P(Y|X)
    - Generative assume that classification can best be done by joining predictions prob with data: P(Y,X)


#Ensemble

##Hypothesis Three:
    The misclassifiying effects of model bias can be mitigated by ensembling. 

##Two Types of Ensembling
###Type 1: Majority Vote (Hard Voting)
    - The classification with the most votes will be used for prediction. 
    
$$ \text{Clf}1 \rightarrow 1 \\
   \text{Clf}2 \rightarrow 1 \\
   \text{Clf}3 \rightarrow 2 \\
   \text{label 1 is choosen}$$    
    
    
###Type 2: Weighted Average (Soft Voting)
    - Each model has it's prediction multiplied by an optimized weight. 
    
$$ \text{P}_{i} =~ [w_1 w_2 \dots w_n]  \begin{bmatrix} p_{i, 1} \\ p_{i, 2}\\ \vdots \\ p_{i, n} \end{bmatrix}=~ w_1 \cdot p_{i, 1} + w_2 \cdot p_{i, 2} + \dots + w_n \cdot p_{i, n}~~\forall_{i}$$

In [60]:
# MODELS USED IN ENSEMBLE
cf1 = LogisticRegression()
cf2 = MultinomialNB()
cf3 = SVC(kernel = "linear", 
          C = 0.01, 
          gamma = 0.001, 
          probability=True, 
          cache_size=400)

#OPTIMIZED HYPERPARAMETERS FOR ENSEMBLE
w1,w2,w3 = (3,1,1)

In [77]:
# WEIGHTED AVERAGE (soft voting)
start = time()
np.random.seed(123)
eclf = EnsembleClassifier(clfs=[cf1, cf2, cf3], voting='soft', weights=[w1,w2,w3])

for clf, label in zip([cf1, cf2, cf3, eclf], ['Logistic Regression',  'Multinomial Naive Bayes','SVC', 'Ensemble']):
    acc_score, f1_ = fit_predict_model2(clf,Xu1.todense(), yu1)
    print"\n{}".format(label)
    print"Accuracy: {} ".format(acc_score)
    print"F1_score: {}".format(f1_)
    #     print("Precison: %0.2f (+/- %0.2f) " % (pre_score[0], pre_score[1]))
    #     print("Recall:   %0.2f (+/- %0.2f)" % (rec_score[0], rec_score[1]))
    end = time()
    print "Time Elapsed: {} minutes".format((end - start)/60)


Logistic Regression
Accuracy: 0.838571428571 
F1_score: 0.815678572086
Time Elapsed: 0.181627333164 minutes

Multinomial Naive Bayes
Accuracy: 0.809285714286 
F1_score: 0.778036877807
Time Elapsed: 0.302635165056 minutes

SVC
Accuracy: 0.832142857143 
F1_score: 0.807047994421
Time Elapsed: 52.9176993132 minutes

Ensemble
Accuracy: 0.849285714286 
F1_score: 0.827092021923
Time Elapsed: 107.991673565 minutes


In [114]:
# CREATE DATAFRAME TO DISPLAYS METRIC RESULTS

ensemble_acc = np.array([0.838,0.809,0.832,0.8492]).reshape(1,4)
ensemble_f1 = np.array([0.815, 0.778,0.807,0.827]).reshape(1,4)
names = ["Accuracy","f1_Score"]
data2 = np.array([ensemble_acc, ensemble_f1]).reshape(2,4)
table2 = pd.DataFrame(data2, index = names)
table2.columns = ["Logistgic Regression","Multinomial Navie Bayes", "SVC", "Ensemble"]
table2

Unnamed: 0,Logistgic Regression,Multinomial Navie Bayes,SVC,Ensemble
Accuracy,0.838,0.809,0.832,0.8492
f1_Score,0.815,0.778,0.807,0.827


##Conclusion Three:
###1 There is not a meaningful increase in scoring metrics through ensembling.
###2 It seems that no more signel can be extracted from the Unigram data set. 
###3 In order to extract more signel from the data, further feature engineering is necessary. 


##Word Sentiment
    - Create Word Seeds, pass into Synset seed generator
    - Pass seed Synsets into Synset propagator
    - Generate Positive and Negative Lexicon of word sentiments

#Hypothesis FOUR:
###The sentiment in individual words will provide a refined analysis of sentiment. Which will lead to better predictions. 

In [119]:
## CREATE SEED WORDS THAT WILL BE USED TO FIND SYNSETS AND BUILD A POSITIVE 
## AND NEGATIVE LEXICON
seed_pos = ["excellent","intense",\
            "good","outstanding",\
            "badass","positive",\
            "strong","reliable",\
           "happy","thoughtfull",\
           "cool","great",\
            "brilliant","exciting"\
           "creative","imaginative"]

seed_neg = ["negative","flat",\
            "boring","annoying",\
            "poor","weak",\
            "rehash","lazy",\
            "unimaginative","uncreative",\
            "shit","stupid"\
           "wrong","unimaginative",\
           "monotone","dumb"]

In [125]:
## DIFFERENT SENSES OF EACH WORD
senses = 4
pos_list, neg_list = create_lexicon(seed_pos,seed_neg, senses )

creating lexicon...
created synset lexicon in 8.14681601524 seconds


In [126]:
posWords, negWords = synsets_to_tokens(pos_list, neg_list )

transforming synsets to tokens...
transfored synsets into tokens in 32.479667902 seconds
Positive words: 6110,  Negative words: 4557


In [4]:
s = set()
s.

In [134]:
start2 = time()
review_sentiment = []
review_index = 0
stopWords = stopwords.words()
## TOKENIZE REVIEW AND REMOVE STOPWORDS
X_token = []
for x in X:
    review_temp = []
    review_token = RegexpTokenizer("[\w]+").tokenize(x)
    for token in review_token:
        if token not in stopWords:
            review_temp.append(token)
    X_token.append(review_temp)
    
## CHECK EACH WORD IN EACH REVIEW FOR SENTIMENT 
for review in X_token:
    word_sentiment = []
    words_in_review = len(review)
    for word in review:
        ## WORD FREQUENCY IN REVIEW
        word_i_count_in_review= Counter(review)[word]
        ## IF WORD IN NEG LEXICON, THEN MULTIPLY BY -1
        if word in negWords:
            word_sentiment.append(word_i_count_in_review/words_in_review * -1)
        elif word in posWords:
            word_sentiment.append(word_i_count_in_review/words_in_review)
    end2 = time()
    if review_index >0:
        if (review_index%100 == 0):
            end3 = time()
            print "\nreview: ",review_index
            print "time: {} minutes".format((end2 - start2)/60)
    review_sentiment.append(np.sum(word_sentiment))
    review_index += 1
    
end3 = time()
print "time: {} minutes".format((end3 - start2)/60)


review:  100
time: 2.70994708538 minutes

review:  200
time: 3.30587316751 minutes

review:  300
time: 3.9287823995 minutes

review:  400
time: 4.48367596865 minutes

review:  500
time: 5.02569961548 minutes

review:  600
time: 5.62838586569 minutes

review:  700
time: 6.22958499988 minutes

review:  800
time: 6.79790723324 minutes

review:  900
time: 7.40518773397 minutes

review:  1000
time: 7.98564985196 minutes

review:  1100
time: 8.55944881837 minutes

review:  1200
time: 9.22328660091 minutes

review:  1300
time: 9.77217181921 minutes

review:  1400
time: 10.3539442658 minutes

review:  1500
time: 10.962987868 minutes

review:  1600
time: 11.5504461527 minutes

review:  1700
time: 12.1481976191 minutes

review:  1800
time: 12.7073778351 minutes
time: 12.760322086 minutes


In [135]:
### RELATIVE FREQUENCIES OF UNIGRAMS
split = 1400
## TRAIN DATA 0.80
X1 = review_sentiment[:split]
X2= review_sentiment[split:]
## TEST DATA 0.20
y1 = y[:split]
y2 = y[split:]

# SPLIT TRAIN DATA INTO TRAIN AND DEVELOPMENT SETS
Xtrain,Xtest,ytrain,ytest = train_test_split(X1,y1,test_size = 0.20)
Xtrain = np.array(Xtrain).reshape(len(Xtrain),1)
Xtest = np.array(Xtest).reshape(len(Xtest),1)
# FIT AND PREDICT USING LOGISTIC REGRESSION 
lr = LogisticRegression()
lr.fit(np.array(Xtrain).reshape(1120,1),ytrain)
ypred = lr.predict(Xtest)

In [136]:
print "Accuracy:  ", accuracy_score(ytest,ypred)
print "Precision: ", precision_score(ytest,ypred)
print "Recall:    ", recall_score(ytest,ypred)
print "F1-Score:  ", f1_score(ytest,ypred)

Accuracy:   0.617857142857
Precision:  0.75
Recall:     0.104347826087
F1-Score:   0.18320610687


#Conclusion Four:
###The results are inconclusive. The accuracy is very close to random. Next time, account for words with neutral sentiment.

#Appendix

##Functions

###Classifer Evaluation

In [78]:
def fit_predict_model(model, X,y):
    acc = []
    pre = []
    rec = []

    kf = KFold(n = len(y), n_folds = 10, random_state = 41,shuffle = True)
    
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        acc.append(accuracy_score(y_test,y_pred))
        pre.append(precision_score(y_test,y_pred))
        rec.append(recall_score(y_test,y_pred))
    return model, np.mean(acc),  np.mean(pre), np.mean(rec)

In [79]:
def fit_predict_model2(model, X,y):
    #CREATE LIST TO STORE METRIC SCORES
    acc = []
    pre = []
    rec = []
    f1 = []
    # INITIATE KFOLD TO RANDOMIZE THE DATA INTO 10 TRAIN AND 10 TEST FOLDS
    # RANDOMIZING DATA HELPS PREVENT SCORES FROM BEING DEPENDENT ON A RANDOM SPLIT
    # KFOLD VALIDATION IS CHOOSEN OVER CROSS_VAL_SCORE (CVS)BECAUSE CSV
    #    ONLY ALLOWS FOR ONE SCORING METRIC TO BE USED PER INSTANCE
    # WHERE AS WITH KFOLD, SEVERAL SCORING METRICS CAN BE CALCULATED 
    #    IN ONE INPLEMENTATION OF KFOLD
    kf = KFold(n = len(y), n_folds = 10, random_state = 41,shuffle = True)
    
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train,y_train)
        y_pred = model.predict(X_test)
        
        acc.append(accuracy_score(y_test,y_pred))
        pre.append(precision_score(y_test,y_pred))
        rec.append(recall_score(y_test,y_pred))
        
    return np.mean(acc), 2 *(np.mean(pre)* np.mean(rec))/ (np.mean(pre)+ np.mean(rec))

###Ensembe Classifer Class

In [57]:
from sklearn.base import BaseEstimator
from sklearn.base import ClassifierMixin
from sklearn.base import TransformerMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.externals import six
from sklearn.base import clone
from sklearn.pipeline import _name_estimators
import numpy as np
import operator


class EnsembleClassifier(BaseEstimator, ClassifierMixin, TransformerMixin):
    """ Soft Voting/Majority Rule classifier for unfitted clfs.

    Parameters
    ----------
    clfs : array-like, shape = [n_classifiers]
      A list of classifiers.
      Invoking the `fit` method on the `VotingClassifier` will fit clones
      of those original classifiers that will be stored in the class attribute
      `self.clfs_`.

    voting : str, {'hard', 'soft'} (default='hard')
      If 'hard', uses predicted class labels for majority rule voting.
      Else if 'soft', predicts the class label based on the argmax of
      the sums of the predicted probalities, which is recommended for
      an ensemble of well-calibrated classifiers.

    weights : array-like, shape = [n_classifiers], optional (default=`None`)
      Sequence of weights (`float` or `int`) to weight the occurances of
      predicted class labels (`hard` voting) or class probabilities
      before averaging (`soft` voting). Uses uniform weights if `None`.

    Attributes
    ----------
    classes_ : array-like, shape = [n_predictions]

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn.linear_model import LogisticRegression
    >>> from sklearn.naive_bayes import GaussianNB
    >>> from sklearn.ensemble import RandomForestClassifier
    >>> clf1 = LogisticRegression(random_state=1)
    >>> clf2 = RandomForestClassifier(random_state=1)
    >>> clf3 = GaussianNB()
    >>> X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    >>> y = np.array([1, 1, 1, 2, 2, 2])
    >>> eclf1 = VotingClassifier(clfs=[clf1, clf2, clf3], voting='hard')
    >>> eclf1 = eclf1.fit(X, y)
    >>> print(eclf1.predict(X))
    [1 1 1 2 2 2]
    >>> eclf2 = VotingClassifier(clfs=[clf1, clf2, clf3], voting='soft')
    >>> eclf2 = eclf2.fit(X, y)
    >>> print(eclf2.predict(X))
    [1 1 1 2 2 2]
    >>> eclf3 = VotingClassifier(clfs=[clf1, clf2, clf3],
    ...                          voting='soft', weights=[2,1,1])
    >>> eclf3 = eclf3.fit(X, y)
    >>> print(eclf3.predict(X))
    [1 1 1 2 2 2]
    >>>
    """
    # INITIATES ENSEMBLE CLASSIFER CLASS
    def __init__(self, clfs, voting='hard', weights=None):
        # STORES LIST OF PASSED IN CLASSIFERS INTO CLASS ATTRIBUTE
        self.clfs = clfs
        self.named_clfs = {key:value for key,value in _name_estimators(clfs)}
        # STORES TYPE OF VOTING INTO CLASS ATTRIBUTE
        self.voting = voting
        # STORES USER PASSED IN CLASSIFER WEIGHTS INTO CLASS ATTRIBUTE
        self.weights = weights
        
    # FITS CLASSIFER
    def fit(self, X, y):
        """ Fit the clfs.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like, shape = [n_samples]
            Target values.

        Returns
        -------
        self : object
        """
        # ENSURES THAT LABELS ARE BINARY
        if isinstance(y, np.ndarray) and len(y.shape) > 1 and y.shape[1] > 1:
            raise NotImplementedError('Multilabel and multi-output'\
                                      ' classification is not supported.')
        # ENSURES THAT ENSEMBLE TYPE IS DEFINED
        if self.voting not in ('soft', 'hard'):
            raise ValueError("Voting must be 'soft' or 'hard'; got (voting=%r)"
                             % voting)
        # ENSURES THAT THE NUMBER OF CLASSIFIERS AND WEIGHTS ARE THE SAME
        if self.weights and len(self.weights) != len(self.clfs):
            raise ValueError('Number of classifiers and weights must be equal'
                             '; got %d weights, %d clfs'
                             % (len(self.weights), len(self.clfs)))
        # LabelEncoder IS USED TO TRANSFORM NON-NUMERICAL LABELS
        # TO NUMERICAL LABELS
        self.le_ = LabelEncoder()
        self.le_.fit(y)
        self.classes_ = self.le_.classes_
        self.clfs_ = []
        # CLASSIFIERS ARE CLONED SO THAT FITTING IS NOT DOEN INPLACE
        for clf in self.clfs:
            fitted_clf = clone(clf).fit(X, self.le_.transform(y))
            self.clfs_.append(fitted_clf)
        return self

    def predict(self, X):
        """ Predict class labels for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        maj : array-like, shape = [n_samples]
            Predicted class labels.
        """
        if self.voting == 'soft':

            maj = np.argmax(self.predict_proba(X), axis=1)

        else:  # 'hard' voting
            predictions = self._predict(X)

            maj = np.apply_along_axis(
                                      lambda x:
                                      np.argmax(np.bincount(x,
                                                weights=self.weights)),
                                      axis=1,
                                      arr=predictions)

        maj = self.le_.inverse_transform(maj)
        return maj

    def predict_proba(self, X):
        """ Predict class probabilities for X.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        ----------
        avg : array-like, shape = [n_samples, n_classes]
            Weighted average probability for each class per sample.
        """
        # CALCULATES THE WEIGHTED AVERAGE FOR EACH CLASS
        avg = np.average(self._predict_probas(X), axis=0, weights=self.weights)
        return avg

    def transform(self, X):
        """ Return class labels or probabilities for X for each estimator.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape = [n_samples, n_features]
            Training vectors, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        If `voting='soft'`:
          array-like = [n_classifiers, n_samples, n_classes]
            Class probabilties calculated by each classifier.
        If `voting='hard'`:
          array-like = [n_classifiers, n_samples]
            Class labels predicted by each classifier.
        """
        if self.voting == 'soft':
            return self._predict_probas(X)
        else:
            return self._predict(X)

    def get_params(self, deep=True):
        """ Return estimator parameter names for GridSearch support"""
        if not deep:
            return super(EnsembleClassifier, self).get_params(deep=False)
        else:
            out = self.named_clfs.copy()
            for name, step in six.iteritems(self.named_clfs):
                for key, value in six.iteritems(step.get_params(deep=True)):
                    out['%s__%s' % (name, key)] = value
            return out

    def _predict(self, X):
        """ Collect results from clf.predict calls. """
        return np.asarray([clf.predict(X) for clf in self.clfs_]).T

    def _predict_probas(self, X):
        """ Collect results from clf.predict calls. """
        return np.asarray([clf.predict_proba(X) for clf in self.clfs_])

###Weight Optimizer for the Ensemble Classifer

In [93]:
## BRUTE FORCE: COMPARE THE MEAN AVERAGE OF ALL POSSIBLE WEIGHTS (FROM 1 TO 4) FOR ALL 3 CLASSIFIERS 
np.random.seed(123)
start5 = time()
df = pd.DataFrame(columns=('w1', 'w2','mean', 'std'))

i = 0
for w1 in range(1,4):
    for w2 in range(1,4):
        for w3 in range(1,4):
            if len(set((w1,w2))) == 1: # skip if all weights are equal
                continue
            
            eclf = EnsembleClassifier(clfs=[cf1, cf2], weights=[w1,w2])
            scores = cross_validation.cross_val_score(
                                            estimator=eclf,
                                            X=Xu1c.todense(), 
                                            y=yu1, 
                                            cv=3, 
                                            scoring='accuracy',
                                            n_jobs=1)
            
            df.loc[i] = [w1, w2, scores.mean(), scores.std()]
            i += 1
end5 = time()
print "time elapsed: {}".format(end5 - start5)
df.sort(columns=['mean', 'std'], ascending=False)

###Create Positive and Negative Synset Lexicon

In [116]:
def create_lexicon(pos_seed,neg_seed, num_senses):
    '''
    Input:  positive word seeds (list)
            negative word seeds (list)
            num of word senses to capture in synsets (int)
    Output: positive word list
            negative word list
    '''
    print "creating lexicon..."
    start = time()
    # PASS IN SEED WORDS TO CREATE SEED SYNETS
    pos_words = wn.create_seed_synsets(pos_seed)
    neg_words = wn.create_seed_synsets(neg_seed)
    # PASS IN SEED SYNSETS TO SENSE PROPAGATER
    pos = wn.wordnet_sense_propagate(pos_words, num_senses)
    neg = wn.wordnet_sense_propagate(neg_words, num_senses)
    end = time()
    print "created synset lexicon in {} seconds".format(end - start)
    # RETURN POSITIVE AND NEGATIVE SYNSETS LEXICON 
    return pos,neg

###Create Positive and Negative Word Lexicon

In [117]:
def synsets_to_tokens(pos,neg):
    '''
    Input:  positive synsets (list)
            negative synsets (list)
    Ouput:  positive lexicon (list)
            negative lexicon (list)
    '''
    print "transforming synsets to tokens..."
    start1 = time()
    build_pos = []
    lex_pos  = []
    for synset_list in pos:
        for synsets in synset_list:
            for synset in synsets:
                if synset.name not in build_pos:
                    build_pos.append(synset.name)
                    lex_pos.append(synset.lemma_names())
                    
    build_neg = []
    lex_neg  = []
    for synset_list in neg:
        for synsets in synset_list:
            for synset in synsets:
                if synset.name not in build_neg:
                    build_neg.append(synset.name)
                    lex_neg.append(synset.lemma_names())
                    
    neg_words = []
    for synsets in lex_neg:
        for word in synsets:
            if word not in neg_words:
                neg_words.append(word)
                
    pos_words = []
    for synsets in lex_pos:
        for word in synsets:
            if word not in pos_words:
                pos_words.append(word)
                
    end1 = time()
    print "transfored synsets into tokens in {} seconds"\
    .format(end1 - start1)
    print "Positive words: {0},  Negative words: {1}"\
    .format(len(pos_words),len(neg_words))
    
    return pos_words, neg_words

###Create Lexicon of Positive and Negative Synsets

In [120]:
def create_lexicon(pos_seed,neg_seed, num_senses):
    '''
    Input:  positive word seeds (list)
            negative word seeds (list)
            num of word senses to capture in synsets (int)
    Output: positive word list
            negative word list
    '''
    print "creating lexicon..."
    start = time()
    # PASS IN SEED WORDS TO CREATE SEED SYNETS
    pos_words = wn.create_seed_synsets(pos_seed)
    neg_words = wn.create_seed_synsets(neg_seed)
    # PASS IN SEED SYNSETS TO SENSE PROPAGATER
    pos = wn.wordnet_sense_propagate(pos_words, num_senses)
    neg = wn.wordnet_sense_propagate(neg_words, num_senses)
    end = time()
    print "created synset lexicon in {} seconds".format(end - start)
    # RETURN POSITIVE AND NEGATIVE LEXICON 
    return pos,neg

##Calculations
###Applying CountVectorizer and TFIDF vectorizer to data and modeling

In [84]:
## LOGISTIC REGRESSION TRAINING ON COUNTVECTORIZER
clf1 = LogisticRegression()
clf1, ave_acc , ave_pre, ave_rec = fit_predict_model(clf1,Xu1c.todense(),yu1)
print clf1.__class__.__name__ + "  TRAINING SET"
print "Ave Accuracy:  ", ave_acc
print "Ave Precision: ", ave_pre
print "Ave Recall:    ", ave_rec

LogisticRegression  TRAINING SET
Ave Accuracy:   0.838571428571
Ave Precision:  0.833515383979
Ave Recall:     0.798589163817


In [85]:
## LOGISTIC REGRESSION TESTING ON COUNTVECTORIZER
ypred1 = clf1.predict(Xu2c.todense())
print clf1.__class__.__name__ + "  TESTING SET"
print "Accuracy:  ", accuracy_score(yu2,ypred1)
print "Precision: ", precision_score(yu2,ypred1)
print "Recall:    ", recall_score(yu2,ypred1)
print "F1-Score:  ", f1_score(yu2,ypred1)

LogisticRegression  TESTING SET
Accuracy:   0.837772397094
Precision:  0.831460674157
Recall:     0.8
F1-Score:   0.815426997245


In [86]:
## LOGISTIC REGRESSION TRAINING ON TF-IDF
clf2 = LogisticRegression()
clf2, ave_acc , ave_pre, ave_rec = fit_predict_model(clf2,Xu1tf.todense(),yu1)
print clf2.__class__.__name__ + "  TRAINING SET"
print "Ave Accuracy:  ", ave_acc
print "Ave Precision: ", ave_pre
print "Ave Recall:    ", ave_rec

LogisticRegression  TRAINING SET
Ave Accuracy:   0.818571428571
Ave Precision:  0.859259890135
Ave Recall:     0.708849388884


In [87]:
## LOGISTIC REGRESSION TESTING ON TF-IDF
ypred2 = clf2.predict(Xu2tf.todense())
print clf2.__class__.__name__ + "  TESTING SET"
print "Accuracy:  ", accuracy_score(yu2,ypred2)
print "Precision: ", precision_score(yu2,ypred2)
print "Recall:    ", recall_score(yu2,ypred2)
print "F1-Score:  ", f1_score(yu2,ypred2)

LogisticRegression  TESTING SET
Accuracy:   0.811138014528
Precision:  0.884892086331
Recall:     0.664864864865
F1-Score:   0.759259259259


In [88]:
## MULTINOMIAL NAVIE BAYES TRAINING ON COUNTVECTORIZER
clf3 = MultinomialNB()
clf3, ave_acc , ave_pre, ave_rec = fit_predict_model(clf3,Xu1c.todense(),yu1)
print clf3.__class__.__name__ + "  TRAINING SET"
print "Ave Accuracy:  ", ave_acc
print "Ave Precision: ", ave_pre
print "Ave Recall:    ", ave_rec

MultinomialNB  TRAINING SET
Ave Accuracy:   0.809285714286
Ave Precision:  0.815175460168
Ave Recall:     0.744134835573


In [89]:
## MULTINOMIAL NAVIE BAYES TESTING ON COUNTVECTORIZER
ypred3 = clf3.predict(Xu2c.todense())
print clf3.__class__.__name__ + "  TESTING SET"
print "Accuracy:  ", accuracy_score(yu2,ypred3)
print "Precision: ", precision_score(yu2,ypred3)
print "Recall:    ", recall_score(yu2,ypred3)
print "F1-Score:  ", f1_score(yu2,ypred3)

MultinomialNB  TESTING SET
Accuracy:   0.794188861985
Precision:  0.80487804878
Recall:     0.713513513514
F1-Score:   0.756446991404


In [90]:
## MULTINOMIAL NAVIE BAYES TRAINING ON TF-IDF
clf4 = MultinomialNB()
clf4, ave_acc , ave_pre, ave_rec = fit_predict_model(clf4,Xu1tf.todense(),yu1)
print clf4.__class__.__name__ + "  TRAINING SET"
print "Ave Accuracy:  ", ave_acc
print "Ave Precision: ", ave_pre
print "Ave Recall:    ", ave_rec

MultinomialNB  TRAINING SET
Ave Accuracy:   0.781428571429
Ave Precision:  0.87728525014
Ave Recall:     0.59730114787


In [91]:
## MULTINOMIAL NAVIE BAYES TESTING ON TF-IDF
ypred4 = clf4.predict(Xu2tf.todense())
print clf4.__class__.__name__ + "  TESTING SET"
print "Accuracy:  ", accuracy_score(yu2,ypred4)
print "Precision: ", precision_score(yu2,ypred4)
print "Recall:    ", recall_score(yu2,ypred4)
print "F1-Score:  ", f1_score(yu2,ypred4)

MultinomialNB  TESTING SET
Accuracy:   0.760290556901
Precision:  0.883928571429
Recall:     0.535135135135
F1-Score:   0.666666666667
