# Building model based on Stochastique Gradient Descent

In [1]:
import numpy as np
import pandas as pd
import os
from scipy.stats import randint

#feature extraction modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import cross_val_score, train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import TruncatedSVD
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

#confusion matrix
from sklearn.metrics import confusion_matrix

folder = os.getcwd() ; print folder

/home/arda/Documents/DSG


##### importing munged data 

In [2]:
#pd_train = pd.io.pickle.read_pickle(folder + '/data_munged/pd_train_tagged')
#pd_test = pd.io.pickle.read_pickle(folder + '/data_munged/pd_test_tagged')

pd_train = pd.io.pickle.read_pickle(folder + '/data_munged/pd_train_munged')
pd_test = pd.io.pickle.read_pickle(folder + '/data_munged/pd_test_munged')

In [3]:
pd_train['Sentence1'] = pd_train.apply(lambda x:str(x['Sentence'])+' '+str(int(x['nb_caps_word']))+' '+
                                  str(int(x['nb_!']))+' '+str(int(x['nb_,;'])),axis=1)

pd_train['Sentence2'] = pd_train.apply(lambda x:str(x['Sentence'])+' '+str(int(x['nb_caps_char']))+' '+
                                  str(int(x['nb_!']))+' '+str(int(x['nb_,;'])),axis=1)

pd_train['Sentence2'] = pd_train['tagged'].apply(lambda x: ' '.join([' '.join((word,tag)) for (word,tag) in x if len(tag)>1 ]) )

##### data preprocessing

In [4]:
list_pattern = ['\\w{1,}','(?u)\\b\\w+\\b']


stop = None
count = CountVectorizer(lowercase=False, stop_words=stop, token_pattern=list_pattern[1],
                        ngram_range=(1, 3), analyzer=u'word',max_df=1., min_df=2, 
                        max_features=None, vocabulary=None, binary=False)

tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

#SVD = TruncatedSVD(n_components=400, algorithm='randomized', n_iter=5, random_state=None, tol=0.0)

StdScl = StandardScaler(copy=True, with_mean=True, with_std=True)

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key].values

transormData = Pipeline([
        ('selector', ColumnSelector(key='Sentence_stemmed')),
        ('count', count),
        ('tfidf', tfidf),
        #('SVD', SVD),
        #('StdScl', StdScl)
    ])

Y = pd_train['Author']
X = transormData.fit_transform(pd_train,Y) ; print X.shape
X_test = transormData.transform(pd_test)

(28723, 107402)




##### models

In [18]:
alphas = np.arange(1e-6,  50e-6, 5e-6)
#alphas = np.arange(1e-6,  50e-6, 5e-6)
alphas = [1e-7, 5e-7, 1e-6, 5e-6, 1e-5, 5e-5]
#alphas = 10.**-np.arange(1,8)
#alphas = np.arange(4e-6, 7e-6, 1e-6)
#alphas = [1e-6]
print alphas

[1e-07, 5e-07, 1e-06, 5e-06, 1e-05, 5e-05]


In [5]:
SGD = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-2, l1_ratio=0.15,
                    fit_intercept=True, n_iter=50, shuffle=True, verbose=0, epsilon=0.1,
                    n_jobs=-1, random_state=None, learning_rate='optimal', eta0=0.0,
                    power_t=0.5, class_weight='auto', warm_start=False, average=False)

results=[]
for i in alphas:
    #sklearn cross validation
  
    SGD.alpha = i
    #NB= MultinomialNB(alpha=i, fit_prior=True, class_prior=None)
    #scores = cross_val_score(CalibratedClassifierCV(SGD,method='sigmoid', cv=3), X,Y,scoring='accuracy',cv=5, n_jobs=-1)
    scores = cross_val_score(SGD, X,Y,scoring='accuracy',cv=5, n_jobs=1)

    print ("Accuracy: %0.4f (+/- %0.4f), alpha: %s" % (scores.mean(), scores.std() * 2, i))

NameError: name 'alphas' is not defined

In [17]:
Accuracy: 0.7609 (+/- 0.0664), alpha: 1e-05
#Accuracy: 0.7662 (+/- 0.0684), alpha: 7e-06

In [37]:
SGD.alpha=1e-05
l = []
for i in range(5):
    scores=[]
    confusion_mat = np.zeros([6,6],dtype=int)
    for cv in range(5):
        x_train, x_val, y_train, y_val = train_test_split(X,Y, test_size=0.2, random_state=np.random.randint(1000))
        
        #clf =CalibratedClassifierCV(SGD,method='sigmoid', cv=3)
        clf =SGD
        
        clf.fit(x_train,y_train)

        
        scores.append(clf.score(x_val,y_val))
        confusion_mat+=confusion_matrix(clf.predict(x_val),y_val)
    l.append(np.mean(scores))
    print ("Accuracy: %0.4f (+/- %0.4f), alpha: %s" % (np.mean(scores), np.std(scores) * 2, i))
    
print clf.classes_
print confusion_mat
print ("Accuracy: %0.4f (+/- %0.4f)" % (np.mean(l), np.std(scores) * 2))

Accuracy: 0.7994 (+/- 0.0037), alpha: 0
Accuracy: 0.8000 (+/- 0.0033), alpha: 1
Accuracy: 0.7983 (+/- 0.0099), alpha: 2
Accuracy: 0.7981 (+/- 0.0079), alpha: 3
Accuracy: 0.8019 (+/- 0.0077), alpha: 4
['austen' 'doyle' 'poe' 'shakespeare' 'twain' 'wilde']
[[8591  439   28   63  276  554]
 [ 441 4085  115   78  333  446]
 [   6    6   44    1    6    0]
 [ 125   84    9 1833  152   60]
 [ 236  425   19   83 5053  331]
 [ 547  518   16   41  251 3430]]
Accuracy: 0.7996 (+/- 0.0077)


In [228]:
clf = SGDClassifier(loss='log', penalty='l2', alpha=7e-06, l1_ratio=0.15,
                    fit_intercept=True, n_iter=20, shuffle=True, verbose=0, epsilon=0.1,
                    n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0,
                    power_t=0.5, class_weight='auto', warm_start=True, average=False)
clf.fit(X,Y)

classes = clf.classes_ ; print classes



features = np.asarray(count.get_feature_names())
results = pd.DataFrame({
        'austen': clf.coef_[0,:],
        'doyle': clf.coef_[1,:],
        'poe': clf.coef_[2,:],
        'shakespeare': clf.coef_[3,:],
        'twain': clf.coef_[4,:],
        'wilde': clf.coef_[5,:],
        'feature': features
    })

['austen' 'doyle' 'poe' 'shakespeare' 'twain' 'wilde']


In [335]:
results['mean'] = results[['austen', 'doyle', 'poe', 'shakespeare', 'twain', 'wilde']].apply(lambda r: np.mean(r), axis=1)

# randomized GRID SEARCH

In [4]:
pd_train['Sentence1'] = pd_train.apply(lambda x:str(x['Sentence'])+' '+str(int(x['nb_caps_word']))+' '+
                                  str(int(x['nb_!']))+' '+str(int(x['nb_,;'])),axis=1)

pd_train['Sentence2'] = pd_train.apply(lambda x:str(x['Sentence'])+' '+str(int(x['nb_caps_char']))+' '+
                                  str(int(x['nb_!']))+' '+str(int(x['nb_,;'])),axis=1)

In [5]:
list_pattern = ['\\w{1,}','(?u)\\b\\w+\\b']
stop = None
count = CountVectorizer(lowercase=False, stop_words=stop, token_pattern=list_pattern[1],
                        ngram_range=(1, 2), analyzer=u'word',max_df=1.0, min_df=2, 
                        max_features=None, vocabulary=None, binary=False)
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = SGDClassifier(loss='hinge', penalty='l2', alpha=7e-06, l1_ratio=0.15,
                    fit_intercept=True, n_iter=20, shuffle=True, verbose=0, epsilon=0.1,
                    n_jobs=1, random_state=None, learning_rate='optimal', eta0=0.0,
                    power_t=0.5, class_weight='auto', warm_start=True, average=False)

class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key].values

pipeline = Pipeline([
    ('column', ColumnSelector(key='Sentence1')),
    ('vect', count),
    ('tfidf', tfidf),
    ('clf', clf),
])

# specify parameters and distributions to sample from
parameters = {'column__key': ('Sentence1','Sentence2'),
              'vect__max_df': (1.0, 0.2),
              'vect__min_df': (1, 2),
              'vect__ngram_range': ((1, 2),(1, 3), (1, 4)),  # unigrams or bigrams
              
              'tfidf__use_idf': (True,),
              'tfidf__smooth_idf': (False,),
              'tfidf__sublinear_tf': (False,),
              'tfidf__norm': ('l2',),
              
              "clf__loss": ('hinge', 'log'),
              "clf__alpha": (1e-4, 1e-5, 5e-5, 1e-6, 5e-6, 1e-7),
              "clf__class_weight": ('auto', None),
              "clf__penalty": ('l1','l2','elasticnet')}


# run randomized search
grid = GridSearchCV(pipeline, parameters, scoring='accuracy', 
                   fit_params=None, n_jobs=-1, iid=True,
                   refit=True,cv=2, verbose=1)


grid.fit(pd_train,pd_train['Author'])

pd.DataFrame(grid.grid_scores_, columns=['params','score','cv']).to_pickle(folder +"/data/SGDhingelog_grid")

Fitting 2 folds for each of 1728 candidates, totalling 3456 fits


[Parallel(n_jobs=-1)]: Done   1 jobs       | elapsed:    7.8s
[Parallel(n_jobs=-1)]: Done  50 jobs       | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 200 jobs       | elapsed: 10.0min
[Parallel(n_jobs=-1)]: Done 450 jobs       | elapsed: 22.7min
[Parallel(n_jobs=-1)]: Done 800 jobs       | elapsed: 40.5min
[Parallel(n_jobs=-1)]: Done 1250 jobs       | elapsed: 63.6min
[Parallel(n_jobs=-1)]: Done 1800 jobs       | elapsed: 92.5min
[Parallel(n_jobs=-1)]: Done 2450 jobs       | elapsed: 125.9min
[Parallel(n_jobs=-1)]: Done 3200 jobs       | elapsed: 164.7min
[Parallel(n_jobs=-1)]: Done 3450 out of 3456 | elapsed: 177.7min remaining:   18.5s
[Parallel(n_jobs=-1)]: Done 3456 out of 3456 | elapsed: 177.9min finished


In [75]:
results = pd.DataFrame(grid.grid_scores_, columns=['params','score','cv']).sort('score', ascending=False)

In [76]:
results.values[:20]

array([[ {'vect__ngram_range': (1, 2), 'tfidf__smooth_idf': False, 'tfidf__sublinear_tf': False, 'vect__max_df': 0.2, 'clf__loss': 'hinge', 'tfidf__use_idf': True, 'vect__min_df': 1, 'tfidf__norm': 'l2', 'clf__penalty': 'l2', 'clf__class_weight': 'auto', 'column__key': 'Sentence1', 'clf__alpha': 1e-05},
        0.6880548689203774, array([ 0.69456242,  0.68154596])],
       [ {'vect__ngram_range': (1, 2), 'tfidf__smooth_idf': False, 'tfidf__sublinear_tf': False, 'vect__max_df': 0.2, 'clf__loss': 'hinge', 'tfidf__use_idf': True, 'vect__min_df': 1, 'tfidf__norm': 'l2', 'clf__penalty': 'l2', 'clf__class_weight': None, 'column__key': 'Sentence1', 'clf__alpha': 5e-05},
        0.6875674546530655, array([ 0.69498016,  0.6801532 ])],
       [ {'vect__ngram_range': (1, 2), 'tfidf__smooth_idf': False, 'tfidf__sublinear_tf': False, 'vect__max_df': 0.2, 'clf__loss': 'hinge', 'tfidf__use_idf': True, 'vect__min_df': 1, 'tfidf__norm': 'l2', 'clf__penalty': 'l2', 'clf__class_weight': 'auto', 'column__

In [220]:
submit = pd.DataFrame(index=None)
submit['Id']=pd_test['Id']
submit['Pred']=y_pred

In [222]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)