# Building model based on SVM

In [1]:
import numpy as np
import pandas as pd
import os

#feature extraction modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import cross_val_score, train_test_split,StratifiedKFold, StratifiedShuffleSplit
from sklearn.base import BaseEstimator
from sklearn.pipeline import Pipeline, make_pipeline, make_union
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.grid_search import GridSearchCV
from lib_DSG import ColumnSelector, DenseTransformer


#confusion matrix
from sklearn.metrics import confusion_matrix

folder = os.getcwd() ; print folder

/home/arda/Documents/DSG


##### importing data

In [2]:
#pd_train = pd.io.pickle.read_pickle(folder + '/data_munged/pd_train_tagged')
#pd_test = pd.io.pickle.read_pickle(folder + '/data_munged/pd_test_tagged')

pd_train = pd.io.pickle.read_pickle(folder + '/data_munged/pd_train_munged')
pd_test = pd.io.pickle.read_pickle(folder + '/data_munged/pd_test_munged')

# First model

In [None]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }
tfv = TfidfVectorizer(lowercase=False, stop_words=None, token_pattern=dico_pattern["match_word_punct"], 
                      ngram_range=(1, 2), max_df=1.0, min_df=2, max_features=None, 
                      vocabulary=None, binary=True, norm=u'l2', 
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                   class_weight=None, random_state=None, solver='liblinear', max_iter=100,
                   multi_class='ovr', verbose=0)

pipeline = make_pipeline(ColumnSelector(key='Sentence'), tfv)

In [None]:
Y = pd_train['Author'].values
X = pipeline.fit_transform(pd_train) ; print X.shape
X_test = pipeline.transform(pd_test)

In [6]:
alphas = np.arange(0.1, 1.4, 0.2)
print alphas

[ 0.1  0.3  0.5  0.7  0.9  1.1  1.3]


In [7]:
results=[]
for i in alphas:
    clf.C = i
    
    skf = StratifiedKFold(Y, n_folds=10, indices=None, shuffle=True, random_state=None)
    scores_skf = cross_val_score(clf, X, Y,scoring='accuracy',cv=skf, n_jobs=-1)

    sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
    scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

    print ("SLF: acc: %0.4f, std: %0.4f, SSS: acc: %0.4f, std: %0.4f, alpha: %s" %
           (scores_skf.mean(), scores_skf.std(), scores_sss.mean(), scores_sss.std(), i))

SLF: acc: 0.5972, std: 0.0061, SSS: acc: 0.5803, std: 0.0042, alpha: 0.1
SLF: acc: 0.7121, std: 0.0058, SSS: acc: 0.7035, std: 0.0052, alpha: 0.3
SLF: acc: 0.7421, std: 0.0035, SSS: acc: 0.7367, std: 0.0060, alpha: 0.5
SLF: acc: 0.7583, std: 0.0082, SSS: acc: 0.7533, std: 0.0065, alpha: 0.7
SLF: acc: 0.7677, std: 0.0090, SSS: acc: 0.7639, std: 0.0048, alpha: 0.9
SLF: acc: 0.7746, std: 0.0072, SSS: acc: 0.7706, std: 0.0051, alpha: 1.1
SLF: acc: 0.7786, std: 0.0072, SSS: acc: 0.7763, std: 0.0049, alpha: 1.3


In [67]:
clf.C = 0.7

for i in range(5):
    scores=[]
    confusion_mat = np.zeros([6,6],dtype=int)
    for cv in range(5):
        x_train, x_val, y_train, y_val = train_test_split(X,Y, test_size=0.2)

        clf.fit(x_train,y_train)

        scores.append(clf.score(x_val,y_val))
        confusion_mat+=confusion_matrix(clf.predict(x_val),y_val)
    print ("Accuracy: %0.4f (+/- %0.4f)" % (np.mean(scores), np.std(scores) * 2))
        
print clf.classes_
print confusion_mat

Accuracy: 0.8070 (+/- 0.0070)
Accuracy: 0.8062 (+/- 0.0099)
Accuracy: 0.8111 (+/- 0.0102)
Accuracy: 0.8074 (+/- 0.0107)
Accuracy: 0.8051 (+/- 0.0103)
['austen' 'doyle' 'poe' 'shakespeare' 'twain' 'wilde']
[[8558  367   31   57  215  559]
 [ 459 4121   80   62  375  508]
 [  13   31   89    5    9   13]
 [  85   60   14 1937  136   71]
 [ 220  412   17   87 5024  312]
 [ 611  497   12   44  236 3398]]


# Advanced model

In [8]:
dico_pattern={'match_lowercase_only':'\\b[a-z]+\\b',
              'match_word':'\\w{1,}',
              'match_word1': '(?u)\\b\\w+\\b',
              'match_word_punct': '\w+|[,.?!;]',
              'match_NNP': '\\b[A-Z][a-z]+\\b|\\b[A-Z]+\\b',
              'match_punct': "[,.?!;'-]"
             }

tfv_uni = TfidfVectorizer(lowercase=False, analyzer=u'word', stop_words=None, token_pattern=dico_pattern["match_lowercase_only"],
                ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, vocabulary=None,
                binary=True, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfv_bi = TfidfVectorizer(lowercase=False, analyzer=u'word', stop_words=None, token_pattern=dico_pattern["match_word1"],
                ngram_range=(2, 2), max_df=1.0, min_df=2, max_features=None, vocabulary=None,
                binary=True, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfv_nnp = TfidfVectorizer(lowercase=False, analyzer=u'word', stop_words=None, token_pattern=dico_pattern["match_NNP"],
                ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, vocabulary=None,
                binary=True, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

tfv_punctuation = TfidfVectorizer(lowercase=False, analyzer=u'word', stop_words=None, token_pattern=dico_pattern["match_punct"],
                ngram_range=(1, 1), max_df=1.0, min_df=2, max_features=None, vocabulary=None,
                binary=True, norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)

clf = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
                   class_weight=None, random_state=None, solver='liblinear', max_iter=100,
                   multi_class='ovr', verbose=0)

uni = make_pipeline(ColumnSelector(key='Sentence'), tfv_uni)
bi = make_pipeline(ColumnSelector(key='Sentence'), tfv_bi)
nnp = make_pipeline(ColumnSelector(key='Sentence'), tfv_nnp)
punctuation = make_pipeline(ColumnSelector(key='Sentence'), tfv_punctuation)


pipeline = make_union(uni, bi, nnp, punctuation)

In [9]:
Y = pd_train['Author'].values
X = pipeline.fit_transform(pd_train)
X_test = pipeline.transform(pd_test)

print X.shape, len(tfv_uni.vocabulary_), len(tfv_bi.vocabulary_), len(tfv_nnp.vocabulary_), len(tfv_punctuation.vocabulary_)

(28723, 66630) 11438 53360 1825 7


In [12]:
alphas = np.arange(0.1, 3, 0.5);
alphas = [1., 2.]
weights=[1]
print alphas

[1.0, 2.0]


In [13]:
for weight in weights:
    print weight
    
    pipeline.transformer_weights=[1.2, 1, 1., 1]
    
    tfv_uni.binary=True
    tfv_bi.binary=True
    tfv_nnp.binary=True
    tfv_punctuation.binary=True
    
    X = pipeline.fit_transform(pd_train)
    results=[]
    for i in alphas:
        clf.C = i
        #clf.class_weight='auto'
        
        skf = StratifiedKFold(Y, n_folds=10, indices=None, shuffle=True, random_state=None)
        scores_skf = cross_val_score(clf, X, Y,scoring='accuracy',cv=skf, n_jobs=-1)
        
        sss = StratifiedShuffleSplit(Y, 10, test_size=0.2, random_state=0)
        scores_sss = cross_val_score(clf, X, Y,scoring='accuracy',cv=sss, n_jobs=-1)

        print ("SLF: acc: %0.4f, std: %0.4f, SSS: acc: %0.4f, std: %0.4f, alpha: %s" %
               (scores_skf.mean(), scores_skf.std(), scores_sss.mean(), scores_sss.std(), i))

1
SLF: acc: 0.7941, std: 0.0050, SSS: acc: 0.7899, std: 0.0048, alpha: 1.0
SLF: acc: 0.8034, std: 0.0053, SSS: acc: 0.7981, std: 0.0049, alpha: 2.0


In [None]:
#SVM
#SLF: acc: 0.8114, std: 0.0073, SSS: acc: 0.8070, std: 0.0038, alpha: 1.2
#SLF: acc: 0.8114, std: 0.0040, SSS: acc: 0.8073, std: 0.0033, alpha: 0.8
#SLF: acc: 0.8120, std: 0.0078, SSS: acc: 0.8073, std: 0.0037, alpha: 0.7
#SLF: acc: 0.8082, std: 0.0062, SSS: acc: 0.8079, std: 0.0037, alpha: 0.9 ===> 87,54527%


#LOG
#SLF: acc: 0.8012, std: 0.0044, SSS: acc: 0.8020, std: 0.0013, alpha: 60 ===> 86,8367%

#best
#SVM Accuracy: 0.7701 (+/- 0.0734), alpha: 1.3  ===> 88,57660% 'a','you','he','him','to','in','on' |auto|(1,2)|
#SVM Accuracy: 0.7708 (+/- 0.0737), alpha: 1.9  ===> 88,43489%
#SVM Accuracy: 0.7717 (+/- 0.0737), alpha: 2.4  ===> 88,39553
#SVM Accuracy: 0.7709 (+/- 0.0764), alpha: 2.4  ===> 88,35616%
#SVM Accuracy: 0.7663 (+/- 0.0764), alpha: 0.8  ===> 88,32467 %

#best log Accuracy: 0.7683 (+/- 0.0798), alpha: 67

In [14]:
clf.C = 2.0
l = []
for i in range(5):

    scores=[]
    confusion_mat = np.zeros([6,6],dtype=int)
    for cv in range(5):
        x_train, x_val, y_train, y_val = train_test_split(X,Y, test_size=0.2)
        
        clf.fit(x_train,y_train)
        
        scores.append(clf.score(x_val,y_val))
        confusion_mat+=confusion_matrix(clf.predict(x_val),y_val)
    l.append(np.mean(scores))
    print ("Accuracy: %0.4f (+/- %0.4f), alpha: %s" % (np.mean(scores), np.std(scores) * 2, i))
    
print clf.classes_
print confusion_mat
print ("Accuracy: %0.4f (+/- %0.4f)" % (np.mean(l), np.std(scores) * 2))

Accuracy: 0.7995 (+/- 0.0054), alpha: 0
Accuracy: 0.7961 (+/- 0.0066), alpha: 1
Accuracy: 0.7958 (+/- 0.0057), alpha: 2
Accuracy: 0.7959 (+/- 0.0108), alpha: 3
Accuracy: 0.7964 (+/- 0.0093), alpha: 4
['austen' 'doyle' 'poe' 'shakespeare' 'twain' 'wilde']
[[8968  589   43  145  368  752]
 [ 377 3991  130  109  453  461]
 [   0    0   18    0    0    0]
 [  25   16    5 1753   60   18]
 [ 185  423   35  131 5029  315]
 [ 458  416   20   61  252 3119]]
Accuracy: 0.7968 (+/- 0.0093)



# Kaggle submit

In [80]:
clf.fit(X,Y)

LinearSVC(C=0.7, class_weight='auto', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty=u'l2', random_state=None, tol=0.0001,
     verbose=0)

In [81]:
y_pred = clf.predict(X_test)

In [82]:
submit = pd.DataFrame(index=None)
submit['Id']=pd_test['Id']
submit['Pred']=y_pred

In [83]:
submit.to_csv(folder+'/results/0.arda1.csv',sep=';',index=None)

In [84]:
y_pred

array(['wilde', 'doyle', 'austen', ..., 'doyle', 'twain', 'wilde'], dtype=object)