In [2]:
import os
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedKFold

def make_Corpus(root_dir):
    polarity_dirs = [os.path.join(root_dir,f) for f in os.listdir(root_dir)]    
    corpus = []    
    for polarity_dir in polarity_dirs:
        reviews = [os.path.join(polarity_dir,f) for f in os.listdir(polarity_dir)]
        for review in reviews:
            doc_string = "";
            with open(review) as rev:
                for line in rev:
                    doc_string = doc_string + line
            if not corpus:
                corpus = [doc_string]
            else:
                corpus.append(doc_string)
    return corpus

#Create a corpus with each document having one string
root_dir = 'txt_sentoken'
corpus = make_Corpus(root_dir)

#import pdb;pdb.set_trace()
#Stratified 10-cross fold validation with SVM and Multinomial NB 
labels = np.zeros(2000);
labels[0:1000]=0;
labels[1000:2000]=1; 
      
kf = StratifiedKFold(n_splits=10)

totalsvm = 0           # Accuracy measure on 2000 files
totalNB = 0
totalMatSvm = np.zeros((2,2));  # Confusion matrix on 2000 files
totalMatNB = np.zeros((2,2));

for train_index, test_index in kf.split(corpus,labels):
    X_train = [corpus[i] for i in train_index]
    X_test = [corpus[i] for i in test_index]
    y_train, y_test = labels[train_index], labels[test_index]
    vectorizer = TfidfVectorizer(min_df=5, max_df = 0.8, sublinear_tf=True, use_idf=True,stop_words='english')
    train_corpus_tf_idf = vectorizer.fit_transform(X_train) 
    test_corpus_tf_idf = vectorizer.transform(X_test)
    
    model1 = LinearSVC()
    model2 = MultinomialNB()    
    model1.fit(train_corpus_tf_idf,y_train)
    model2.fit(train_corpus_tf_idf,y_train)
    result1 = model1.predict(test_corpus_tf_idf)
    result2 = model2.predict(test_corpus_tf_idf)
    print(test_corpus_tf_idf)
    totalMatSvm = totalMatSvm + confusion_matrix(y_test, result1)
    totalMatNB = totalMatNB + confusion_matrix(y_test, result2)
    totalsvm = totalsvm + sum(y_test==result1)
    totalNB = totalNB + sum(y_test==result2)
    test_corpus = vectorizer.transform(["I do not liked the service.", "What is the point of starting this when you can not deliver items on time", "Is it really working in India ?"])
    result_test = model1.predict(test_corpus)
    print(result_test)
    
print(totalMatSvm, totalsvm/2000.0, totalMatNB, totalNB/2000.0)

  (0, 12152)	0.0340177863279
  (0, 12123)	0.0618570754465
  (0, 12108)	0.0780790155723
  (0, 12086)	0.0580331150829
  (0, 12036)	0.0727873059014
  (0, 11928)	0.077138027366
  (0, 11914)	0.0661527114945
  (0, 11874)	0.054308125789
  (0, 11871)	0.0543129398537
  (0, 11855)	0.0378256204361
  (0, 11816)	0.0385684201465
  (0, 11728)	0.0825442877545
  (0, 11678)	0.0486570408961
  (0, 11629)	0.0351048859616
  (0, 11507)	0.0977682797871
  (0, 11415)	0.0714655245702
  (0, 11345)	0.0746154813875
  (0, 11322)	0.0661527114945
  (0, 11304)	0.0387598103862
  (0, 11131)	0.0714655245702
  (0, 11101)	0.0559668076353
  (0, 11099)	0.0861703527624
  (0, 11011)	0.075411632912
  (0, 10976)	0.0585261928044
  (0, 10916)	0.0705431808727
  :	:
  (199, 1052)	0.0631975653378
  (199, 1047)	0.0334218097429
  (199, 1045)	0.0436105147437
  (199, 1034)	0.0631975653378
  (199, 1031)	0.0323541410765
  (199, 1030)	0.0360720522044
  (199, 896)	0.0379493764141
  (199, 852)	0.0637869100961
  (199, 811)	0.0752597105667
  (19

  (0, 12098)	0.0569820517869
  (0, 12068)	0.0585620965952
  (0, 11963)	0.0730441375334
  (0, 11848)	0.0944071540872
  (0, 11612)	0.0449578452763
  (0, 11560)	0.120015210004
  (0, 11486)	0.100573437234
  (0, 11437)	0.0551862668861
  (0, 11329)	0.0676528679503
  (0, 11166)	0.0814601516642
  (0, 11150)	0.0830193846393
  (0, 10830)	0.0904897281961
  (0, 10785)	0.0666838379664
  (0, 10682)	0.198235350837
  (0, 10596)	0.115618098608
  (0, 10498)	0.0824858380524
  (0, 10443)	0.109589858136
  (0, 10436)	0.183837395722
  (0, 10413)	0.105407475001
  (0, 10329)	0.0740480838072
  (0, 10118)	0.111725802853
  (0, 9859)	0.0732079560833
  (0, 9636)	0.114256177809
  (0, 9448)	0.0385078021641
  (0, 9266)	0.096433549344
  :	:
  (199, 1986)	0.0484619802911
  (199, 1776)	0.0238933964083
  (199, 1737)	0.0402893749515
  (199, 1698)	0.0928060773537
  (199, 1651)	0.06792369732
  (199, 1619)	0.0422197614987
  (199, 1523)	0.0570294528692
  (199, 1515)	0.2153832636
  (199, 1111)	0.0620406558803
  (199, 1090)	0.02