In [34]:
import nltk
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GroupKFold, cross_val_score
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


In [2]:
# Both data files are tab-separated; paths are relative to the notebook's
# working directory.
train = pd.read_csv('train.tsv', sep='\t')
test = pd.read_csv('test.tsv', sep='\t')

In [3]:
# (rows, columns) of each frame, shown side by side as a tuple.
train.shape, test.shape

((156060, 4), (66292, 3))

In [4]:
# Preview the first rows of the training frame (columns and sample values).
train.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [5]:
# Preview the first rows of the test frame for comparison with the training frame.
test.head()

Unnamed: 0,PhraseId,SentenceId,Phrase
0,156061,8545,An intermittently pleasing but mostly routine ...
1,156062,8545,An intermittently pleasing but mostly routine ...
2,156063,8545,An
3,156064,8545,intermittently pleasing but mostly routine effort
4,156065,8545,intermittently pleasing but mostly routine


In [6]:
# Distinct sentiment labels present in the training data (order of first appearance).
train['Sentiment'].unique()

array([1, 2, 3, 4, 0], dtype=int64)

In [7]:

# Column dtypes and non-null counts -- a quick check for missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156060 entries, 0 to 156059
Data columns (total 4 columns):
PhraseId      156060 non-null int64
SentenceId    156060 non-null int64
Phrase        156060 non-null object
Sentiment     156060 non-null int64
dtypes: int64(3), object(1)
memory usage: 4.8+ MB


In [8]:
# Number of training phrases per sentiment label, most frequent first.
train['Sentiment'].value_counts()

2    79582
3    32927
1    27273
4     9206
0     7072
Name: Sentiment, dtype: int64

In [9]:
# Share of each sentiment label: per-label counts divided by the number of
# non-null labels. Shows how imbalanced the classes are.
train['Sentiment'].value_counts().div(train['Sentiment'].count())

2    0.509945
3    0.210989
1    0.174760
4    0.058990
0    0.045316
Name: Sentiment, dtype: float64

In [10]:
# Model input is the phrase text; the target is its sentiment label.
X_train, y_train = train['Phrase'], train['Sentiment']

In [11]:
# Bag-of-words featurizer: turns a collection of text documents into a
# matrix of token counts (one column per vocabulary word).
count_vect = CountVectorizer() 

# fit_transform = fit followed by transform in one call: learn the vocabulary
# from the training phrases and return their document-term count matrix.
X_train_counts = count_vect.fit_transform(X_train)

In [12]:
# Shape of the count matrix: one row per training phrase, one column
# (feature) per vocabulary word.
X_train_counts.shape

(156060, 15240)

In [13]:
# Get all words in the learned vocabulary, ordered by feature (column) index.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method and fall back to the old one
# so the cell runs on either side of that change.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = count_vect.get_feature_names_out().tolist()
else:
    vocab = count_vect.get_feature_names()

# Show the vocabulary size and a small sample rather than dumping every word
# into the notebook output.
print (len(vocab), vocab[:50])



In [14]:
# Look up the feature (column) index assigned to a given token;
# .get returns None when the token is not in the vocabulary.
count_vect.vocabulary_.get('100')

2

In [18]:
# Total occurrences of each vocabulary word over all training phrases.
# Summing the sparse matrix along axis 0 yields a single row of per-column
# totals; asarray + ravel flattens it to a plain 1-D array.
dist = np.asarray(X_train_counts.sum(axis=0)).ravel()
print (dist) # array

# Pair every word with its total and order by count, highest first;
# words with equal counts stay in alphabetical order.
zipped = sorted(zip(vocab, dist), key=lambda t: (-t[1], t[0]))

# Only list the most frequent words: printing one line per vocabulary entry
# would flood the notebook output. `zipped` still holds the full ranking.
TOP_N_WORDS = 100

# For each of the top words, print its count in the training set and the word
for tag, count in zipped[:TOP_N_WORDS]:
    print (count, tag)

[ 23 179  70 ...,  15   9   5]
51633 the
32702 of
32177 and
22761 to
13997 in
13476 is
12338 that
11734 it
8651 as
7750 with
7553 for
7051 its
6733 film
6502 an
6241 movie
5677 this
5126 but
5053 be
4893 on
4855 you
3990 by
3895 more
3827 his
3784 one
3682 about
3668 not
3593 at
3511 or
3495 than
3477 from
3401 all
3190 like
3134 have
3067 are
2969 has
2866 so
2785 out
2539 story
2438 rrb
2423 up
2373 who
2261 good
2243 too
2227 most
2172 into
2098 lrb
1999 if
1926 what
1919 time
1899 their
1888 no
1882 characters
1872 much
1813 comedy
1781 your
1769 can
1728 just
1718 life
1713 some
1674 does
1624 even
1605 little
1583 funny
1567 will
1535 well
1529 way
1471 very
1461 been
1456 any
1414 make
1393 only
1360 which
1358 he
1345 movies
1338 director
1324 love
1313 do
1305 bad
1305 new
1284 there
1272 work
1253 enough
1246 her
1235 was
1218 us
1217 own
1211 they
1198 made
1180 old
1170 action
1168 two
1160 other
1160 something
1151 would
1123 best
1115 through
1114 never
1113 we
1094 many


In [20]:
# Term-frequency features only: use_idf=False switches off the
# inverse-document-frequency reweighting. The transformer is fitted on the raw
# counts and applied to them in one step; the fitted object is kept for reuse.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)

In [21]:
# The transform rescales values only, so the shape should match X_train_counts.
X_train_tf.shape

(156060, 15240)

In [27]:
# Inspect the term-frequency row of the second training phrase; the sparse
# print-out lists only the non-zero entries as (row, column) value.
print(X_train_tf[1:2])

  (0, 5837)	0.258198889747
  (0, 5323)	0.258198889747
  (0, 5821)	0.258198889747
  (0, 7217)	0.258198889747
  (0, 14871)	0.258198889747
  (0, 13503)	0.258198889747
  (0, 288)	0.258198889747
  (0, 13505)	0.516397779494
  (0, 3490)	0.258198889747
  (0, 4577)	0.258198889747
  (0, 9227)	0.258198889747
  (0, 11837)	0.258198889747


In [28]:
# Text-classification pipeline: token counts (English stop words removed,
# vocabulary capped at 5000 features) -> TF-IDF weighting -> logistic
# regression.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english', max_features=5000)),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression())
])

# Held-out estimate of accuracy. Scoring the model on the same rows it was
# fitted on overstates how well it generalises, so cross-validate first.
# Rows sharing a SentenceId are phrases taken from the same sentence (see
# train.head()), so folds are split by SentenceId to keep near-duplicate text
# out of the validation fold. cross_val_score fits clones, leaving text_clf
# itself unfitted until the explicit fit below.
cv_accuracy = cross_val_score(
    text_clf, X_train, y_train,
    groups=train['SentenceId'],
    cv=GroupKFold(n_splits=5),
    scoring='accuracy',
)
print('cross-validated accuracy: %.4f (+/- %.4f)'
      % (cv_accuracy.mean(), cv_accuracy.std()))

# Final model, fitted on the full training set.
text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_train)

# Training-set accuracy: an optimistic figure, to be read alongside the
# cross-validated accuracy printed above.
np.mean(predicted == y_train)

0.64022811739074714