In [4]:
import sys
import numpy
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from pprint import pprint

In [5]:
# Load the IMDB movie-review dataset: one labelled review per line.
filename = 'data/imdb_labelled.txt'
# The `with` block closes the file handle as soon as the lines are read
# (the handle was previously left open for the lifetime of the kernel).
# 'rU' requests universal-newline translation on Python 2.
# NOTE(review): the 'U' flag is deprecated on Python 3 and rejected by
# Python 3.11+; use plain 'r' there (universal newlines are the default).
with open(filename, 'rU') as f:
    sentences = f.readlines()

In [6]:
# Sanity check: the IMDB file is expected to contain 1000 lines.
# Parenthesised single-argument print behaves identically on Python 2 and
# also runs on Python 3 (the bare print statement is a SyntaxError there).
print(len(sentences))

1000


In [7]:
# Peek at the first three raw lines.
# Line format: "<review>\t<sentiment label>\n"
# Labels: 0 = negative, 1 = positive
pprint(sentences[0:3])

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  \t0\n',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  \t0\n',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  \t0\n']


In [8]:
# Process the dataset: extract review texts and sentiment labels separately.
# Each raw line looks like "<review>\t<label>\n" with label 0 or 1.
process_sentences = []
sentiment = []
for longStr in sentences:
    # Strip the line terminator, then split on the LAST tab. Unlike the fixed
    # offsets [-2] / [:-3], this still parses a final line that has no
    # trailing newline, and it keeps any tab inside the review text intact.
    review, label = longStr.rstrip('\r\n').rsplit('\t', 1)
    sentiment.append(int(label))
    process_sentences.append(review)

In [9]:
# Peek at the first three extracted review texts (label and newline removed).
pprint(process_sentences[0:3])

['A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  ',
 'Not sure who was more lost - the flat characters or the audience, nearly half of whom walked out.  ',
 'Attempting artiness with black & white and clever camera angles, the movie disappointed - became even more ridiculous - as the acting was poor and the plot and lines almost non-existent.  ']


In [10]:
# Peek at the sentiment labels that belong to the first three reviews.
# Labels: 0 = negative, 1 = positive
pprint(sentiment[0:3])

[0, 0, 0]


In [11]:
# Start of training: turn every review into a bag of unigrams and bigrams.
#   ngram_range=(1, 2)       -> count single words and adjacent word pairs
#   token_pattern=r'\b\w+\b' -> treat every run of word characters as a token
#   stop_words='english'     -> drop scikit-learn's built-in English stop words
# NOTE(review): the vocabulary is fitted on ALL reviews, including the rows
# that are later held out as the test set.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
    stop_words='english'
)
# fit_transform returns a sparse count matrix; toarray() makes it a dense array.
vectors = bigram_vectorizer.fit_transform(process_sentences).toarray()

In [18]:
# CountVectorizer builds a vocabulary that maps each n-gram to a column index.
# The length of this vocabulary is 8472.
# Parenthesised single-argument print behaves identically on Python 2 and
# also runs on Python 3 (the bare print statement is a SyntaxError there).
print(len(bigram_vectorizer.vocabulary_))
# NOTE(review): this pretty-prints the ENTIRE mapping (thousands of entries);
# consider showing only a small sample to keep the notebook output readable.
pprint(bigram_vectorizer.vocabulary_)

8472
{u'0': 0,
 u'0 10': 1,
 u'1': 2,
 u'1 10': 3,
 u'1 8': 4,
 u'1 awful': 5,
 u'1 hour': 6,
 u'10': 7,
 u'10 1': 8,
 u'10 10': 9,
 u'10 feet': 10,
 u'10 grade': 11,
 u'10 movie': 12,
 u'10 oy': 13,
 u'10 plus': 14,
 u'10 scale': 15,
 u'10 score': 16,
 u'10 setting': 17,
 u'10 simply': 18,
 u'10 stars': 19,
 u'10 years': 20,
 u'12': 21,
 u'12 years': 22,
 u'13': 23,
 u'13 film': 24,
 u'13 rating': 25,
 u'15': 26,
 u'15 minutes': 27,
 u'15pm': 28,
 u'15pm fast': 29,
 u'17': 30,
 u'17 movie': 31,
 u'18th': 32,
 u'18th century': 33,
 u'1928': 34,
 u'1947': 35,
 u'1947 masterpiece': 36,
 u'1948': 37,
 u'1948 quite': 38,
 u'1949': 39,
 u'1949 hollywood': 40,
 u'1971': 41,
 u'1971 format': 42,
 u'1973': 43,
 u'1973 stranger': 44,
 u'1980': 45,
 u'1980 s': 46,
 u'1986': 47,
 u'1986 version': 48,
 u'1995': 49,
 u'1995 monster': 50,
 u'1998': 51,
 u'1998 deep': 52,
 u'2': 53,
 u'2 hours': 54,
 u'2 phantasm': 55,
 u'2 quite': 56,
 u'20': 57,
 u'20 30': 58,
 u'20 cover': 59,
 u'20 years': 60,
 u

 u'boring pointless': 866,
 u'boring self': 867,
 u'borrowed': 868,
 u'borrowed earlier': 869,
 u'boss': 870,
 u'boss truly': 871,
 u'bother': 872,
 u'bother movie': 873,
 u'bothersome': 874,
 u'bothersome man': 875,
 u'bought': 876,
 u'bought 8pm': 877,
 u'bought ebay': 878,
 u'bought turned': 879,
 u'box': 880,
 u'box directorial': 881,
 u'boyfriend': 882,
 u'boyfriend occupied': 883,
 u'boyle': 884,
 u'brain': 885,
 u'brain attempt': 886,
 u'brain eating': 887,
 u'brain wasn': 888,
 u'brainsucking': 889,
 u'brainsucking movies': 890,
 u'brat': 891,
 u'brat babysitting': 892,
 u'breaking': 893,
 u'breeders': 894,
 u'breeders terrible': 895,
 u'brevity': 896,
 u'brevity really': 897,
 u'brian': 898,
 u'brian donlevy': 899,
 u'brian keith': 900,
 u'brief': 901,
 u'brief candle': 902,
 u'brief memorable': 903,
 u'brigand': 904,
 u'brigand candace': 905,
 u'bright': 906,
 u'bright spot': 907,
 u'brilliance': 908,
 u'brilliance depicts': 909,
 u'brilliant': 910,
 u'brilliant actor': 911,


 u'facial hair': 2453,
 u'facing': 2454,
 u'facing south': 2455,
 u'fact': 2456,
 u'fact based': 2457,
 u'fact film': 2458,
 u'fact joe': 2459,
 u'fact liked': 2460,
 u'fact memorized': 2461,
 u'fact overall': 2462,
 u'fact s': 2463,
 u'fact stinker': 2464,
 u'factory': 2465,
 u'factory ready': 2466,
 u'failed': 2467,
 u'failed convey': 2468,
 u'fails': 2469,
 u'fails create': 2470,
 u'fails levels': 2471,
 u'fails miserably': 2472,
 u'fair': 2473,
 u'fair critic': 2474,
 u'fairly': 2475,
 u'fairly accurate': 2476,
 u'faithful': 2477,
 u'faithful adaptation': 2478,
 u'fall': 2479,
 u'fall head': 2480,
 u'fall trap': 2481,
 u'falling': 2482,
 u'falling love': 2483,
 u'falls': 2484,
 u'falls mountain': 2485,
 u'falsely': 2486,
 u'falsely accused': 2487,
 u'falwell': 2488,
 u'fame': 2489,
 u'famed': 2490,
 u'famed gaudi': 2491,
 u'family': 2492,
 u'family identify': 2493,
 u'family movie': 2494,
 u'family movies': 2495,
 u'family relationships': 2496,
 u'family s': 2497,
 u'famous': 2498,

 u'joy': 3846,
 u'joy behold': 3847,
 u'joy heart': 3848,
 u'joy partaking': 3849,
 u'joy watch': 3850,
 u'joyce': 3851,
 u'joyce s': 3852,
 u'juano': 3853,
 u'juano hernandez': 3854,
 u'judge': 3855,
 u'judge old': 3856,
 u'judging': 3857,
 u'judging fumbling': 3858,
 u'judith': 3859,
 u'judith light': 3860,
 u'judo': 3861,
 u'judo rolls': 3862,
 u'julian': 3863,
 u'julian fellowes': 3864,
 u'june': 3865,
 u'june 20': 3866,
 u'june allison': 3867,
 u'junk': 3868,
 u'junkyard': 3869,
 u'junkyard dogs': 3870,
 u'junkyard scenes': 3871,
 u'just': 3872,
 u'just 3': 3873,
 u'just adorable': 3874,
 u'just ask': 3875,
 u'just avoid': 3876,
 u'just awful': 3877,
 u'just bad': 3878,
 u'just big': 3879,
 u'just blew': 3880,
 u'just consider': 3881,
 u'just cross': 3882,
 u'just cult': 3883,
 u'just day': 3884,
 u'just didn': 3885,
 u'just don': 3886,
 u'just dropped': 3887,
 u'just dull': 3888,
 u'just everybody': 3889,
 u'just explain': 3890,
 u'just fails': 3891,
 u'just fine': 3892,
 u'just 

 u'omit': 5134,
 u'omit watching': 5135,
 u'ones': 5136,
 u'open': 5137,
 u'open amazing': 5138,
 u'opened': 5139,
 u'opened eyes': 5140,
 u'opened mouth': 5141,
 u'opening': 5142,
 u'opening sequence': 5143,
 u'operas': 5144,
 u'operas intelligent': 5145,
 u'opinion': 5146,
 u'ordeal': 5147,
 u'ordeal begin': 5148,
 u'oriented': 5149,
 u'oriented teenagers': 5150,
 u'original': 5151,
 u'original body': 5152,
 u'original remotely': 5153,
 u'originality': 5154,
 u'originality freshness': 5155,
 u'origins': 5156,
 u'origins puppets': 5157,
 u'ortolani': 5158,
 u'ortolani particularly': 5159,
 u'oscar': 5160,
 u'oscar material': 5161,
 u'oscar shortlist': 5162,
 u'ought': 5163,
 u'ought thrilled': 5164,
 u'outlandish': 5165,
 u'outlandish array': 5166,
 u'outlets': 5167,
 u'outside': 5168,
 u'outside south': 5169,
 u'outward': 5170,
 u'outward tranquillity': 5171,
 u'overacting': 5172,
 u'overacting gets': 5173,
 u'overacting underacting': 5174,
 u'overall': 5175,
 u'overall delight': 517

 u'self respecting': 6524,
 u'self sacrifice': 6525,
 u'sells': 6526,
 u'sells let': 6527,
 u'semi': 6528,
 u'semi truck': 6529,
 u'senior': 6530,
 u'senior actors': 6531,
 u'sense': 6532,
 u'sense pitiful': 6533,
 u'sense things': 6534,
 u'senses': 6535,
 u'senses assaulted': 6536,
 u'senses imagination': 6537,
 u'sensibility': 6538,
 u'sensitivities': 6539,
 u'sensitivities treatments': 6540,
 u'sentiment': 6541,
 u'sentiment actually': 6542,
 u'sentiment heart': 6543,
 u'seperate': 6544,
 u'seperate dreams': 6545,
 u'sequel': 6546,
 u'sequel changing': 6547,
 u'sequels': 6548,
 u'sequels think': 6549,
 u'sequence': 6550,
 u'sequence gem': 6551,
 u'sequence space': 6552,
 u'sequences': 6553,
 u'sequences just': 6554,
 u'series': 6555,
 u'series 10': 6556,
 u'series amazing': 6557,
 u'series anne': 6558,
 u'series baaaaaad': 6559,
 u'seriously': 6560,
 u'seriously just': 6561,
 u'seriously s': 6562,
 u'served': 6563,
 u'served perfect': 6564,
 u'set': 6565,
 u'set designer': 6566,
 u'

 u'understated film': 7816,
 u'understatement': 7817,
 u'understatement black': 7818,
 u'understood': 7819,
 u'understood mexican': 7820,
 u'undertone': 7821,
 u'undertone fifties': 7822,
 u'underwater': 7823,
 u'underwater shots': 7824,
 u'undoubtedly': 7825,
 u'undoubtedly did': 7826,
 u'undoubtedly film': 7827,
 u'uneasy': 7828,
 u'uneasy bad': 7829,
 u'unemployed': 7830,
 u'unemployed having': 7831,
 u'unethical': 7832,
 u'unethical joke': 7833,
 u'unfaithful': 7834,
 u'unfaithful kind': 7835,
 u'unfolds': 7836,
 u'unfolds 18th': 7837,
 u'unfolds gradually': 7838,
 u'unforgettable': 7839,
 u'unforgettable characters': 7840,
 u'unfortunate': 7841,
 u'unfortunate life': 7842,
 u'unfortunately': 7843,
 u'unfortunately bad': 7844,
 u'unfortunately cover': 7845,
 u'unfortunately inexperience': 7846,
 u'unfortunately virtue': 7847,
 u'unfunny': 7848,
 u'unfunny generic': 7849,
 u'unfunny movie': 7850,
 u'unintentionally': 7851,
 u'unintentionally comical': 7852,
 u'uninteresting': 7853,


In [40]:
# There are 1000 vectors, one per review line.
# The length of each vector equals the vocabulary size (8472).
# Parenthesised single-argument print behaves identically on Python 2 and
# also runs on Python 3 (the bare print statement is a SyntaxError there).
print(vectors.shape)

(1000, 8472)


In [45]:
# Peek at the first three count vectors.
# Only a few elements of each vector are non-zero, because a review contains
# far fewer distinct n-grams than the 8472 entries of the vocabulary.
pprint(vectors[0:3])

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)


In [48]:
# Rescale the count vectors with the L2 norm; the result is stored under a new
# name so the raw counts in `vectors` stay available.
vectors_normalized = preprocessing.normalize(vectors, norm='l2')

In [51]:
# Split vectors and labels into a training set and a test set.
# The first TRAIN_SIZE rows are used for training, the remaining rows for testing.
# NOTE(review): the split is purely positional (no shuffling) — confirm the
# file order is not biased toward one sentiment class.
TRAIN_SIZE = 600
train_feature_vectors = vectors_normalized[:TRAIN_SIZE, :]
test_feature_vectors = vectors_normalized[TRAIN_SIZE:, :]
train_sentiment = sentiment[:TRAIN_SIZE]
test_sentiment = sentiment[TRAIN_SIZE:]
# A single formatted argument reproduces the space-separated output of the
# Python 2 print statement exactly and is also valid on Python 3.
print('%s %s %s' % ('train and test set shapes',
                    train_feature_vectors.shape,
                    test_feature_vectors.shape))

train and test set shapes (600, 8472) (400, 8472)


In [53]:
# Train a linear support-vector classifier on the training portion only.
# NOTE(review): no random_state is passed — consider setting one if
# run-to-run reproducibility matters (confirm against the LinearSVC docs).
classifier = LinearSVC(C=1)
classifier.fit(train_feature_vectors, train_sentiment)
# Parenthesised single-argument print behaves identically on Python 2 and
# also runs on Python 3 (the bare print statement is a SyntaxError there).
print('i have trained my classifier to perform sentiment analysis')

i have trained my classifier to perform sentiment analysis


In [55]:
# Predict labels for the held-out reviews and compare them with the true labels.
predicted_sentiment = classifier.predict(test_feature_vectors)
acc = accuracy_score(test_sentiment, predicted_sentiment)
# '%s %s' joins the two values with one space, exactly as the Python 2 print
# statement did, so the printed text is unchanged while also running on Python 3.
print('%s %s' % ('i have a test set accuracy of: ', acc))

i have a test set accuracy of:  0.73


In [64]:
# We now have a trained model (73% accuracy on the test set above).
# Use it to classify a new, unseen sentence: encode the sentence with the
# already-fitted vectorizer, then apply the same L2 normalization that was
# applied to the training vectors.
my_test_sentence = "I love this movie! It's sweet but with satirical humor"
my_test_vector = preprocessing.normalize(
    bigram_vectorizer.transform([my_test_sentence]),
    norm='l2'
)

In [65]:
# The test sentence above expresses a positive opinion.
# The model predicts class 1 for it, which is correct.
# Labels: 0 = negative, 1 = positive
# The bare expression is the cell's last line, so the predicted label array
# is displayed as the cell output.
classifier.predict(my_test_vector)

array([1])