### Day 1
Introduction, starters

In [14]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, ShuffleSplit
from sklearn.linear_model import LogisticRegression

In [15]:
def load_imdb(path, shuffle=True, random_state=42):
    import glob 
    print("Loading the imdb data")
    
    train_neg_files = glob.glob(path+"/train/neg/*.txt")
    train_pos_files = glob.glob(path+"/train/pos/*.txt")
    
    X_train_corpus = []
    y_train = []
    
    for tnf in train_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(0)
        f.close()
    
    for tpf in train_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        line = f.read()
        #line = line[:len(line)/2]
        X_train_corpus.append(line)
        y_train.append(1)
        f.close()
    
    print("Train Data loaded.")
    
    test_neg_files = glob.glob(path+"/test/neg/*.txt")
    test_pos_files = glob.glob(path+"/test/pos/*.txt")
    
    X_test_corpus = []
    y_test = []
    
    for tnf in test_neg_files:
        f = open(tnf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(0)
        f.close()
    
    for tpf in test_pos_files:
        f = open(tpf, 'r', encoding="utf8")
        X_test_corpus.append(f.read())
        y_test.append(1)
        f.close()
    
    print("Test Data loaded.")
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    
    if shuffle:
        np.random.seed(random_state)
        indices = np.random.permutation(len(y_train))       
        
        #X_train = X_train.tocsr()
        #X_train_corpus = X_train_corpus[indices]
        X_train_corpus = [X_train_corpus[i] for i in indices]
        y_train = y_train[indices]
        #train_corpus_shuffled = [train_corpus[i] for i in indices]
        
        indices = np.random.permutation(len(y_test))
        
        #X_test = X_test.tocsr()
        #X_test_corpus = X_test_corpus[indices]
        X_test_corpus = [X_test_corpus[i] for i in indices]
        y_test = y_test[indices]
        #test_corpus_shuffled = [test_corpus[i] for i in indices]
    #else:
        #train_corpus_shuffled = train_corpus
        #test_corpus_shuffled = test_corpus
    
    return X_train_corpus, y_train, X_test_corpus , y_test

### Load the imdb data

In [16]:
path = r"/Users/ekremguzelyel/Desktop/Assignments/Research/aclImdb"
X_train_corpus , y_train, X_test_corpus , y_test = load_imdb(path)

Loading the imdb data
Train Data loaded.
Test Data loaded.


### Vectorization

In [141]:
token = r"(?u)\b[\w\'/]+\b"
# Use corpus here. Dk y.
#mind
vectorizer = CountVectorizer(token_pattern=token, min_df=5, stop_words=["the","a","of","and","br","to"], binary=True)
X_train_vector = vectorizer.fit_transform(X_train_corpus)
X_test_vector = vectorizer.transform(X_test_corpus)

In [142]:
print(vectorizer.get_feature_names())



In [19]:
print("X_train_vector" + str(X_train_vector.toarray())) 
print("X_test_vector" + str(X_test_vector.toarray()))

X_train_vector[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
X_test_vector[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [20]:
## Solve
X_train_vector.shape

(25000, 28237)

In [21]:
vectorizer.vocabulary_

{'this': 25300,
 'movie': 16625,
 'is': 13412,
 'another': 1313,
 'christian': 4534,
 'propaganda': 19628,
 'film': 9565,
 'in': 12712,
 'line': 14783,
 'omega': 17604,
 'code': 4905,
 'not': 17276,
 'that': 25198,
 'necessarily': 16964,
 'bad': 2059,
 'but': 3636,
 'for': 9927,
 'fact': 9162,
 'most': 16560,
 'films': 9578,
 'sacrifice': 21615,
 'sincerity': 22847,
 'realism': 20266,
 'message': 16000,
 'they': 25263,
 'wish': 27774,
 'deliver': 6682,
 'if': 12486,
 'you': 28114,
 'enjoy': 8505,
 'styrofoam': 24294,
 'portrayal': 19127,
 'life': 14708,
 'on': 17620,
 'streets': 24152,
 'way': 27391,
 'gospel': 10891,
 'can': 3788,
 'change': 4240,
 'than': 25188,
 'perhaps': 18517,
 'may': 15735,
 'i': 12428,
 'say': 21843,
 'save': 21825,
 'your': 28126,
 'money': 16418,
 'rent': 20723,
 'cross': 6040,
 'switchblade': 24705,
 'or': 17711,
 'mission': 16281,
 'when': 27546,
 'will': 27681,
 'directors': 7174,
 'learn': 14506,
 'sometimes': 23368,
 'people': 18472,
 'words': 27882,
 'i

In [22]:
# TRY min_df and stop_words
def get_top_n_words(vector, vectorizer, n=None):
    """
    ## Taken from medium.com ##
    
    List the top n words in a vocabulary according to occurrence in a text corpus.
    
    get_top_n_words(["I love Python", "Python is a language programming", "Hello world", "I love the world"]) -> 
    [('python', 2),
     ('world', 2),
     ('love', 2),
     ('hello', 1),
     ('is', 1),
     ('programming', 1),
     ('the', 1),
     ('language', 1)]
    """
    sum_words = vector.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [23]:
get_top_n_words(X_train_vector, vectorizer, n=30)

[('is', 107318),
 ('in', 93970),
 ('it', 79159),
 ('i', 77247),
 ('this', 76004),
 ('that', 69826),
 ('was', 48187),
 ('as', 46935),
 ('for', 44344),
 ('with', 44129),
 ('movie', 43599),
 ('but', 42623),
 ('film', 39105),
 ('on', 34201),
 ('not', 30628),
 ('you', 29922),
 ('are', 29432),
 ('his', 29338),
 ('have', 27726),
 ('be', 26953),
 ('he', 26932),
 ('one', 26540),
 ('all', 23976),
 ('at', 23515),
 ('by', 22547),
 ('an', 21558),
 ('they', 21149),
 ('who', 20614),
 ('so', 20610),
 ('from', 20498)]

### Start training here 

In [143]:
lr = LogisticRegression()
lr.fit(X_train_vector, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [144]:
predictions = lr.predict(X_test_vector)

In [145]:
score = lr.score(X_test_vector, y_test)
print(score)

0.87316


In [146]:
score = lr.score(X_train_vector, y_train)
print(score)

0.99764


One by one

In [147]:
coefs = lr.coef_[0]
features = vectorizer.get_feature_names()

In [148]:
cofe = list(zip(features, coefs))

In [149]:
coefs.shape

(28237,)

In [150]:
cofe

[('0', -0.25940710348971946),
 ('0/10', -0.37902864011771453),
 ('00', -0.22918182347024274),
 ('000', -0.025315646020826633),
 ('007', -0.02572147226775229),
 ('00s', 0.13845964843046668),
 ('01', 0.09141891545368205),
 ('02', 0.035947331110027334),
 ('05', -0.06063849627372752),
 ('06', -0.3016090126174864),
 ('07', 0.03435321538862918),
 ('1', -0.8165581796163739),
 ('1/10', -1.773028382020648),
 ('1/2', -0.3915400746521696),
 ('1/3', -0.059256239543238054),
 ('1/4', 0.25937058291956194),
 ('1/5', -0.14549143115835367),
 ('10', -0.05739449259219131),
 ("10's", -0.1556710727882532),
 ('10/10', 1.1760005631191193),
 ('100', 0.12379929839157552),
 ('1000', 0.06048340096189787),
 ('100th', -0.1434854411479602),
 ('101', -0.05921181361176502),
 ('102', -0.03351815935991173),
 ('103', 0.1150585591556327),
 ('104', 0.08811631575893152),
 ('105', 0.0041668782622802),
 ('107', 0.006054593261679689),
 ('108', -0.21176165942533337),
 ('10s', 0.28534949032606877),
 ('10th', 0.033984883677308245

In [151]:
def get_top_n_coefs(vector, n=None):
    words_freq =sorted(cofe, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [152]:
get_top_n_coefs(cofe[::-1], 30)

[('7/10', 3.177890766667874),
 ('8/10', 1.893127889566014),
 ('refreshing', 1.5762645208255615),
 ('appreciated', 1.5270690687232276),
 ('excellent', 1.5045416011211958),
 ('7', 1.4659954500589691),
 ('hooked', 1.434411192007017),
 ('perfect', 1.3946349369805615),
 ('superb', 1.3491485762585986),
 ('wonderfully', 1.3102587357216458),
 ('9/10', 1.3012163764875047),
 ('rare', 1.2984954978464902),
 ('surprisingly', 1.2941153694741825),
 ('incredible', 1.257568198027642),
 ('underrated', 1.2502058863192242),
 ('funniest', 1.2278315816761551),
 ('vengeance', 1.2245508988519707),
 ('8', 1.2161431782465648),
 ('enjoyable', 1.213652095591985),
 ('perfectly', 1.2066210578028054),
 ('batman', 1.1847080380839576),
 ('10/10', 1.1760005631191193),
 ('noir', 1.174650573915332),
 ('delightful', 1.1728997673893242),
 ('scariest', 1.1673390887135189),
 ('captures', 1.1626183252858087),
 ('cerebral', 1.158598350097094),
 ('haunting', 1.087482998819802),
 ('carrey', 1.0818007354802648),
 ('subtitles', 1.

#### Merging All Coefficient Related Code


In [34]:
def get_top_coefficients_directly(classifier, vectorizer, n=None):
    ''' Finds top n coefficient and word tuple.
    Args:
        classifier, vectorizer: ..
        n: top n results
    Returns:
        A list of tuples with reverse ordered by coefficients.
        i.e [(word, coeff), (...)]
    '''
    coefficients = lr.coef_[0]
    features = vectorizer.get_feature_names()
    cofe = list(zip(features, coefficients))
    words_freq =sorted(cofe, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [35]:
get_top_coefficients_directly(lr, vectorizer, 30)

[('7/10', 2.802423412077471),
 ('8/10', 1.783732579769195),
 ('refreshing', 1.689246570417766),
 ('wonderfully', 1.4115029751058983),
 ('carrey', 1.401515525975687),
 ('appreciated', 1.3765233310743967),
 ('erotic', 1.3662819250637164),
 ('7', 1.3012294424243562),
 ('funniest', 1.2964855425560264),
 ('excellent', 1.2911528867756954),
 ('perfect', 1.2846122685917867),
 ('rare', 1.232429471384522),
 ('hooked', 1.2309365275456312),
 ('superb', 1.2277105781002402),
 ('surprisingly', 1.2134336326501718),
 ('vengeance', 1.2067243607309974),
 ('units', 1.164594109671708),
 ('9/10', 1.1539714197187676),
 ('8', 1.1519451427453966),
 ('underrated', 1.1462310593884077),
 ('delightful', 1.1405546918611815),
 ('flight', 1.1258034467340972),
 ('shannon', 1.1139693625047833),
 ('tears', 1.111043962039303),
 ('enjoyable', 1.1098266647737265),
 ('highly', 1.1054953138303332),
 ('flawless', 1.1036508634535935),
 ('squirrel', 1.0885197928630896),
 ('kitty', 1.0881214392001128),
 ('whoopi', 1.086995848292

-----------------

### Day 2
predict_proba()
- Returns a sparse array of probabilities (for both positive and negative) of each prediction.

In [153]:
probs = lr.predict_proba(X_train_vector)

In [154]:
probs

array([[8.96644309e-01, 1.03355691e-01],
       [1.46660451e-02, 9.85333955e-01],
       [9.68648826e-01, 3.13511737e-02],
       ...,
       [9.93014402e-01, 6.98559817e-03],
       [9.31645797e-04, 9.99068354e-01],
       [1.00729358e-09, 9.99999999e-01]])

In [155]:
positive_probs = [x[1] for x in probs] #list(zip(*probs))[1] # [x[1] for x in probs]
positive_probs

[0.10335569111386773,
 0.9853339548886646,
 0.03135117373549731,
 0.999993078722276,
 0.8241026220476313,
 0.01594830815189524,
 0.009359216888941873,
 0.9981955577447741,
 0.011624363227311252,
 0.003272386234744794,
 0.999533676436244,
 0.02486375054418936,
 0.2752502605929267,
 0.9338285668461157,
 0.07889056092208019,
 0.9859292399864328,
 0.9979461614940986,
 0.9998010945672317,
 0.005225379271702367,
 0.004311239875887671,
 9.53812715737762e-09,
 0.0016483994799257884,
 3.818065763539063e-05,
 0.15462373691966438,
 0.04231056695317018,
 0.9996851953883542,
 0.12465406687596237,
 1.2168533542585367e-08,
 0.9817672038834043,
 0.13109915170290587,
 0.9960328182405087,
 0.9960738707975745,
 0.9997436935119749,
 2.519011379175371e-07,
 0.01901607050006191,
 0.9301237953887918,
 2.0263730035987437e-05,
 0.0004314175821566903,
 0.9989701364669298,
 0.9996081272839827,
 0.08033835037481393,
 0.03276255249107243,
 0.9854349980099298,
 0.9982237816124321,
 0.004179767709907771,
 0.02523146

In [156]:
abs(np.array(positive_probs)-0.5)

array([0.39664431, 0.48533395, 0.46864883, ..., 0.4930144 , 0.49906835,
       0.5       ])

In [157]:
word_prob = list(zip(X_train_vector.toarray(), positive_probs))

In [158]:
prob_sorted = sorted(word_prob, key = lambda x: x[1], reverse=True)

In [107]:
[vectorizer.get_feature_names()[x] for x in prob_sorted[0]]

TypeError: list indices must be integers or slices, not tuple

In [43]:
len(prob_sorted)

25000

In [44]:
type(X_train_corpus)

list

#### Most Confident Documents
Directly

In [159]:
def most_n_confident_docs(classifier, vector, corpus, n=None):
    '''Finds top n confident documents.
    Args:
        classifier: ..
        vector: preprocessed/vectorized sparse matrix of the data
        corpus: an array that contains corpus (all documents in one)
        n: number of documents to be returned
    '''
    probs = lr.predict_proba(vector)
    positive_probs = list(zip(*probs))[1]
    word_prob = list(zip(corpus, positive_probs))
    prob_sorted = sorted(word_prob, key = lambda x: x[1], reverse=True)
    return prob_sorted[:n]

In [160]:
most_n_confident_docs(lr, X_train_vector, X_train_corpus, 5)

[('By now you\'ve probably heard a bit about the new Disney dub of Miyazaki\'s classic film, Laputa: Castle In The Sky. During late summer of 1998, Disney released "Kiki\'s Delivery Service" on video which included a preview of the Laputa dub saying it was due out in "1999". It\'s obviously way past that year now, but the dub has been finally completed. And it\'s not "Laputa: Castle In The Sky", just "Castle In The Sky" for the dub, since Laputa is not such a nice word in Spanish (even though they use the word Laputa many times throughout the dub). You\'ve also probably heard that world renowned composer, Joe Hisaishi, who scored the movie originally, went back to rescore the excellent music with new arrangements. Laputa came out before My Neighbor Totoro and after Nausicaa of the Valley of the Wind, which began Studio Ghibli and it\'s long string of hits. And in my opinion, I think it\'s one of Miyazaki\'s best films with a powerful lesson tuckered inside this two hour and four minute

## Day 3

#### Most Uncertain Documents

In [161]:
def most_n_uncertain_docs(classifier, vector, corpus, n=None):
    '''Finds top n confident documents.
    Args:
        classifier: ..
        vector: preprocessed/vectorized sparse matrix of the data
        corpus: an array that contains corpus (all documents in one)
        n: number of documents to be returned
    '''
    probs = lr.predict_proba(vector)
    positive_probs = [x[1] for x in probs]
    word_prob = list(zip(corpus, abs(np.array(positive_probs)-0.5)))
    prob_sorted = sorted(word_prob, key = lambda x: x[1], reverse=False)
    return prob_sorted[:n]

##### Reasons why it's uncertain:
1- Some comments are mixed bc of quotation errors.

2- Sarcasm, or positive and negative words are used at the same time. (terrific, funny, long, smile, boring)

    3- They are mostly about telling the events rather than rating the movie.

In [162]:
most_n_uncertain_docs(lr, X_train_vector, X_train_corpus, 5)

[('Those engaging the movie camera so early in the century must have figured out some of its potential very early on. This is a good story of a playboy type who needs money and inadvertently sells his soul to Satan for a lot of money. Unfortunately, the soul is his double and he must confront him frequently, tearing his life apart. There are some wonderful scenes with people fading out and, of course, the scenes when the two are on the stage at the same time. The middle part is a bit dull, but the Faustian story is always in the minds of the viewer. One thing I have to mention is the general unattractiveness of the people in the movie. Also, they pretty much shied away from much action which would have at least given some life to the thing. I first was made aware of this movie about 25 years ago and have finally been able to see it. I was not disappointed.',
  0.0024264958418331872),
 ('I don\'t know where to start; the acting, the special effects and the writing are all about as bad a

In [163]:
lr.decision_function(X_train_vector)

array([-2.1604829 ,  4.20744566, -3.43065043, ..., -4.95689454,
        6.97762578, 20.71599882])

### NOTES TO MYSELF

In [164]:
a=[2,3,-4]
np.array(a)
#abs(np.array(a))

array([ 2,  3, -4])

In [165]:
from sklearn.naive_bayes import MultinomialNB

In [166]:
# multinomial naive base
# decision function
MultinomialNB.predict

<function sklearn.naive_bayes.BaseNB.predict>

In [167]:
# get the most used positive and negative words out of documents.

In [168]:
X_train_corpus[:5]
X_train_arr = X_train_vector.toarray()

In [170]:
X_train_arr[1]
vectorizer.get_feature_names()

['0',
 '0/10',
 '00',
 '000',
 '007',
 '00s',
 '01',
 '02',
 '05',
 '06',
 '07',
 '1',
 '1/10',
 '1/2',
 '1/3',
 '1/4',
 '1/5',
 '10',
 "10's",
 '10/10',
 '100',
 '1000',
 '100th',
 '101',
 '102',
 '103',
 '104',
 '105',
 '107',
 '108',
 '10s',
 '10th',
 '11',
 '110',
 '112',
 '116',
 '117',
 '11th',
 '12',
 '120',
 '12th',
 '13',
 '135',
 '13th',
 '14',
 '140',
 '14th',
 '15',
 '150',
 '15th',
 '16',
 '160',
 '16mm',
 '16s',
 '16th',
 '17',
 '17th',
 '18',
 '180',
 "1800's",
 '1800s',
 '1840',
 '1876',
 "1890's",
 '1890s',
 '1895',
 '1898',
 '18th',
 '19',
 '1900',
 "1900's",
 '1900s',
 '1902',
 '1909',
 '1912',
 '1913',
 '1914',
 '1915',
 '1916',
 '1917',
 '1918',
 '1919',
 '1920',
 "1920's",
 '1920s',
 '1921',
 '1922',
 '1924',
 '1925',
 '1926',
 '1927',
 '1928',
 '1929',
 '1930',
 "1930's",
 '1930s',
 '1931',
 '1932',
 '1933',
 '1934',
 '1935',
 '1936',
 '1937',
 '1938',
 '1939',
 '1940',
 "1940's",
 '1940s',
 '1941',
 '1942',
 '1943',
 '1944',
 '1945',
 '1946',
 '1947',
 '1948',
 

In [189]:
vector=[0,1,1,0,1,0]
names=[2,3,4,1,2,3]
#names[lst]
[names[i] for i in range(len(names)) if vector[i]]

[3, 4, 2]

In [172]:
X_train_vector.shape

(25000, 28237)

In [173]:
coefs

array([-0.2594071 , -0.37902864, -0.22918182, ..., -0.15328612,
        0.59275068, -0.04663372])

### Day 4
Found out that vectorizer has nonboolean values (frequencies of words). 
Changed to "binary=True".

In [175]:
X_train_vector.toarray()[10][1200:1500]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], d

In [201]:
coefficients = lr.coef_[0]
words = vectorizer.get_feature_names()
coef_words = list(zip(coefficients, words))

def most_effective_words(classifier, vectorizer, vector, doc_number):
    ''' Takes the document number and returns its most effective words.'''
    
    return sorted([coef_words[i] for i in range(len(coef_words)) if (X_train_vector[doc_number, i] != 0)], key = lambda x: x[0], reverse=True )

In [212]:
most_effective_words(lr, vectorizer, X_train_vector, idx)

[(1.5045416011211958, 'excellent'),
 (1.3946349369805615, 'perfect'),
 (1.2984954978464902, 'rare'),
 (1.2941153694741825, 'surprisingly'),
 (1.2278315816761551, 'funniest'),
 (1.2066210578028054, 'perfectly'),
 (1.1847080380839576, 'batman'),
 (1.0748013699055983, 'amazing'),
 (1.0582048581261814, 'highly'),
 (1.0367351726132388, 'wonderful'),
 (1.0237368542481282, 'enjoyed'),
 (0.9730062602923528, 'gem'),
 (0.9315178716754375, 'great'),
 (0.9044763708647656, 'packed'),
 (0.8749549389867519, 'fantastic'),
 (0.8543746101261548, 'flawless'),
 (0.8250827247832683, 'hilarious'),
 (0.8121395019426402, 'best'),
 (0.7974334072194627, 'beautifully'),
 (0.7877395431584499, 'job'),
 (0.7638684273117834, 'powerful'),
 (0.7542959004198639, 'episodes'),
 (0.7527568227847585, 'originally'),
 (0.7381035467116976, 'thanks'),
 (0.7162860617363035, 'release'),
 (0.710986241744064, 'ages'),
 (0.7063764099891865, 'ride'),
 (0.6973829465838759, 'glad'),
 (0.695527776815979, 'deeper'),
 (0.6948796037780672

In [203]:
X_train_corpus[10]

"I haven't seen much German comedy, but if this film is anything to go by, I'm compelled to see more! The simple but effective storyline takes two very different people on a trip from Germany to Italy after Eva, an unemployed mother of two, discovers that her artist husband is having an affair with the wife of a wealthy lawyer. I won't reveal anything further, but what results is a very funny series of events with the perfect conclusion. My interest in international cinema has expanded since I first saw this film. I recommend it to anyone (any adult... don't let the inclusion of the young children fool you into thinking it's a family film) who love comedy - even those unfamiliar with the language."

In [204]:
probs[10,:]

array([4.66323564e-04, 9.99533676e-01])

In [206]:
idx = np.argmax(probs[:,1])

In [208]:
probs[idx, :]

array([0., 1.])

In [211]:
np.argmin(np.sum(X_test_vector, axis=0)[0])

202

In [186]:
X_train_vector[10, :].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

------------
# CLOSING REMARKS

IMDB Movie Reviews is used to train a Logistic Regression model to predict if a review is positive or negative.

The model predicted movie reviews that contains more words with better accuracy than short reviews. The reason behind is, because, it uses weights of each word, and more word means better score.

Looking at the frequency of top 100 words, words like "and", "the" is ommitted from vectorizer. Which increased the accuracy. Later on in order to find the most effective words in a document, "binary=True" is added to vectorizer. Despite this change, accuracy wasn't affected, in fact, it increased slightly (0.87), comparing to older one (0.86).

__Ekrem Guzelyel // ML-Lab@IIT__