In [19]:
import json
import random
# import sklearn
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [2]:
with open('Data/imdb_train.json') as f:
    data = json.load(f)
random.shuffle(data)
print('class label:', data[0]['class'])
print('text:', data[0]['text'])

class label: pos
text: Let me start by saying at the young age of 34 I was suddenly widowed. I was devastated as he was NOT sick--- he died unexpectedly basically of a coronary--- his carotids blew out-- he died behind our house. There was a lot of speculation from police, cause he fell on something and it bashed his head in. I was a suspect for murder until the autopsy came back.   My children were as traumatized as I was, so in love with a good father figure as he. I had three small children, no education, no financial support. I took it very, very hard.   Within two years my in-laws attacked me verbally, physically, emotionally and spiritually demanding I grieve not in front of the children, and put on masks and showed people what they wanted to see, not show them my pain during holidays... Nobody stood up for me and my choice to sit out one holiday, except of course, the grief therapist I was seeing that had advised me to follow my heart and soul. My in-laws didn't get it! It chang

In [3]:
# We need to gather the texts and labels into separate lists
texts = [one_example['text'] for one_example in data]
labels = [one_example['class'] for one_example in data]
print('This many texts', len(texts))
print('This many labels', len(labels))
print()
for label, text in list(zip(labels, texts))[:20]:
    print(label, text[:50]+'...')

This many texts 25000
This many labels 25000

pos Let me start by saying at the young age of 34 I wa...
pos I saw this film when it was originally released in...
neg This early role for Barbara Shelley(in fact,her fi...
pos A DOUBLE LIFE has developed a mystique among film ...
neg Grand Central Murder (1942) Dir: S. Sylvan Simon  ...
pos Anna (Ursula Andress) is brought in as an official...
neg Jean Rollin artistic nonsense about vampires, alie...
neg The Egyptian Movies has A Lot Of Filmes With High ...
pos I first saw this film during and International Fil...
neg The good thing about this that's at least fresh: A...
pos My definition of a great movie is if you want to c...
pos This movie is sort of a Carrie meets Heavy Metal. ...
pos I don't understand the low 5.7 rating on this film...
neg i saw this movie at the toronto film festival with...
neg Even if 99,99% of people that has seen this movie ...
neg French director Jean Rollin isn't exactly known fo...
neg I can't believe it tha

In [4]:
# Check what's the different between count and tfidf
vectorizer = TfidfVectorizer()

toy_data = [
    'Rust has great documentation, ',
    'a friendly compiler with useful error messages, ',
    'and top-notch tooling — an integrated package manager and build tool, ',
    'smart multi-editor support with auto-completion and type inspections, ',
    'an auto-formatter, and more.'
]

vectorizer.fit(toy_data)
print('Unique features:')
print(vectorizer.get_feature_names())
print()
print('Feature vectors (sparse format):')
print(vectorizer.transform(toy_data))

Unique features:
['an', 'and', 'auto', 'build', 'compiler', 'completion', 'documentation', 'editor', 'error', 'formatter', 'friendly', 'great', 'has', 'inspections', 'integrated', 'manager', 'messages', 'more', 'multi', 'notch', 'package', 'rust', 'smart', 'support', 'tool', 'tooling', 'top', 'type', 'useful', 'with']

Feature vectors (sparse format):
  (0, 21)	0.5
  (0, 12)	0.5
  (0, 11)	0.5
  (0, 6)	0.5
  (1, 29)	0.3393931489111758
  (1, 28)	0.4206690600631704
  (1, 16)	0.4206690600631704
  (1, 10)	0.4206690600631704
  (1, 8)	0.4206690600631704
  (1, 4)	0.4206690600631704
  (2, 26)	0.3094185760868625
  (2, 25)	0.3094185760868625
  (2, 24)	0.3094185760868625
  (2, 20)	0.3094185760868625
  (2, 19)	0.3094185760868625
  (2, 15)	0.3094185760868625
  (2, 14)	0.3094185760868625
  (2, 3)	0.3094185760868625
  (2, 1)	0.41444245308083316
  (2, 0)	0.2496369589290994
  (3, 29)	0.27274066223567134
  (3, 27)	0.3380550208269348
  (3, 23)	0.3380550208269348
  (3, 22)	0.3380550208269348
  (3, 18)	0.33

Each features are assigned a idf score instead of frequency

In [10]:
vectorizer = TfidfVectorizer(max_features=100000, binary=False, ngram_range=(1,1))
feature_matrix = vectorizer.fit_transform(texts)
print('Feature matrix shape =',feature_matrix.shape)
# print('what did we get? ->', feature_matrix.__class__)

Feature matrix shape = (25000, 74849)


In [11]:
print(feature_matrix)

  (0, 51457)	0.04795948269660875
  (0, 67057)	0.05828874426187375
  (0, 27863)	0.019126160505820496
  (0, 73608)	0.03037829896863956
  (0, 70576)	0.039126420735074566
  (0, 61641)	0.028697483883389954
  (0, 48881)	0.0407498222468267
  (0, 24629)	0.020591401067233558
  (0, 38490)	0.04237831268558244
  (0, 12161)	0.04134737345928831
  (0, 1408)	0.03791460497341077
  (0, 38986)	0.036841561749160374
  (0, 35787)	0.012789468545365537
  (0, 65333)	0.04289867769951406
  (0, 50365)	0.029766017385358074
  (0, 74216)	0.023334519117508362
  (0, 26959)	0.028238203698170868
  (0, 62848)	0.02566885143203882
  (0, 41)	0.05042454051588808
  (0, 30923)	0.028521634844574706
  (0, 54079)	0.025762869793363345
  (0, 51454)	0.041142358035526086
  (0, 28663)	0.14330459040716995
  (0, 73827)	0.03220118565410733
  (0, 55262)	0.03421725222648945
  :	:
  (24999, 66474)	0.04436442968220257
  (24999, 72703)	0.02323894415788135
  (24999, 52709)	0.041748138871704844
  (24999, 68769)	0.03161098636688963
  (24999, 711

In [13]:
# With Validation
# train_text_set, test_texts, train_label_set, test_labels = train_test_split(
#     texts, labels,
#     test_size=0.2, random_state=4
# )
# train_texts, val_texts, train_labels, val_labels = train_test_split(
#     train_text_set, train_label_set,
#     test_size=0.2, random_state=4
# )

# Without Validation
train_texts, test_texts, train_labels, test_labels = train_test_split(
    texts, labels,
    test_size=0.2, random_state=4
)

vectorizer = TfidfVectorizer(
    max_features=100000,
    binary=True,
    ngram_range=(1,1)
)

feature_matrix_train = vectorizer.fit_transform(train_texts)
feature_matrix_test = vectorizer.transform(test_texts)

print(feature_matrix_train.shape)
print(feature_matrix_test.shape)

(20000, 68213)
(5000, 68213)


In [14]:
classifier = LinearSVC(
    C=0.0005,
    verbose=1
)

classifier.fit(
    feature_matrix_train,
    train_labels
)

[LibLinear]

LinearSVC(C=0.0005, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=1)

In [16]:
print('Test set score',classifier.score(feature_matrix_test, test_labels))
print('Train set score:',classifier.score(feature_matrix_train, train_labels))

Test set score 0.8368
Train set score: 0.84805


With the same parameters in the `vectorizer` and the `svm`, vectorizing the string by *idf* score probably gave the `svm` more information so it's less forgiving in classifying task. Next, I'll use `GridSearch` to alter `C`

In [22]:
pipe_tfidf = Pipeline(
    steps=[
        (
            'vectorize',
            CountVectorizer(
                max_features=100000,
                binary=True,
                ngram_range=(1,1)
            )
        ),
        (
            'classify',
            LinearSVC(
                verbose=1
            )
        )
    ]
)

In [21]:
grid_param_tfidf = {
    'vectorize__ngram_range': [
        (1, 1),
        (1, 2),
        (2, 2),
        (1, 3),
        (2, 3)
    ],
    'classify__C': np.logspace(
        -4, 4, 5
    )
}

In [23]:
grid_tfidf = GridSearchCV(
    pipe_tfidf,
    grid_param_tfidf,
    cv=5,
    n_jobs=4,
    verbose=1
)

In [24]:
grid_tfidf.fit(train_texts, train_labels)

Fitting 5 folds for each of 25 candidates, totalling 125 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 125 out of 125 | elapsed: 17.3min finished


[LibLinear]

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('vectorize',
                                        CountVectorizer(analyzer='word',
                                                        binary=True,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=100000,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                      

In [25]:
print('The best combination is as follow:')
for param, value in grid_tfidf.best_params_.items():
    if 'vectorize' in param:
        print('tfidf {}: {}'.format(param[13:], value))
    else:
        print('SVC {}: {}'.format(param[12:], value))

The best combination is as follow:
SVC : 0.01
tfidf ram_range: (1, 2)


In [27]:
grid_tfidf.best_estimator_.score(test_texts, test_labels)

0.8932