In [19]:
import requests
import os
def download_file(url):
    local_filename = url.split('/')[-1]
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                f.write(chunk)
    return local_filename

In [20]:
download_file("http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz")

KeyboardInterrupt: 

In [9]:
import tarfile
with tarfile.open("aclImdb_v1.tar.gz","r:gz") as tar:
    tar.extractall("Datasets/")

In [10]:
os.remove("aclImdb_v1.tar.gz")

In [18]:
import os
import pandas as pd 
df = pd.DataFrame()
labels = {"pos":1,"neg":0}
paths = ("train","test")
labs = ("pos","neg")
for s in paths:
    for l in labs:
        path = os.path.join("Datasets/aclImdb",s,l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path,file),"r",encoding="utf8") as infile:
                text = infile.read()
            df = df.append([[text,labels[l]]],ignore_index=True)
df.columns = ['review','sentiment']
df.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [19]:
import numpy as np 
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv("Datasets/movie_data.csv",index=False)

In [20]:
data = pd.read_csv("Datasets/movie_data.csv")
data.head()

Unnamed: 0,review,sentiment
0,"Often tagged as a comedy, The Man In The White...",1
1,After Chaplin made one of his best films: Doug...,0
2,I think the movie was one sided I watched it r...,0
3,I have fond memories of watching this visually...,1
4,This episode had potential. The basic premise ...,0


In [28]:
data.shape

(50000, 2)

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
import re
count = CountVectorizer()
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub('[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [30]:
preprocessor(data.loc[0,'review'][-50:])

' scenes in the beginning my vote 9 10'

In [22]:
data['review'] = data['review'].apply(preprocessor)

In [23]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()
def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [34]:
import nltk 
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\infra\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [24]:
from nltk.corpus import stopwords
stop = stopwords.words("english")
[w for w in tokenizer_porter("an example sentence for testing the stopwords") if w not in stop]

['exampl', 'sentenc', 'test', 'stopword']

In [37]:
X_train = data.loc[:25000,'review'].values
y_train = data.loc[:25000, 'sentiment'].values

X_test = data.loc[25000:,'review'].values
y_test = data.loc[25000:,'sentiment'].values

In [44]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                       lowercase=False,
                       preprocessor=None)
lr = LogisticRegression(random_state=0,solver="liblinear")
pipe_tfidf = Pipeline([("vect",tfidf),
                      ("clf",lr)])

param_grid = [{"vect__ngram_range":[(1,1)],
              "vect__stop_words":[stop,None],
              "vect__tokenizer":[tokenizer,tokenizer_porter],
              "clf__penalty":["l1","l2"], 
              "clf__C":[1.0,10.0,100.0]},
             {"vect__ngram_range":[(1,1)],
             "vect__stop_words":[stop,None],
             "vect__tokenizer":[tokenizer,tokenizer_porter],
             "vect__use_idf":[False],
             "vect__norm":[None],
             "clf__penalty":['l1','l2'],
             "clf__C":[1.0,10.0,100.0]}]
    
grid_tfidf = GridSearchCV(param_grid=param_grid,
                         estimator=pipe_tfidf,
                         scoring="accuracy",
                         verbose=2,
                         n_jobs=-1)

In [47]:
grid_tfidf.fit(X_train,y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [38]:
import numpy as np 
import re 
from nltk.corpus import stopwords

stop = stopwords.words("english")

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +\
        ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

def stream_docs(path):
    with open(path,"r",encoding="utf8") as csv:
        next(csv)
        for line in csv:
            text,label = line[:-3],int(line[-2])
            yield text,label

In [4]:
next(stream_docs(path="Datasets/movie_data.csv"))

('"This wonderful 3 part BBC production is one of the sweetest love stories that I have seen in a while. The actresses display a very high level of talent, especially Rachael Stirling as Nan Astley. She is funny, seductive and cute. The love making scenes and the close up kisses are very erotic regardless of one\'s sexual preference. <br /><br />The characters are well defined and very believable. I guess this is a by-product of a good adaptation from a well written novel.<br /><br />A truly remarkable well paced drama that picks up speed quickly after a couple of boring (but necessary) scenes in the beginning.<br /><br />My vote: 9/10"',
 1)

In [39]:
def get_minibatch(doc_stream,size):
    docs,y = [] , []
    try:
        for _ in range(size):
            text,label=next(doc_stream)
            y.append(text)
    except StopIteration:
        return None,None
    return docs,y

In [40]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error="ignore",
                        n_features=2,
                        preprocessor=None,
                        tokenizer=tokenizer)

In [41]:
clf = SGDClassifier(loss="log",random_state=1)
doc_stream = stream_docs(path="Datasets/movie_data.csv")

In [42]:
classes = np.array([0,1])
for _ in range(45):
    X_train,y_train = get_minibatch(doc_stream,size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train,y_train,classes=classes)

In [43]:
X_test,y_test = get_minibatch(doc_stream,size=5000)
X_test = vect.transform(X_test)
print("Accuracy: {}".format(clf.score(X_test,y_test)))

ValueError: Cannot vectorize empty sequence.

In [44]:
get_minibatch(doc_stream,size=5000)

([],
 ['"this movie I saw some 10 years ago (maybe more), I took it in a rental and never found it to buy even in French sites. The end is very surprising and intelligent. I would like very much to watch it again because I think it\'s as surpring as the Sixth Sense althogh a completely different kind of movie."',
  '"Considering how much money was budgeted for this film, you would expect more from the story as a whole. This could be quite possibly the most worthless movie I have ever watched. There was no real advancement of anything. Character development, minimal. Plot advancement, maybe. Enjoyment, none. I\'m not sure what points were even trying to be made. If you want to see a movie where terrorists are kinda good guys, American CIA bombs everything that doesn\'t agree with our opinions, all capitalists are corrupt, and you like to see anything resembling a storyboard advancement accompanied by a death, have at. For those of us who realize that it doesn\'t take killing off a good 