In [1]:
#mount Google Drive
from google.colab import drive
drive.mount('/content/drive')
#change directory
import os
os.chdir("drive/My Drive/Machine Learning/Datasets/SentimentClassification")
#print out the current directory
!pwd

Mounted at /content/drive
/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification


In [None]:
!pwd

/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification


In [37]:
!pip install pyprind

Collecting pyprind
  Downloading PyPrind-2.11.3-py2.py3-none-any.whl (8.4 kB)
Installing collected packages: pyprind
Successfully installed pyprind-2.11.3


In [None]:
!ls "/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification"

aclImdb  classifier.pkl  Martin_MovieReviewClassification.ipynb  stopwords.pkl	vectorizer.py


In [None]:
print(os.listdir('/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification'))

['vectorizer.py', 'classifier.pkl', 'stopwords.pkl', 'Martin_MovieReviewClassification.ipynb', 'aclImdb']


In [None]:
import pyprind
import pandas as pd
import os

# change the 'basepath' to the directory of the unzipped movie dataset
basepath = '/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification/aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
data_list = []  # Create an empty list to store data

for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            file_path = os.path.join(path, file)
            with open(file_path, 'r', encoding='utf-8') as infile:
                txt = infile.read()
            # Append data as dictionary to data_list
            data_list.append({'review': txt, 'sentiment': labels[l]})
            pbar.update()

# Concatenate data_list into a DataFrame
df = pd.concat([pd.DataFrame(data_list)])

# Rename columns
df.columns = ['review', 'sentiment']

# Print the first few rows of the DataFrame
print(df.head())

0% [##############################] 100% | ETA: 00:00:00

                                              review  sentiment
0  I went and saw this movie last night after bei...          1
1  Actor turned director Bill Paxton follows up h...          1
2  As a recreational golfer with some knowledge o...          1
3  I saw this film in a sneak preview, and it is ...          1
4  Bill Paxton has taken the true story of the 19...          1



Total time elapsed: 00:36:23


In [5]:
import numpy as np
import pandas as pd

df = pd.read_csv('/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification/movie_data.csv', encoding='utf-8')

print(df.head())

# Now you can perform operations on df
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

# Reading back the shuffled data to confirm
df = pd.read_csv('movie_data.csv', encoding='utf-8')
print(df.head(3))

                                              review  sentiment
0  election is a chinese mob movie or triads in t...          1
1  i was just watching a forensic files marathon ...          0
2  police story is a stunning series of set piece...          1
3  dear readers the final battle between the rebe...          1
4  i have seen the perfect son about three times ...          1
                                              review  sentiment
0  at a saturday matinee in my home town i went w...          0
1  i love this movie it is the first film master ...          1
2  in the voice over which begins the film hughie...          1


In [6]:
df.shape

(50000, 2)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer()
docs = np.array(['The sun is shining',
                 'The weather is sweet',
                 'The sun is shining, the weather is sweet,'
                 'and one and one is two'])
bag = count.fit_transform(docs)

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer(use_idf=True,
                         norm='l2',
                         smooth_idf=True)
np.set_printoptions(precision=2)
print(tfidf.fit_transform(count.fit_transform(docs)) .toarray())

[[0.   0.43 0.   0.56 0.56 0.   0.43 0.   0.  ]
 [0.   0.43 0.   0.   0.   0.56 0.43 0.   0.56]
 [0.5  0.45 0.5  0.19 0.19 0.19 0.3  0.25 0.19]]


In [10]:
df.loc[0, 'review'][-50:]

' hippies only or if you re stoned i give this a 1 '

In [11]:
import re
def preprocessor(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)',
                           text)
    text = (re.sub(r'[\W]+', ' ', text.lower()) +
            ' '.join(emoticons).replace('-', ''))
    return text

In [12]:
preprocessor(df.loc[0, 'review'][-50:])

preprocessor("</a>This :) is :( a test :-)!")

'this is a test :) :( :)'

In [13]:
df['review'] = df['review'].apply(preprocessor)

In [14]:
def tokenizer(text):
    return text.split()
tokenizer('runners like running and thus they run')

['runners', 'like', 'running', 'and', 'thus', 'they', 'run']

In [15]:
from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]
tokenizer_porter('runners like running and thus they run')

['runner', 'like', 'run', 'and', 'thu', 'they', 'run']

In [16]:
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
stop = stopwords.words('english')
[w for w in tokenizer_porter('a runner likes' ' running and runs a lot')[-10:] if w not in stop]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['runner', 'like', 'run', 'run', 'lot']

In [26]:
df.head

In [18]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [22]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None)
# param_grid = [{'vect__ngram_range': [(1,1)],
#                 'vect__stop_words': [stop, None],
#                 'vect__tokenizer': [tokenizer, tokenizer_porter],
#                 'clf__penalty': ['l1', 'l2'],
#                 'clf__C': [1.0, 10.0, 100.0]},
#                 {'vect__ngram_range': [(1,1)],
#                 'vect__stop_words': [stop, None],
#                 'vect__tokenizer': [tokenizer, tokenizer_porter],
#                 'vect__use_idf':[False],
#                 'vect__norm':[None],
#                 'clf__penalty': ['l1', 'l2'],
#                 'clf__C': [1.0, 10.0, 100.0]}
#             ]
# lr_tfidf = Pipeline([('vect', tfidf), ('clf', LogisticRegression(random_state=0, solver='liblinear'))])
# gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=5, verbose=2, n_jobs=-1)
# gs_lr_tfidf.fit(X_train, y_train)

tfidf = TfidfVectorizer(strip_accents=None, lowercase=False)
param_grid = {
    'vect__ngram_range': [(1, 1)],
    'clf__C': [1.0, 10.0]
}

lr_tfidf = Pipeline([
    ('vect', tfidf),
    ('clf', LogisticRegression(random_state=0, solver='liblinear'))
])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid, scoring='accuracy', cv=3, verbose=2)
gs_lr_tfidf.fit(X_train, y_train) #Previous code took over an hour and did not even finish, so made smaller

Fitting 3 folds for each of 2 candidates, totalling 6 fits
[CV] END ...............clf__C=1.0, vect__ngram_range=(1, 1); total time=   7.4s
[CV] END ...............clf__C=1.0, vect__ngram_range=(1, 1); total time=   7.0s
[CV] END ...............clf__C=1.0, vect__ngram_range=(1, 1); total time=   5.8s
[CV] END ..............clf__C=10.0, vect__ngram_range=(1, 1); total time=   8.7s
[CV] END ..............clf__C=10.0, vect__ngram_range=(1, 1); total time=   8.2s
[CV] END ..............clf__C=10.0, vect__ngram_range=(1, 1); total time=   7.4s


In [23]:
print('Best parameter set: %s ' % gs_lr_tfidf.best_params_)

Best parameter set: {'clf__C': 10.0, 'vect__ngram_range': (1, 1)} 


In [24]:
print('CV Accuracy: %.3f' % gs_lr_tfidf.best_score_)

CV Accuracy: 0.890


In [25]:
clf = gs_lr_tfidf.best_estimator_
print('Test Accuracy: %.3f' % clf.score(X_test, y_test))

Test Accuracy: 0.898


In [27]:
import numpy as np
import re
from nltk.corpus import stopwords

stop = stopwords.words('english')
def tokenizer(text):
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub(r'[\W]+', ' ', text.lower()) \
                    + ' '.join(emoticons).replace('-', '')
    tokenized = [w for w in text.split() if w not in stop]
    return tokenized

In [30]:
def stream_docs(path):
    with open(path, 'r', encoding='utf-8') as csv:
        next(csv)  # skip header
        for line in csv:
            line = line.strip().split(',')
            text, label = ','.join(line[:-1]), int(line[-1])
            yield text, label

In [34]:
doc_stream = stream_docs(path='/content/drive/My Drive/Machine Learning/Datasets/SentimentClassification/movie_data.csv')
print(next(doc_stream))  # This should print the first (text, label) tuple correctly

('at a saturday matinee in my home town i went with an older friend he was about 12 and my mom let me go because she thought the film would be ok it s rated g i was assaulted by loud music strange images no plot and a stubborn refusal to make any sense we left halfway through because we were bored frustrated and our ears hurt i saw it 22 years later in a revival theatre my opinion had changed it s even worse basically everything i hated about it was still there and the film was very 60s and has dated badly i got all the little in jokes too bad they weren t funny the constant shifts in tone got quickly annoying and there s absolutely nothing to get a firm grip on some people will love this i found it frustrating by the end of the film i felt like throwing something heavy at the screen also all the monkees songs in this movie suck and i do like them for ex hippies only or if you re stoned i give this a 1 ', 0)


In [35]:
def get_minibatch(doc_stream, size):
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        return None, None
    return docs, y

In [36]:
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier

vect = HashingVectorizer(decode_error='ignore', n_features=2**21, preprocessor=None, tokenizer=tokenizer)
clf = SGDClassifier(loss='log_loss', random_state=1)
doc_stream = stream_docs(path='movie_data.csv')

In [38]:
import pyprind
pbar = pyprind.ProgBar(45)
classes = np.array([0, 1])


for _ in range(45):
    X_train, y_train = get_minibatch(doc_stream, size=1000)
    if not X_train:
        break
    X_train = vect.transform(X_train)
    clf.partial_fit(X_train, y_train, classes=classes)
    pbar.update()

0% [##############################] 100% | ETA: 00:00:00
Total time elapsed: 00:00:52


In [39]:
X_test, y_test = get_minibatch(doc_stream, size=5000)
X_test = vect.transform(X_test)
print('Accuracy: %.3f' % clf.score(X_test, y_test))

Accuracy: 0.873


In [40]:
clf = clf.partial_fit(X_test, y_test)

In [41]:
import pandas as pd
df = pd.read_csv('movie_data.csv', encoding='utf-8')

In [42]:
from sklearn.feature_extraction.text import CountVectorizer
count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

In [44]:
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, random_state=123, learning_method='batch')
X_topics = lda.fit_transform(X)

In [45]:
lda.components_.shape

(10, 5000)

In [46]:
n_top_words = 5
feature_names = count.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
        for i in topic.argsort()\
        [:-n_top_words - 1:-1]]))

Topic 1:
horror original comedy black house
Topic 2:
worst minutes guy script money
Topic 3:
book dvd read version original
Topic 4:
family performance beautiful father mother
Topic 5:
series episode tv comedy kids
Topic 6:
murder wife police john plays
Topic 7:
documentary camera audience war human
Topic 8:
music song songs musical role
Topic 9:
horror effects guy budget special
Topic 10:
action game war fight animation


In [47]:
horror = X_topics[:, 5].argsort()[::-1]
for iter_idx, movie_idx in enumerate(horror[:3]):
    print('\nHorror movie #%d:' % (iter_idx + 1))
    print(df['review'][movie_idx][:300], '...')


Horror movie #1:
 spoilers when undercover brooklyn north det eddie santos nestor serrano was to meet his drug supplier tito zapatti larry romano in the williamsburg section of brooklyn in a buy and bust operation with tito being the one who gets busted that things went haywire with both det santos and tito ending u ...

Horror movie #2:
 spoilers extremely brutal police drama set in san francisco involving a sting operation that goes terribly wrong a cop det falon sam elliott mistakenly and savagely beats to death an undercover policeman winch mike watson thinking that he murdered his partner det sam levinson mike burstyn a partner ...

Horror movie #3:
this first rate western tale of the gold rush brings great excitement romance and james stewart to the screen the far country is the only one out of all five stewart mann westerns that is often overlooked stewart yet again puts a new look on the ever present personalities he had in the five stewart  ...


In [None]:
n_docs = 10
for topic_idx in range(X_topics.shape[1]):
    print("\nTopic #{}:".format(topic_idx + 1))
    top_doc_indices = X_topics[:, topic_idx].argsort()[::-1][:n_docs]
    for doc_index in top_doc_indices:
        print(df['review'][doc_index][:300])  # Print the first 300 characters of each top document

In [50]:
import pickle
import os
dest = os.path.join('movieclassifier', 'pkl_objects')
if not os.path.exists(dest):
    os.makedirs(dest)
pickle.dump(stop, open(os.path.join(dest, 'stopwords.pkl'), 'wb'), protocol=4)
pickle.dump(clf, open(os.path.join(dest, 'classifier.pkl'), 'wb'), protocol=4)

In [52]:
import sqlite3
import os
conn = sqlite3.connect('reviews.sqlite')

c = conn.cursor()
c.execute('DROP TABLE IF EXISTS review_db')
c.execute('CREATE TABLE review_db' ' (review TEXT, sentiment INTEGER, date TEXT)')
example1 = 'I love this movie'
c.execute("INSERT INTO review_db" " (review, sentiment, date) VALUES" " (?, ?, DATETIME('now'))", (example1, 1))
example2 = 'I disliked this movie'
c.execute("INSERT INTO review_db" " (review, sentiment, date) VALUES" " (?, ?, DATETIME('now'))", (example2, 0))
conn.commit()
conn.close()

In [53]:
conn = sqlite3.connect('reviews.sqlite')
c = conn.cursor()
c.execute("SELECT * FROM review_db WHERE date" " BETWEEN '2017-01-01 00:00:00' AND DATETIME('now')")
results = c.fetchall()
conn.close()
print(results)

[('I love this movie', 1, '2024-05-06 01:32:47'), ('I disliked this movie', 0, '2024-05-06 01:32:47')]
