In [1]:
# !pip install wordcloud 
# git clone https://github.com/amueller/word_cloud.git  

In [2]:
import pandas as pd
import numpy as np

In [3]:
import nltk
from statistics import mean
from random import shuffle
import string

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
from utils import process_text

In [6]:
nltk.download([
    "names",
    "stopwords",
    "movie_reviews",
    "averaged_perceptron_tagger",
    "vader_lexicon",
    "punkt",
])

[nltk_data] Downloading package names to /home/cameron/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cameron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/cameron/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/cameron/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/cameron/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /home/cameron/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Movie Review Data

In [7]:
# positive movie reviews
positive_reviews = [
    (nltk.corpus.movie_reviews.raw(review), 1)
    for review in nltk.corpus.movie_reviews.fileids(categories=["pos"])
]

In [8]:
# negative movie reviews
negative_reviews = [
    (nltk.corpus.movie_reviews.raw(review), 0)
    for review in nltk.corpus.movie_reviews.fileids(categories=["neg"])
]

In [9]:
positive_reviews.extend(negative_reviews)
shuffle(positive_reviews)

In [10]:
reviews = [r[0] for r in positive_reviews]
ratings = [r[1] for r in positive_reviews]

In [11]:
df_reviews = pd.DataFrame({'raw_text': reviews, 'label': ratings})

In [12]:
df_reviews['clean_text'] = df_reviews['raw_text'].apply(lambda x: process_text(x))

In [13]:
df_r = df_reviews[['clean_text', 'label']]
df_r.head()

Unnamed: 0,clean_text,label
0,weighed tired plot lines spielberg reliance fo...,0
1,mulholland drive well cannes film festival see...,0
2,american werewolf london john landis groundbre...,0
3,battlefield earth worst film 2000 guarantee no...,0
4,fact good thriller action movie need violence ...,1


In [14]:
df_r.shape

(2000, 2)

### Food Reviews

In [15]:
# set of reviews from https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews?select=Reviews.csv
df_food = pd.read_csv('Reviews.csv')
# df_food.head()

In [16]:
df_food = df_food[df_food['Score'] != 3]

In [17]:
# recalibrate Score
df_food['label'] = df_food['Score'].apply(lambda x: 1 if x > 3 else 0)

In [18]:
df_food['clean_text'] = df_food['Text'].apply(lambda x: process_text(x))

In [19]:
df_f = df_food[['clean_text', 'label']]
df_f.head()

Unnamed: 0,clean_text,label
0,I bought several Vitality canned dog food prod...,1
1,Product arrived labeled Jumbo Salted Peanuts ....,0
2,This confection around centuries It light pill...,1
3,If looking secret ingredient Robitussin I beli...,0
4,Great taffy great price There wide assortment ...,1


In [20]:
df_f.shape

(525814, 2)

In [21]:
df = pd.concat([df_f, df_r])
df.sample(frac=1)
df.shape

(527814, 2)

## Prepare data

In [22]:
X = np.array(df['clean_text'])
y = np.array(df['label'])

In [23]:
split = int(len(X) * 0.8)

In [24]:
X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

## Prepare Classifer, Train and Test

In [25]:
vectorizer = TfidfVectorizer()
clf = LogisticRegression()

In [26]:
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [27]:
clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
clf.score(X_test, y_test)

0.9325426522550515

## Predict

In [29]:
# from https://www.imdb.com/title/tt0114558/reviews
# https://www.rottentomatoes.com/m/ballistic_ecks_vs_sever
Text = ["Probably one of the best big-budget sci-fi films to never reach a big audience. Written and produced by James Cameron and expertly directed by Kathryn Bigelow, this film is more noir than actual sci-fi - although the sci-fi elements are important. With a great cast and an amazing atmosphere throughout, this is one not to miss for fans of thrillers, film noir, sci-fi and especially the subgenre knwon as cyberbunk.",
       "An action film starring Antonio Banderas and Lucy Liu, Ballistic: Ecks Vs. Sever looks like a video-game promo, has a story that plays like the fifth episode of a struggling syndicated action show, and feels like a headache waiting to happen.",
        "If you must see Ecks vs. Sever, just go bang your head against the wall for an hour and a half. It would be the same effect.",
        "Worst idea ever. Could not leave fast enough"
       ]

In [30]:
text = vectorizer.transform(Text)

In [31]:
# Only the first was a positive review
clf.predict(text)

array([1, 1, 1, 0])