In [1]:
import nltk

In [2]:
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
# Example using IMDB dataset in CSV format
df = pd.read_csv('datasets/imdb.csv')

In [4]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


# Data Pre-processing

In [5]:
df['review'] = df['review'].str.replace(r'<.*?>', '')  # Remove HTML tags
df['review'] = df['review'].str.replace(r'<br />', '')  # Remove HTML tags
df['review'] = df['review'].str.replace(r'[^a-zA-Z\s]', '')  # Remove special characters


In [6]:
df['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [7]:
# converting to lower case

In [8]:
df['review'] = df['review'].str.lower()


In [9]:
# why
# for normalization purpose

In [10]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nirva\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
from nltk.tokenize import word_tokenize
df['review_tokens'] = df['review'].apply(word_tokenize)


In [14]:
df

Unnamed: 0,review,sentiment,review_tokens
0,one of the other reviewers has mentioned that ...,positive,"[one, of, the, other, reviewers, has, mentione..."
1,a wonderful little production. the filming tec...,positive,"[a, wonderful, little, production, ., the, fil..."
2,i thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,..."
3,basically there's a family where a little boy ...,negative,"[basically, there, 's, a, family, where, a, li..."
4,"petter mattei's ""love in the time of money"" is...",positive,"[petter, mattei, 's, ``, love, in, the, time, ..."
...,...,...,...
49995,i thought this movie did a down right good job...,positive,"[i, thought, this, movie, did, a, down, right,..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...",negative,"[bad, plot, ,, bad, dialogue, ,, bad, acting, ..."
49997,i am a catholic taught in parochial elementary...,negative,"[i, am, a, catholic, taught, in, parochial, el..."
49998,i'm going to have to disagree with the previou...,negative,"[i, 'm, going, to, have, to, disagree, with, t..."


In [15]:
# removing the stop words

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nirva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
df['review_tokens'] = df['review_tokens'].apply(lambda x: [word for word in x if word not in stop_words])


In [18]:
df[['review', 'review_tokens']]

Unnamed: 0,review,review_tokens
0,one of the other reviewers has mentioned that ...,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,a wonderful little production. the filming tec...,"[wonderful, little, production, ., filming, te..."
2,i thought this was a wonderful way to spend ti...,"[thought, wonderful, way, spend, time, hot, su..."
3,basically there's a family where a little boy ...,"[basically, 's, family, little, boy, (, jake, ..."
4,"petter mattei's ""love in the time of money"" is...","[petter, mattei, 's, ``, love, time, money, ''..."
...,...,...
49995,i thought this movie did a down right good job...,"[thought, movie, right, good, job, ., n't, cre..."
49996,"bad plot, bad dialogue, bad acting, idiotic di...","[bad, plot, ,, bad, dialogue, ,, bad, acting, ..."
49997,i am a catholic taught in parochial elementary...,"[catholic, taught, parochial, elementary, scho..."
49998,i'm going to have to disagree with the previou...,"['m, going, disagree, previous, comment, side,..."


In [19]:
# applying lemmetization

In [20]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nirva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['review_tokens'] = df['review_tokens'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])


In [22]:
df['review_tokens']

0        [one, reviewer, mentioned, watching, 1, oz, ep...
1        [wonderful, little, production, ., filming, te...
2        [thought, wonderful, way, spend, time, hot, su...
3        [basically, 's, family, little, boy, (, jake, ...
4        [petter, mattei, 's, ``, love, time, money, ''...
                               ...                        
49995    [thought, movie, right, good, job, ., n't, cre...
49996    [bad, plot, ,, bad, dialogue, ,, bad, acting, ...
49997    [catholic, taught, parochial, elementary, scho...
49998    ['m, going, disagree, previous, comment, side,...
49999    [one, expects, star, trek, movie, high, art, ,...
Name: review_tokens, Length: 50000, dtype: object

In [45]:
!uv add scikit-learn

[2mResolved [1m46 packages[0m [2min 680ms[0m[0m
[2mPrepared [1m3 packages[0m [2min 5.93s[0m[0m
[2mInstalled [1m3 packages[0m [2min 541ms[0m[0m
 [32m+[39m [1mscikit-learn[0m[2m==1.6.1[0m
 [32m+[39m [1mscipy[0m[2m==1.15.1[0m
 [32m+[39m [1mthreadpoolctl[0m[2m==3.5.0[0m


In [24]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['review_tokens'].apply(lambda x: ' '.join(x)))


In [25]:
X_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 4880048 stored elements and shape (50000, 97587)>

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['review_tokens'].apply(lambda x: ' '.join(x)))

In [28]:
X

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 4880048 stored elements and shape (50000, 97587)>

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, df['sentiment'], test_size=0.2, random_state=42)

In [31]:
model = LogisticRegression()
model.fit(X_train, y_train)

In [32]:
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.8961


In [35]:
X_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 980378 stored elements and shape (10000, 97587)>

In [40]:
sample_vector = X_train[1].toarray()

In [41]:
sample_vector

array([[0., 0., 0., ..., 0., 0., 0.]], shape=(1, 97587))

In [43]:
y_train[1]

'positive'

In [44]:
model.predict(sample_vector)

array(['negative'], dtype=object)