In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [3]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/grigorijarhipov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/grigorijarhipov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
df = pd.read_csv('IMDB Dataset.csv')

In [5]:
df_imdb = df.iloc[:len(df) // 5]

In [6]:
df_imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [7]:
df_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  object
dtypes: object(2)
memory usage: 156.4+ KB


In [8]:
df_imdb.isnull().sum()

review       0
sentiment    0
dtype: int64

In [9]:
df_imdb.duplicated().sum()

17

In [10]:
df_imdb[df_imdb.duplicated()]

Unnamed: 0,review,sentiment
3537,Quite what the producers of this appalling ada...,negative
3769,My favourite police series of all time turns t...,positive
4391,"Beautiful film, pure Cassavetes style. Gena Ro...",positive
6352,If you liked the Grinch movie... go watch that...,negative
6479,I want very much to believe that the above quo...,negative
6672,Sigh. I'm baffled when I see a short like this...,negative
7221,"I have always been a huge fan of ""Homicide: Li...",positive
7222,There are plenty of comments already posted sa...,negative
7425,"The movie was excellent, save for some of the ...",positive
7555,This movie has made me upset! When I think of ...,negative


In [11]:
df_imdb = df_imdb.drop_duplicates()

In [12]:
df_imdb

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [None]:
df_imdb['words_count'] = df_imdb['review'].apply(lambda x: len(x.split()))

In [15]:
stop_words = set(stopwords.words('english'))

In [16]:
def simple_preprocessing(text):

  processed_text = []
  for review in df_imdb['review']:
    # Lowercase the text
    review = review.lower()

    # Remove HTML tags
    review = re.sub(r'<.*?>', '', review)

    # Remove urls
    review = re.sub(r'http\S+', '', review)

    # Remove hashtags and @ symbols
    review = re.sub(r'#', '', review)
    review = re.sub(r'@', '', review)


    # Tokenize the text
    tokens = word_tokenize(review)

    # Remove punctuation
    tokens = [token for token in tokens if token.isalpha()]

    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    processed_text.append(' '.join(tokens))

  return processed_text

In [17]:
preprocessed_df = df_imdb

In [None]:
preprocessed_df['review'] = simple_preprocessing(df_imdb)
preprocessed_df['words_count'] = preprocessed_df['review'].apply(lambda x: len(x.split()))

In [19]:
preprocessed_df

Unnamed: 0,review,sentiment,words_count
0,one reviewers mentioned watching oz episode ho...,positive,162
1,wonderful little production filming technique ...,positive,80
2,thought wonderful way spend time hot summer we...,positive,80
3,basically family little boy jake thinks zombie...,negative,60
4,petter mattei love time money visually stunnin...,positive,119
...,...,...,...
9995,fun entertaining movie wwii german spy julie a...,positive,86
9996,give break anyone say good hockey movie know m...,negative,102
9997,movie bad movie watching endless series bad ho...,negative,105
9998,movie probably made entertain middle school ea...,negative,63


In [20]:
def stemming(df):
  stemmer = PorterStemmer()
  df['review'] = df['review'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))
  return df

In [None]:
stemmed_df = stemming(preprocessed_df)

In [22]:
stemmed_df

Unnamed: 0,review,sentiment,words_count
0,one review mention watch oz episod hook right ...,positive,162
1,wonder littl product film techniqu fashion giv...,positive,80
2,thought wonder way spend time hot summer weeke...,positive,80
3,basic famili littl boy jake think zombi closet...,negative,60
4,petter mattei love time money visual stun film...,positive,119
...,...,...,...
9995,fun entertain movi wwii german spi juli andrew...,positive,86
9996,give break anyon say good hockey movi know mov...,negative,102
9997,movi bad movi watch endless seri bad horror mo...,negative,105
9998,movi probabl made entertain middl school earli...,negative,63


In [None]:
stemmed_df['sentiment'] = stemmed_df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [24]:
stemmed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9983 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   review       9983 non-null   object
 1   sentiment    9983 non-null   int64 
 2   words_count  9983 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 312.0+ KB


In [25]:
X = stemmed_df['review']
y = stemmed_df['sentiment']

In [26]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)
X_tfidf.shape

(9983, 34436)

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42)

In [28]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(6988, 34436)
(6988,)
(2995, 34436)
(2995,)


In [46]:
# Instantiate the model
logreg = LogisticRegression()

# Train the model
logreg.fit(X_train, y_train)

# Predict on the test data.
y_pred = logreg.predict(X_test)

# Create an accuracy variable to check the model
accuracy = logreg.score(X_test, y_test)

# Print the accuracy
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.87


# What is the accuracy score? Is that a good score? How you can check further?

The accuracy score is 0.87. This is a good score, but it is not perfect. To check further, you can look at the classification report and the confusion matrix.

The classification report shows the precision, recall, and F1 score for each class. The precision is the proportion of predicted positives that were actually positive. The recall is the proportion of actual positives that were correctly predicted. The F1 score is the harmonic mean of the precision and recall.

The confusion matrix shows the number of true positives, true negatives, false positives, and false negatives.



In [47]:
# Evaluate the model.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))

Accuracy: 0.8737896494156928
              precision    recall  f1-score   support

           0       0.89      0.85      0.87      1488
           1       0.86      0.90      0.88      1507

    accuracy                           0.87      2995
   macro avg       0.87      0.87      0.87      2995
weighted avg       0.87      0.87      0.87      2995

[[1261  227]
 [ 151 1356]]


In [48]:
reviews = ["I loved this movie!", "This movie was a bad comedy movie!"]

# Convert the reviews to TF-IDF vectors
X_tfidf = vectorizer.transform(reviews)

# Predict the sentiment of the reviews
y_pred = logreg.predict(X_tfidf)

# Print the predicted sentiment
for review, sentiment in zip(reviews, y_pred):
  if sentiment == 1:
    print(f"{review}: positive")
  else:
    print(f"{review}: negative")


I loved this movie!: positive
This movie was a bad comedy movie!: negative


In [49]:
def predict_sent(review):

  # Convert the review to TF-IDF vectors
  X_tfidf = vectorizer.transform([review])

  # Predict the sentiment of the review
  prediction = logreg.predict(X_tfidf)[0]

  # Print the predicted sentiment
  if prediction == 1:
    return "The review is positive"
  else:
    return "The review is negative"




In [50]:
pos_review = 'I have only ever seen this film once, I only ever want to see this film once and I will only ever need to see this film once. It is etched on my mind. I, like many others, left in silence. I could not imagine inventing a critical analysis of this film, picking small points of detail or of style, or even scoring points off the Director. It stands alone as a monumental piece of cinema, a magnificent accomplishment.'
predict_sent(pos_review)

'The review is positive'

In [51]:
neg_review = "This is not a movie, it's just a shame! The worst movie I have ever seen. This is simply a fraud. This crap just does not make any sense. There have been some bad films made but this is a travesty. It is hard to believe that money was spent to make this. Do not watch or buy this trash."

predict_sent(neg_review)

'The review is negative'