In [33]:
import numpy as np
import pandas as pd

In [34]:
temp_df = pd.read_csv('IMDB Dataset.csv')

In [35]:
df = temp_df.iloc[:10000].copy()

In [36]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [37]:
df['review'][1]

'A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only "has got all the polari" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams\' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master\'s of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional \'dream\' techniques remains solid then disappears. It plays on our knowledge and our senses, particularly with the scenes concerning Orton and Halliwell and the sets (particularly of their flat with Halliwell\'s murals decorating every surface) are terribly well d

In [38]:
df['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,5028
negative,4972


In [39]:
df.isnull().sum()

Unnamed: 0,0
review,0
sentiment,0


In [40]:
df.duplicated().sum()

np.int64(17)

In [41]:
df.drop_duplicates(inplace=True)

In [42]:
df.duplicated().sum()

np.int64(0)

In [43]:
# Basic Preprocessing
# Remove tags
# lowercase
# remove stopwords

In [32]:
import re
def remove_tags(raw_text):
    cleaned_text = re.sub(re.compile('<.*?>'), '', raw_text)
    return cleaned_text

In [44]:
df['review'] = df['review'].apply(remove_tags)

In [14]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. The filming tec...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive
9996,Give me a break. How can anyone say that this ...,negative
9997,This movie is a bad movie. But after watching ...,negative
9998,This is a movie that was probably made to ente...,negative


In [45]:
df['review'] = df['review'].apply(lambda x:x.lower())

In [46]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [47]:
from nltk.corpus import stopwords

sw_list = stopwords.words('english')
df['review'] = df['review'].apply(lambda x: [item for item in x.split() if item not in sw_list])

In [48]:
df

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production., filming, tech...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, there's, family, little, boy, (jak...",negative
4,"[petter, mattei's, ""love, time, money"", visual...",positive
...,...,...
9995,"[fun,, entertaining, movie, wwii, german, spy,...",positive
9996,"[give, break., anyone, say, ""good, hockey, mov...",negative
9997,"[movie, bad, movie., watching, endless, series...",negative
9998,"[movie, probably, made, entertain, middle, sch...",negative


In [50]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [51]:
X

Unnamed: 0,review
0,"[one, reviewers, mentioned, watching, 1, oz, e..."
1,"[wonderful, little, production., filming, tech..."
2,"[thought, wonderful, way, spend, time, hot, su..."
3,"[basically, there's, family, little, boy, (jak..."
4,"[petter, mattei's, ""love, time, money"", visual..."
...,...
9995,"[fun,, entertaining, movie, wwii, german, spy,..."
9996,"[give, break., anyone, say, ""good, hockey, mov..."
9997,"[movie, bad, movie., watching, endless, series..."
9998,"[movie, probably, made, entertain, middle, sch..."


In [52]:
# ohe
y

Unnamed: 0,sentiment
0,positive
1,positive
2,positive
3,negative
4,positive
...,...
9995,positive
9996,negative
9997,negative
9998,negative


In [60]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [55]:
# categorical to numerical transfer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()

X_train['review'] = X_train['review'].apply(lambda x: " ".join(x))
X_test['review'] = X_test['review'].apply(lambda x: " ".join(x))

X_train_bow = cv.fit_transform(X_train['review']).toarray()
X_test_bow = cv.transform(X_test['review']).toarray()

In [61]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

In [57]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [62]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Initialize and train the Logistic Regression model
model = LogisticRegression()
model.fit(X_train_bow, y_train_encoded)

# Make predictions on the test set
y_pred_encoded = model.predict(X_test_bow)

# Evaluate the model
accuracy = accuracy_score(y_test_encoded, y_pred_encoded)
report = classification_report(y_test_encoded, y_pred_encoded)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.871807711567351
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.86       952
           1       0.87      0.89      0.88      1045

    accuracy                           0.87      1997
   macro avg       0.87      0.87      0.87      1997
weighted avg       0.87      0.87      0.87      1997



In [69]:
# Function to preprocess a new review
def preprocess_new_review(review):
    # Remove tags
    review = remove_tags(review)
    # Lowercase
    review = review.lower()
    # Remove stopwords
    review = " ".join([item for item in review.split() if item not in sw_list])
    return review

# Sample new review
new_review = "This movie was absolutely fantastic! I loved every minute of it."

# Preprocess the new review
preprocessed_review = preprocess_new_review(new_review)

# Vectorize the preprocessed review using the fitted CountVectorizer
vectorized_review = cv.transform([preprocessed_review])

# Predict the sentiment
predicted_sentiment = model.predict(vectorized_review)

# Decode the prediction using the fitted encoder
predicted_label = encoder.inverse_transform(predicted_sentiment)

print(f"New Review: {new_review}")
print(f"Predicted Sentiment: {'Positive' if predicted_label[0] == 1 else 'Negative'}")

New Review: This movie was absolutely fantastic! I loved every minute of it.
Predicted Sentiment: Positive
