In [25]:
import pandas as pd


In [26]:
df_imdb = pd.read_csv(r'/Users/admin/Desktop/python/di-bootcamp/week8/day3/exercisexp/IMDB Dataset.csv')
df_imdb.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [27]:
df_imdb.shape

(50000, 2)

In [28]:
df_imdb.dtypes

review       object
sentiment    object
dtype: object

In [29]:
df_imdb.isnull().sum()

review       0
sentiment    0
dtype: int64

In [30]:
df_imdb = df_imdb.dropna()
df_imdb.isnull().sum()

review       0
sentiment    0
dtype: int64

Print the first 5 reviews and their sentiments classification

In [31]:
df_imdb[['review', 'sentiment']].head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


Create a function to count the number of words in each review

In [32]:
def count_words(review):
    return len(str(review).split())

Apply this function to the review column and add a new column called "words count"

In [33]:
df_imdb['words count'] = df_imdb['review'].apply(count_words)


Visualize the result in the DataFrame

In [34]:
df_imdb[['review', 'words count']].head()

Unnamed: 0,review,words count
0,One of the other reviewers has mentioned that ...,307
1,A wonderful little production. <br /><br />The...,162
2,I thought this was a wonderful way to spend ti...,166
3,Basically there's a family where a little boy ...,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",230


In [35]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [36]:
import nltk
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [43]:
# Step 1: Create a function called simple_preprocessing
def simple_preprocessing(text):
    # Make the text lowercase
    text = text.lower()

    # Remove HTML br tags
    text = re.sub(r'<br\s*/?>', ' ', text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove hashtags and @ symbol
    text = re.sub(r'#|\@', '', text)

    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)

    # Tokenize the text using nltk
    words = nltk.word_tokenize(text)

    # Remove stopwords using nltk
    custom_stopwords = set(stopwords)
    words = [word for word in words if word.lower() not in custom_stopwords]

    # Return a string as the preprocessed text
    return ' '.join(words)

In [44]:
# Step 2: Apply the simple_preprocessing() function in the review column
df_imdb['review'] = df_imdb['review'].apply(simple_preprocessing)

In [45]:
# Step 3: Print the first 5 reviews after preprocessing
print("\nFirst 5 reviews after preprocessing:")
print(df_imdb['review'].head())


First 5 reviews after preprocessing:
0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production filming technique ...
2    thought wonderful way spend time hot summer we...
3    basically theres family little boy jake thinks...
4    petter matteis love time money visually stunni...
Name: review, dtype: object


In [46]:
# Step 4: Check and remove duplicated reviews
print("\nNumber of duplicated reviews before removal:", df_imdb.duplicated('review').sum())
df_imdb = df_imdb.drop_duplicates('review')


Number of duplicated reviews before removal: 425


In [48]:
# Check that duplicated reviews were deleted
df_imdb.duplicated('review').sum()

0

In [49]:
# Step 5: Create a function stemming() and use PorterStemmer to stem the reviews column
def stemming(text):
    stemmer = PorterStemmer()
    words = nltk.word_tokenize(text)
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)


In [50]:
# Apply the stemming function to the reviews column
df_imdb['review'] = df_imdb['review'].apply(stemming)

Preparing Data To Train The Model


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Step 1: Split the data into X and Y
X = df_imdb['review']
Y = df_imdb['sentiment']

# Binarize the sentiment column
Y = Y.map({'positive': 1, 'negative': 0})

# Step 2: Vectorize the data using TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_vectorized = tfidf_vectorizer.fit_transform(X)

# Step 3: Split into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(X_vectorized, Y, test_size=0.3, random_state=42)

# Step 4: Print the shapes
print("Shapes of x_train, y_train, x_test, y_test:")
print("x_train shape:", x_train.shape)
print("y_train shape:", y_train.shape)
print("x_test shape:", x_test.shape)
print("y_test shape:", y_test.shape)

Shapes of x_train, y_train, x_test, y_test:
x_train shape: (34702, 128350)
y_train shape: (34702,)
x_test shape: (14873, 128350)
y_test shape: (14873,)


Machine Learning Model: Instantiating, Training, Predicting And Evalueting


In [52]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Step 2: Instantiate the Logistic Regression model
logistic_regression_model = LogisticRegression(random_state=42)

# Step 3: Train the model
logistic_regression_model.fit(x_train, y_train)

# Step 4: Predict on the test set
y_pred = logistic_regression_model.predict(x_test)

# Step 5: Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Step 6: Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 7: Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.8938344651381699

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.88      0.89      7407
           1       0.88      0.91      0.90      7466

    accuracy                           0.89     14873
   macro avg       0.89      0.89      0.89     14873
weighted avg       0.89      0.89      0.89     14873


Confusion Matrix:
[[6504  903]
 [ 676 6790]]


The Logistic Regression model achieved an accuracy of approximately 89.4%,  is reasonably good.


In the classification report:

Precision represents the accuracy of positive predictions. For class 0 (negative sentiment), the precision is 91%, and for class 1 (positive sentiment), it is 88%.

Recall measures the ability of the model to capture all relevant instances. For class 0, the recall is 88%, and for class 1, it is 91%.

F1-score is the harmonic mean of precision and recall. For both classes, the F1-score is around 0.89.

Support indicates the number of actual occurrences of each class in the test set. Class 0 has 7,407 samples, and class 1 has 7,466 samples.

Accuracy is the overall correct predictions divided by the total number of predictions, resulting in an accuracy of 89%.

In summary, the classification report provides a detailed assessment of the model's performance, showing a balanced performance for both positive and negative sentiments, with an overall accuracy of 89%.


In the confusion matrix:

True Positive (TP): 6,790 instances were correctly predicted as positive.
True Negative (TN): 6,504 instances were correctly predicted as negative.
False Positive (FP): 903 instances were incorrectly predicted as positive.
False Negative (FN): 676 instances were incorrectly predicted as negative.
In summary, the model performed well, with a balanced number of correct predictions for both positive and negative sentiments. The confusion matrix provides a detailed breakdown of the model's classification performance.

Predicting A New Review


In [53]:
# New reviews
new_reviews = ["I loved this movie!", "This movie was a bad comedy movie!"]

# Preprocess the new reviews
preprocessed_reviews = [simple_preprocessing(review) for review in new_reviews]

# Vectorize the preprocessed reviews using the TF-IDF vectorizer
new_reviews_vectorized = tfidf_vectorizer.transform(preprocessed_reviews)

# Predict the sentiment using the trained Logistic Regression model
predictions = logistic_regression_model.predict(new_reviews_vectorized)

# Map the predictions to sentiment labels
sentiments = ['negative' if pred == 0 else 'positive' for pred in predictions]

# Print the results
for review, sentiment in zip(new_reviews, sentiments):
    print(f"Review: '{review}'\nPredicted Sentiment: {sentiment}\n")


Review: 'I loved this movie!'
Predicted Sentiment: positive

Review: 'This movie was a bad comedy movie!'
Predicted Sentiment: negative



Bonus

In [54]:
# New phrases
new_phrases = [
    "This is the best movie I've ever seen!",
    "I couldn't stand watching this film. It was awful.",
    "The plot was confusing, but the acting was superb.",
    "Not a fan of this movie. Disappointed with the storyline.",
    "Absolutely loved the characters and the storyline!",
]

# Preprocess the new phrases
preprocessed_phrases = [simple_preprocessing(phrase) for phrase in new_phrases]

# Vectorize the preprocessed phrases using the TF-IDF vectorizer
new_phrases_vectorized = tfidf_vectorizer.transform(preprocessed_phrases)

# Predict the sentiment using the trained Logistic Regression model
predictions_phrases = logistic_regression_model.predict(new_phrases_vectorized)

# Map the predictions to sentiment labels
sentiments_phrases = ['negative' if pred == 0 else 'positive' for pred in predictions_phrases]

# Print the results
for phrase, sentiment in zip(new_phrases, sentiments_phrases):
    print(f"Phrase: '{phrase}'\nPredicted Sentiment: {sentiment}\n")


Phrase: 'This is the best movie I've ever seen!'
Predicted Sentiment: positive

Phrase: 'I couldn't stand watching this film. It was awful.'
Predicted Sentiment: negative

Phrase: 'The plot was confusing, but the acting was superb.'
Predicted Sentiment: positive

Phrase: 'Not a fan of this movie. Disappointed with the storyline.'
Predicted Sentiment: positive

Phrase: 'Absolutely loved the characters and the storyline!'
Predicted Sentiment: positive

