In [6]:
import pandas as pd

# Import Dataset
df = pd.read_csv('C:/Users/akbet/imdb_dataset.csv')

# Display the first 5 rows
print(df.head())

# Check for missing values and data types
print(df.info())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB
None


In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Download NLTK data
nltk.download('stopwords')

# Initialize the stemmer and stopwords
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove HTML tags (if any)
    text = re.sub('<[^>]*>', '', text)
    # Remove special characters and digits
    text = re.sub('[^a-zA-Z]', ' ', text)
    # Convert to lowercase and split into words
    words = text.lower().split()
    # Remove stopwords and apply stemming
    processed_words = [ps.stem(word) for word in words if word not in stop_words]
    return " ".join(processed_words)

# Apply the preprocessing function to the 'review' column
df['cleaned_review'] = df['review'].apply(preprocess_text)

# Display a sample of the cleaned text
print(df['cleaned_review'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\akbet\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


0    one review mention watch oz episod hook right ...
1    wonder littl product film techniqu unassum old...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
Name: cleaned_review, dtype: object


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Seperate features (X) and target (y)
X = df['cleaned_review']
y = df['sentiment']

# Map the sentiment labels to 1s and 0s
y = y.map({'positive': 1, 'negative': 0})

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the TF-IDF Vectorizer
# We'll limit the number of features to the 5000 most frequent words
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training data and transform both training and testing data
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Initialize the Logistic Regression model
sentiment_model = LogisticRegression(random_state=42)

# Train the model on the TF-IDF training data
sentiment_model.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = sentiment_model.predict(X_test_tfidf)

# Evaluate the model's performance
print("--- Model Performance Report ---")
print(classification_report(y_test, y_pred))

# Print the overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.4f}")

--- Model Performance Report ---
              precision    recall  f1-score   support

           0       0.89      0.87      0.88      7411
           1       0.88      0.90      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

Overall Accuracy: 0.8865


In [15]:
# Get the feature names from the TF-IDF vectorizer
feature_names = tfidf_vectorizer.get_feature_names_out()

# Get the model coefficients
coefficients = sentiment_model.coef_.flatten()

# Create a DataFrame to link words and their coefficients
word_coefficients = pd.DataFrame({'word': feature_names, 'coefficient': coefficients})

# Sort by coefficient to find the most positive and negative words
top_positive_words = word_coefficients.sort_values(by='coefficient', ascending=False).head(20)
top_negative_words = word_coefficients.sort_values(by='coefficient', ascending=True).head(20)

print("--- Top 20 Words Predicting Positive Sentiment ---")
print(top_positive_words)
print("\n--- Top 20 Words Predicting Negative Sentiment ---")
print(top_negative_words)

--- Top 20 Words Predicting Positive Sentiment ---
           word  coefficient
1954      great     6.962201
1546      excel     6.833022
3250    perfect     5.279262
1471      enjoy     5.200069
2672       love     4.819951
540   brilliant     4.793761
140        amaz     4.581360
404        best     4.564228
1637    favorit     4.428023
2105     hilari     4.191177
1619    fantast     3.921566
371      beauti     3.802644
4348     superb     3.758992
1145    definit     3.737979
4529      today     3.692005
2101     highli     3.682685
1816        fun     3.672608
4871       well     3.591211
3251  perfectli     3.389761
4557      touch     3.368544

--- Top 20 Words Predicting Negative Sentiment ---
            word  coefficient
4952       worst    -9.862273
4842        wast    -8.742034
295           aw    -7.621664
315          bad    -7.136686
491         bore    -7.026217
4463     terribl    -5.883986
3354        poor    -5.781631
1245  disappoint    -5.604252
1599        fail  