In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
r_df = pd.read_csv(r"E:\Projects\datasets\IMDB Dataset.csv\IMDB Dataset.csv")

In [9]:
df = r_df[:500]

In [10]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
495,"""American Nightmare"" is officially tied, in my...",negative
496,"First off, I have to say that I loved the book...",negative
497,This movie was extremely boring. I only laughe...,negative
498,I was disgusted by this movie. No it wasn't be...,negative


In [5]:
y=r_df['sentiment']
x=r_df['review']

In [6]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import contractions
import emoji


In [20]:
class TextPreprocessor:
    def __init__(self , remove_stopwords=True , use_lemmatization=True):
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.remove_stopwords = remove_stopwords
        self.use_lemmatization = use_lemmatization
        self.stop_words = set(stopwords.words('english'))
        #add custom stopwords that might be irrelevant for sentiment
        self.stop_words.update(['movie' , 'film' , 'show' , 'watch' , 'seen'])

    def get_wordnet_pos(self, treebank_tag):
        """
        Map POS tag to first character lemmatize() accepts.
        """
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def clean_text(self , text):
        """Initial text cleaning"""
        #convert to lower case
        text = text.lower()

        #replace contractions
        text = contractions.fix(text)

        #convert emojis to text
        text = emoji.demojize(text)

        #remove urls
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

        #remove special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]' , ' ' , text)

        #remove html tags
        text = re.sub(r'<.*?>','',text)

        #handle extra whitespace
        text = ' '.join(text.split())

        return text
    
    def handle_negation(self , tokens):
        """Add NOT_prefix to words following negative words until punctuation"""
        negation_words = {'not' , 'no' , 'never' , 'none' , "n't" , 'neither' , 'nor'}
        negated=[]
        negate = False

        for token in tokens:
            if token in negation_words:
                negate=True
            elif token in string.punctuation:
                negate = False
            elif negate:
                token = f'NOT_{token}'
            negated.append(token)
        
        return negated
    
    def preprocess(self , text):
        """Complete preprocessing pipeline"""

        #clean text
        text = self.clean_text(text)

        #tokenize
        tokens = word_tokenize(text)

        #handle negtaion
        tokens = self.handle_negation(tokens)

        #remove stopwords if enabled
        if self.remove_stopwords:
            tokens = [token for token in tokens if token.lower() not in self.stop_words]

        #lemmatize or stem based on configuration
        if self.use_lemmatization:
            tokens = [self.lemmatizer.lemmatize(token , self.get_wordnet_pos(token))
                      for token in tokens]
            
        else:
            tokens = [self.stemmer.stem(token) for token in tokens]

        return tokens
    
class FeatureExtractor:
    def __init__(self , max_features=5000 , ngram_range=(1,3)):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range = ngram_range,
            min_df = 2,
            max_df=0.95
        )

        def extract_features(self , texts):
            #join tokens back into strings for Tf-IDf
            processed_texts = [' '.join(tokens) for tokens in texts]
            return self.vectorizer.fit_transform(processed_texts)
        

    #usage
def prepare_data(df):
    #initialize preprocessor
    preprocessor = TextPreprocessor(
        remove_stopwords=True,
        use_lemmatization=True
    )

    #preprocess texts
    df['preprocessed_tokens'] = df['review'].apply(preprocessor.preprocess)

    #add sentiment-related features
    df['text_length'] = df['review'].apply(len)
    df['word_count'] = df['review'].apply(lambda x: len(x.split()))
    df['avg_word_length'] = df['text_length']/df['word_count']

    #add sentiment polarity using textblob
    df['polarity'] = df['review'].apply(lambda x: TextBlob(x).sentiment.polarity)
    df['subjectivity'] = df['review'].apply(lambda x: TextBlob(x).sentiment.subjectivity)

    #extract tf-idf features
    feature_extractor = FeatureExtractor(max_features=5000 , ngram_range=(1,3))
    tfidf_features = feature_extractor.extract_features(df['processed_tokens'])

    return df , tfidf_features
        
#training pipeline
def train_model(df):
    #prepare data
    df , tfidf_features = prepare_data(df)

    #split data
    x = tfidf_features
    y = df['sentiment']

  # Add additional features
    additional_features = df[['text_length', 'word_count', 'avg_word_length', 
                            'polarity', 'subjectivity']].values
    
    x_combined = np.hstack((x.toarray() ,additional_features))

    return x_combined , y
    
        

In [21]:

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

preprocessor = TextPreprocessor(
    remove_stopwords=True,
    use_lemmatization=True
)

#prepare features and train model
x,y = train_model(df)

#split data with stratification
x_train , x_test , y_train , y_test = train_test_split(
    x , y , test_size=0.2 , random_state = 42 , stratify=y
)

#Train models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

#create individual models
nb = MultinomialNB()
xgb_model = xgb.XGBClassifier(
    max_depth=3,
    objective='binary:logistic',
    eval_metric='logloss',
    colsample_bytree=0.7962, 
    gamma=0.2323,
    learning_rate=0.0343,
    max_depth=4,  # Changed from float to int
    min_child_weight=1.585,
    n_estimators=477,  # Changed from float to int
    subsample=0.9828
)

#create ensembles
ensemble = VotingClassifier(
    estimators=[('nb',nb) , ('xgb' , xgb_model)],
    voting='soft'
)

#train and evaluate
ensemble.fit(x_train , y_train)
y_pred = ensemble.predict(x_test)

#print metrics
print(classification_report(y_test , y_pred))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['preprocessed_tokens'] = df['review'].apply(preprocessor.preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['review'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['word_count'] = df['review'].apply(lambda x: len(x.split()))
A value is trying to be s

AttributeError: 'FeatureExtractor' object has no attribute 'extract_features'

Fixed bugs

In [11]:
import re
import string
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tag import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
import contractions
import emoji

class TextPreprocessor:
    def __init__(self, remove_stopwords=True, use_lemmatization=True):
        # Download all required NLTK resources
        resources = [
            'punkt',
            'stopwords',
            'averaged_perceptron_tagger',  # Changed from averaged_perceptron_tagger_eng
            'wordnet'
        ]
        
        print("Downloading required NLTK resources...")
        for resource in resources:
            try:
                nltk.data.find(f'tokenizers/{resource}')
            except LookupError:
                print(f"Downloading {resource}...")
                nltk.download(resource, quiet=True)
            
        self.lemmatizer = WordNetLemmatizer()
        self.stemmer = PorterStemmer()
        self.remove_stopwords = remove_stopwords
        self.use_lemmatization = use_lemmatization
        self.stop_words = set(stopwords.words('english'))
        # Add custom stopwords that might be irrelevant for sentiment
        self.stop_words.update(['movie', 'film', 'show', 'watch', 'seen'])
        
    def get_wordnet_pos(self, word):
        """Map POS tag to WordNet POS tag for better lemmatization"""
        try:
            tag = pos_tag([word])[0][1][0].upper()
            tag_dict = {
                "J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV
            }
            return tag_dict.get(tag, wordnet.NOUN)
        except Exception as e:
            # If POS tagging fails, return NOUN as default
            return wordnet.NOUN
    
    def clean_text(self, text):
        """Initial text cleaning"""
        if not isinstance(text, str):
            text = str(text)
            
        # Convert to lowercase
        text = text.lower()
        
        # Replace contractions
        text = contractions.fix(text)
        
        # Convert emojis to text
        text = emoji.demojize(text)
        
        # Remove URLs
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)
        
        # Remove HTML tags
        text = re.sub(r'<.*?>', '', text)
        
        # Handle special characters and numbers
        text = re.sub(r'[^a-zA-Z\s]', ' ', text)
        
        # Remove extra whitespace
        text = ' '.join(text.split())
        
        return text
    
    def handle_negation(self, tokens):
        """Add NOT_ prefix to words following negative words until punctuation"""
        negation_words = {'not', 'no', 'never', 'none', "n't", 'neither', 'nor'}
        negated = []
        negate = False
        
        for token in tokens:
            if token in negation_words:
                negate = True
            elif token in string.punctuation:
                negate = False
            elif negate:
                token = f'NOT_{token}'
            negated.append(token)
        return negated
    
    def preprocess(self, text):
        """Complete preprocessing pipeline"""
        try:
            # Clean text
            text = self.clean_text(text)
            
            # Tokenize
            tokens = word_tokenize(text)
            
            # Handle negation
            tokens = self.handle_negation(tokens)
            
            # Remove stopwords if enabled
            if self.remove_stopwords:
                tokens = [token for token in tokens if token.lower() not in self.stop_words]
            
            # Lemmatize or stem based on configuration
            if self.use_lemmatization:
                tokens = [self.lemmatizer.lemmatize(token, self.get_wordnet_pos(token)) 
                            for token in tokens]
            else:
                tokens = [self.stemmer.stem(token) for token in tokens]
                
            return tokens
        except Exception as e:
            print(f"Error preprocessing text: {str(e)}")
            # Return original text split into tokens as fallback
            return text.lower().split()

class FeatureExtractor:
    def __init__(self, max_features=5000, ngram_range=(1, 3)):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            min_df=2,
            max_df=0.95
        )
        
    def fit_transform(self, texts):
        """Fit and transform the texts to TF-IDF features"""
        # Join tokens back into strings for TF-IDF
        processed_texts = [' '.join(tokens) if isinstance(tokens, list) else tokens 
                         for tokens in texts]
        return self.vectorizer.fit_transform(processed_texts)
    
    def transform(self, texts):
        """Transform new texts using fitted vectorizer"""
        processed_texts = [' '.join(tokens) if isinstance(tokens, list) else tokens 
                         for tokens in texts]
        return self.vectorizer.transform(processed_texts)

def prepare_data(df):
    """Prepare data with all preprocessing steps and feature extraction"""
    # Initialize preprocessor
    preprocessor = TextPreprocessor(
        remove_stopwords=True,
        use_lemmatization=True
    )
    
    # Preprocess texts
    print("Preprocessing texts...")
    df['processed_tokens'] = df['review'].apply(preprocessor.preprocess)
    
    # Add sentiment-related features
    print("Extracting basic features...")
    df['text_length'] = df['review'].apply(len)
    df['word_count'] = df['review'].apply(lambda x: len(str(x).split()))
    df['avg_word_length'] = df['text_length'] / df['word_count']
    
    # Add sentiment polarity using TextBlob
    print("Calculating sentiment scores...")
    df['polarity'] = df['review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    df['subjectivity'] = df['review'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)
    
    # Extract TF-IDF features
    print("Extracting TF-IDF features...")
    feature_extractor = FeatureExtractor(max_features=5000, ngram_range=(1, 3))
    tfidf_features = feature_extractor.fit_transform(df['processed_tokens'])
    
    return df, tfidf_features
# Load your data
import nltk
# Download the required NLTK resource
nltk.download('averaged_perceptron_tagger')

def train_model(df):
    """Prepare data and return features and target"""
    # Prepare data
    df, tfidf_features = prepare_data(df)
    
    # Split data
    X = tfidf_features
    y = df['sentiment']
    
    # Add additional features
    additional_features = df[['text_length', 'word_count', 'avg_word_length', 
                            'polarity', 'subjectivity']].values
    
    # Combine TF-IDF with additional features
    X_combined = np.hstack((X.toarray(), additional_features))
    
    return X_combined, y

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Shiva\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [12]:

from sklearn.preprocessing import MinMaxScaler

# Prepare features and train model
X, y = train_model(df)

# Scale features to range [0, 1]
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Split data with stratification
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train models
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
import xgboost as xgb

# Create individual models
nb = MultinomialNB()
xgb_model = xgb.XGBClassifier(
    max_depth=6,
    n_estimators=200,
    learning_rate=0.1
)

# Create ensemble
ensemble = VotingClassifier(
    estimators=[('nb', nb), ('xgb', xgb_model)],
    voting='soft'
)

# Train and evaluate
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)

# Print metrics
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

Downloading required NLTK resources...
Downloading stopwords...
Downloading averaged_perceptron_tagger...
Downloading wordnet...
Preprocessing texts...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['processed_tokens'] = df['review'].apply(preprocessor.preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text_length'] = df['review'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['word_count'] = df['review'].apply(lambda x: len(str(x).split()))
A value is trying to be

Extracting basic features...
Calculating sentiment scores...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['polarity'] = df['review'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['subjectivity'] = df['review'].apply(lambda x: TextBlob(str(x)).sentiment.subjectivity)


Extracting TF-IDF features...
              precision    recall  f1-score   support

    negative       0.82      0.87      0.84        53
    positive       0.84      0.79      0.81        47

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100

