# Downloading all the dependencies

In [None]:
# --- Step 1: Install all required Python packages ---
# The '!' allows us to run shell commands directly from the notebook.
# We are installing all the libraries identified in the import statements.
!pip install pandas numpy beautifulsoup4 emoji nltk spacy tqdm textblob scikit-learn distance fuzzywuzzy python-Levenshtein

# --- Step 2: Download necessary NLTK data ---
# This command downloads the 'stopwords' corpus used for text cleaning.
import nltk
nltk.download('stopwords')

# --- Step 3: Download the SpaCy language model ---
# This downloads the small English model required for lemmatization.
!python -m spacy download en_core_web_sm

print("\n✅ All dependencies have been installed successfully!")

# Importing Dependencies

In [None]:
import pandas as pd
import numpy as np
import re
import string
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import spacy
from tqdm import tqdm
from bs4 import BeautifulSoup

# Importing dataset

In [None]:
import pandas as pd

# Read the training CSV and keep only its first 30 rows so the rest of the
# notebook runs on a small sample.
df = pd.read_csv('Dataset/train.csv').head(30)

# Plain-text preview of the first rows of the sample.
print(df.head())

# Names of the text columns that are handed to the preprocessing function.
column_names = ["question1", "question2"]

# Preprocessing Block

In [None]:
def preprocess_text_data(df, column_names,
                         # --- General Cleaning ---
                         lower_case=True,
                         remove_html=True,
                         remove_urls=True,
                         # --- Word & Character Handling ---
                         replace_special_chars=True,
                         decontract_words=True,
                         chat_word_treatment=True, # Accepted for compatibility; no step is implemented for it.
                         handle_emojis='replace', # Options: 'replace' (with text) or 'remove'
                         # --- Advanced NLP ---
                         remove_punc=True,
                         remove_stopwords=True,
                         spell_correction=False, # Disabled by default due to slowness
                         root_word_reduction='lemmatize'): # Options: 'lemmatize', 'stem', or None
    """
    Clean the given text columns of a DataFrame and return the result as a new DataFrame.

    The enabled steps run in this fixed order for every column, each with its
    own labelled progress bar: HTML removal, URL removal, lowercasing,
    special-character replacement, contraction expansion, emoji handling,
    spell correction, punctuation removal, stopword removal, root-word
    reduction. Missing values are treated as empty strings (they are not
    turned into the literal text "nan").

    Optional third-party resources (spaCy model, NLTK stopwords, emoji,
    TextBlob) are only imported/loaded when the step that needs them is enabled.

    Args:
        df (pd.DataFrame): The input DataFrame. It is not modified.
        column_names (list or str): Column name(s) in the DataFrame to preprocess.
            A single name may be passed as a plain string.
        lower_case (bool, optional): If True, converts text to lowercase. Defaults to True.
        remove_html (bool, optional): If True, removes HTML tags from text. Defaults to True.
        remove_urls (bool, optional): If True, removes URLs from text. Defaults to True.
        replace_special_chars (bool, optional): If True, spells out %, $, ₹, € and @,
            drops the '[math]' marker and abbreviates round numbers
            (e.g. '1,000' / '1000' -> '1k', '1000000' -> '1m'). Defaults to True.
        decontract_words (bool, optional): If True, expands contractions
            (e.g., "don't" to "do not"). Defaults to True.
        chat_word_treatment (bool, optional): Placeholder; currently has no effect. Defaults to True.
        handle_emojis (str, optional): 'replace' converts emojis to their text description,
            'remove' removes them; any other value skips the step. Defaults to 'replace'.
        remove_punc (bool, optional): If True, removes punctuation from text. Defaults to True.
        remove_stopwords (bool, optional): If True, removes NLTK English stopwords. Defaults to True.
        spell_correction (bool, optional): If True, applies spell correction using TextBlob.
            Can be slow for large datasets. Defaults to False.
        root_word_reduction (str, optional): 'lemmatize' uses spaCy's 'en_core_web_sm' model,
            'stem' uses the Porter stemmer, any other value skips this step.
            Defaults to 'lemmatize'.

    Returns:
        pd.DataFrame: A copy of `df` with the specified text columns preprocessed.
    """
    import re
    import string

    # A bare string would otherwise be iterated character by character.
    if isinstance(column_names, str):
        column_names = [column_names]

    # Work on a copy so the caller's DataFrame is left untouched.
    processed_df = df.copy()

    # --- Helper Functions ---

    def _remove_html_tags(text):
        """Removes HTML tags from a given text using BeautifulSoup."""
        from bs4 import BeautifulSoup
        return BeautifulSoup(text, "html.parser").get_text()

    url_pattern = re.compile(r'https?://\S+|www\.\S+')

    def _remove_url(text):
        """Removes URLs from a given text using a regular expression."""
        return url_pattern.sub('', text)

    # (pattern, replacement) pairs, applied in order from billions down to thousands.
    # The comma-grouped forms must follow a digit and must not be followed by
    # another digit group, so '1,000,500' is left alone. The plain forms must not
    # be followed by a further digit, so '10001' is not turned into '1k1'.
    number_shorthands = [
        (re.compile(r'(?<=\d),000,000,000(?!,?\d)'), 'b'),
        (re.compile(r'(?<=\d),000,000(?!,?\d)'), 'm'),
        (re.compile(r'(?<=\d),000(?!,?\d)'), 'k'),
        (re.compile(r'([0-9]+)000000000(?!\d)'), r'\1b'),
        (re.compile(r'([0-9]+)000000(?!\d)'), r'\1m'),
        (re.compile(r'([0-9]+)000(?!\d)'), r'\1k'),
    ]

    def _replace_special_chars(text):
        """
        Spells out common symbols and abbreviates round numbers (e.g. '1000' -> '1k').
        """
        text = text.replace('%', ' percent')
        text = text.replace('$', ' dollar ')
        text = text.replace('₹', ' rupee ')
        text = text.replace('€', ' euro ')
        text = text.replace('@', ' at ')
        text = text.replace('[math]', '') # Specific pattern removal for '[math]'
        for pattern, replacement in number_shorthands:
            text = pattern.sub(replacement, text)
        return text

    # Dictionary containing common English contractions and their expanded forms
    contractions = {
        "ain't": "am not", "aren't": "are not", "can't": "can not", "can't've": "can not have",
        "'cause": "because", "could've": "could have", "couldn't": "could not", "couldn't've": "could not have",
        "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not",
        "hadn't've": "had not have", "hasn't": "has not", "haven't": "have not", "he'd": "he would",
        "he'd've": "he would have", "he'll": "he will", "he'll've": "he will have", "he's": "he is",
        "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
        "i'd": "i would", "i'd've": "i would have", "i'll": "i will", "i'll've": "i will have",
        "i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
        "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have", "it's": "it is",
        "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have",
        "mightn't": "might not", "mightn't've": "might not have", "must've": "must have",
        "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not",
        "needn't've": "need not have", "o'clock": "of the clock", "oughtn't": "ought not",
        "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not",
        "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have",
        "she'll": "she will", "she'll've": "she will have", "she's": "she is",
        "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have",
        "so've": "so have", "so's": "so as", "that'd": "that would", "that'd've": "that would have",
        "that's": "that is", "there'd": "there would", "there'd've": "there would have",
        "there's": "there is", "they'd": "they would", "they'd've": "they would have",
        "they'll": "they will", "they'll've": "they will have", "they're": "they are",
        "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would",
        "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
        "we've": "we have", "weren't": "were not", "what'll": "what will",
        "what'll've": "what will have", "what're": "what are", "what's": "what is",
        "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did",
        "where's": "where is", "where've": "where have", "who'll": "who will",
        "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is",
        "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
        "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have",
        "y'all": "you all", "y'all'd": "you all would", "y'all'd've": "you all would have",
        "y'all're": "you all are", "y'all've": "you all have", "you'd": "you would",
        "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
        "you're": "you are", "you've": "you have"
    }

    # Punctuation that may surround a word; the apostrophe is kept because it is
    # part of the contraction keys themselves (e.g. "'cause").
    edge_punctuation = string.punctuation.replace("'", "")

    def _decontract_words(text):
        """
        Expands contractions using the `contractions` dictionary.

        The lookup ignores case, tolerates punctuation attached to the word
        ("don't," / "(can't)") and treats the typographic apostrophe ’ as '.
        """
        text = text.replace('’', "'")
        expanded = []
        for word in text.split():
            core = word.strip(edge_punctuation)
            replacement = contractions.get(core.lower())
            if replacement is None:
                expanded.append(word)
            else:
                # Re-attach whatever punctuation surrounded the contraction.
                head, _, tail = word.partition(core)
                expanded.append(head + replacement + tail)
        return ' '.join(expanded)

    # NOTE(review): the last range below (U+24C2–U+1F251) is very wide and also
    # spans non-emoji characters such as CJK ideographs, so 'remove' deletes
    # those too. Kept as-is to preserve existing output; narrow it if the data
    # contains such text.
    emoji_pattern = re.compile("["
                               "\U0001F600-\U0001F64F"  # emoticons
                               "\U0001F300-\U0001F5FF"  # symbols & pictographs
                               "\U0001F680-\U0001F6FF"  # transport & map symbols
                               "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               "\U00002702-\U000027B0"
                               "\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)

    def _remove_emoji(text):
        """Removes every character matched by `emoji_pattern`."""
        return emoji_pattern.sub('', text)

    punctuation_table = str.maketrans('', '', string.punctuation)

    def _remove_punc(text):
        """Removes all ASCII punctuation characters."""
        return text.translate(punctuation_table)

    # --- Build the pipeline: (progress-bar label, function) in execution order ---
    steps = []
    if remove_html:
        steps.append(("removing HTML", _remove_html_tags))
    if remove_urls:
        steps.append(("removing URLs", _remove_url))
    if lower_case:
        steps.append(("lowercasing", lambda text: text.lower()))
    if replace_special_chars:
        steps.append(("replacing special characters", _replace_special_chars))
    if decontract_words:
        steps.append(("expanding contractions", _decontract_words))

    # Chat word treatment would go here; `chat_word_treatment` currently has no effect.

    if handle_emojis == 'replace':
        import emoji
        steps.append(("converting emojis to text", lambda text: emoji.demojize(text)))
    elif handle_emojis == 'remove':
        steps.append(("removing emojis", _remove_emoji))

    if spell_correction:
        # Imported only when needed; correction itself can be very slow.
        from textblob import TextBlob
        steps.append(("correcting spelling", lambda text: str(TextBlob(text).correct())))
    if remove_punc:
        steps.append(("removing punctuation", _remove_punc))
    if remove_stopwords:
        from nltk.corpus import stopwords
        english_stopwords = set(stopwords.words('english')) # set for fast membership tests
        steps.append(("removing stopwords",
                      lambda text: " ".join(word for word in text.split()
                                            if word not in english_stopwords)))

    lemmatizer_missing = False
    if root_word_reduction == 'lemmatize':
        import spacy
        try:
            nlp = spacy.load('en_core_web_sm')
        except OSError:
            print("SpaCy model 'en_core_web_sm' not found. Please run: python -m spacy download en_core_web_sm")
            # Best effort: carry on without lemmatization instead of failing.
            lemmatizer_missing = True
        else:
            steps.append(("lemmatizing",
                          lambda text: " ".join(token.lemma_ for token in nlp(text))))
    elif root_word_reduction == 'stem':
        from nltk.stem import PorterStemmer
        ps = PorterStemmer()
        steps.append(("stemming",
                      lambda text: " ".join(ps.stem(word) for word in text.split())))

    # --- Apply the pipeline to every requested column ---
    for col in column_names:
        print(f"\n--- Processing Column: {col} ---")
        # Missing values become '' (astype(str) alone would yield the word 'nan').
        column = processed_df[col].fillna('').astype(str)

        for label, transform in steps:
            # Re-register progress_apply so each bar names the column and step.
            tqdm.pandas(desc=f"{col}: {label}")
            column = column.progress_apply(transform)

        processed_df[col] = column

        if lemmatizer_missing:
            print(f"Skipping lemmatization for column '{col}' as spaCy model was not loaded.")

    return processed_df

In [None]:
# Clean the columns listed in `column_names` with the default preprocessing
# options; the function works on a copy, so `df` itself is left unchanged.
new_df = preprocess_text_data(df, column_names)

In [None]:
# Preview the first rows of the preprocessed DataFrame.
new_df.head()

# Feature Engineering

In [None]:
#Basic Features

def _unique_words(text):
    """Return the set of distinct, lower-cased whitespace-separated tokens in `text`."""
    # split() with no argument ignores leading/trailing/repeated whitespace and
    # yields no token for an empty string, whereas split(" ") would yield [''],
    # which made two empty questions appear to share one word.
    return {word.lower() for word in text.split()}


def common_words(row):
    """Number of distinct words that appear in both question1 and question2 of `row`."""
    return len(_unique_words(row['question1']) & _unique_words(row['question2']))


def total_words(row):
    """Sum of the distinct-word counts of question1 and question2 of `row`."""
    return len(_unique_words(row['question1'])) + len(_unique_words(row['question2']))


# Length of each question in characters.
new_df['q1_len'] = new_df['question1'].str.len()
new_df['q2_len'] = new_df['question2'].str.len()

# Number of tokens in each question (0 for an empty question).
new_df['q1_num_words'] = new_df['question1'].apply(lambda text: len(text.split()))
new_df['q2_num_words'] = new_df['question2'].apply(lambda text: len(text.split()))

new_df['word_common'] = new_df.apply(common_words, axis=1)
new_df['word_total'] = new_df.apply(total_words, axis=1)

# word_share = shared distinct words / total distinct words, rounded to 2 places.
# Rows where both questions are empty have word_total == 0; mask the zero
# denominator and report a share of 0 for them instead of NaN.
nonzero_word_total = new_df['word_total'].where(new_df['word_total'] > 0)
new_df['word_share'] = (new_df['word_common'] / nonzero_word_total).fillna(0).round(2)

new_df.head()

In [None]:
# Advanced Features (Part 1)
from nltk.corpus import stopwords

def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, reading the corpus only once."""
    cached = getattr(_english_stop_words, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words("english"))
        # Remember the set on the function object so later rows reuse it.
        _english_stop_words._cache = cached
    return cached


def fetch_token_features(row, stop_words=None):
    """
    Compute eight token-overlap features for one question pair.

    Args:
        row: Mapping with string values under 'question1' and 'question2'
            (e.g. a DataFrame row).
        stop_words: Optional collection of words to treat as stop words.
            Defaults to NLTK's English stop-word list, loaded once and reused.

    Returns:
        list: [cwc_min, cwc_max, csc_min, csc_max, ctc_min, ctc_max,
               last_word_eq, first_word_eq], where
            cwc_* = common non-stop words / (min|max) non-stop-word count,
            csc_* = common stop words / (min|max) stop-word count,
            ctc_* = common tokens / (min|max) token count,
            last_word_eq / first_word_eq = 1 if the last / first tokens match, else 0.
        All eight values are 0.0 when either question has no tokens.
    """
    # Small constant added to every denominator to avoid division by zero.
    SAFE_DIV = 0.0001

    token_features = [0.0] * 8

    # Converting the sentences into tokens
    q1_tokens = row['question1'].split()
    q2_tokens = row['question2'].split()

    if not q1_tokens or not q2_tokens:
        return token_features

    if stop_words is None:
        stop_words = _english_stop_words()
    elif not isinstance(stop_words, (set, frozenset)):
        # Sets give constant-time membership tests.
        stop_words = frozenset(stop_words)

    # Distinct non-stop words and stop words of each question
    q1_words = {word for word in q1_tokens if word not in stop_words}
    q2_words = {word for word in q2_tokens if word not in stop_words}
    q1_stops = {word for word in q1_tokens if word in stop_words}
    q2_stops = {word for word in q2_tokens if word in stop_words}

    common_word_count = len(q1_words & q2_words)
    common_stop_count = len(q1_stops & q2_stops)
    common_token_count = len(set(q1_tokens) & set(q2_tokens))

    token_features[0] = common_word_count / (min(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[1] = common_word_count / (max(len(q1_words), len(q2_words)) + SAFE_DIV)
    token_features[2] = common_stop_count / (min(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    token_features[3] = common_stop_count / (max(len(q1_stops), len(q2_stops)) + SAFE_DIV)
    # Note: the ctc_* denominators use the raw token counts (duplicates
    # included), while the numerator counts distinct common tokens.
    token_features[4] = common_token_count / (min(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)
    token_features[5] = common_token_count / (max(len(q1_tokens), len(q2_tokens)) + SAFE_DIV)

    # Last word of both questions is the same or not
    token_features[6] = int(q1_tokens[-1] == q2_tokens[-1])

    # First word of both questions is the same or not
    token_features[7] = int(q1_tokens[0] == q2_tokens[0])

    return token_features

# One list of eight token features per row of new_df.
token_features = new_df.apply(fetch_token_features, axis=1)

# Column names in the same order as the entries of each feature list.
token_feature_columns = (
    "cwc_min", "cwc_max",
    "csc_min", "csc_max",
    "ctc_min", "ctc_max",
    "last_word_eq", "first_word_eq",
)

# Spread the per-row lists over one DataFrame column per feature.
for position, column_name in enumerate(token_feature_columns):
    new_df[column_name] = [row_features[position] for row_features in token_features]

new_df.head()

In [None]:
#Advanced Features (Part 2)

import distance

def fetch_length_features(row):
    """
    Compute three length-based features for one question pair.

    Args:
        row: Mapping with string values under 'question1' and 'question2'
            (e.g. a DataFrame row).

    Returns:
        list: [abs_len_diff, mean_len, longest_substr_ratio], where
            abs_len_diff = absolute difference of the two token counts,
            mean_len = mean of the two token counts,
            longest_substr_ratio = length of the longest common substring
                (in characters) / (length of the shorter question + 1).
        All three values are 0.0 when either question has no tokens.
    """
    # Standard-library replacement for distance.lcsubstrings; unlike indexing
    # into that function's result, it also copes with questions that share no
    # characters at all (match size 0 instead of an IndexError).
    from difflib import SequenceMatcher

    q1 = row['question1']
    q2 = row['question2']

    length_features = [0.0] * 3

    # Converting the sentences into tokens
    q1_tokens = q1.split()
    q2_tokens = q2.split()

    if not q1_tokens or not q2_tokens:
        return length_features

    # Absolute difference in number of tokens
    length_features[0] = abs(len(q1_tokens) - len(q2_tokens))

    # Average number of tokens of both questions
    length_features[1] = (len(q1_tokens) + len(q2_tokens)) / 2

    # autojunk=False turns off difflib's "popular element" heuristic so that
    # the match found is the true longest common substring for long inputs too.
    longest = SequenceMatcher(None, q1, q2, autojunk=False).find_longest_match(
        0, len(q1), 0, len(q2)
    )
    length_features[2] = longest.size / (min(len(q1), len(q2)) + 1)

    return length_features

# One list of three length features per row of new_df.
length_features = new_df.apply(fetch_length_features, axis=1)

# Column names in the same order as the entries of each feature list.
length_feature_columns = ('abs_len_diff', 'mean_len', 'longest_substr_ratio')

for position, column_name in enumerate(length_feature_columns):
    new_df[column_name] = [row_features[position] for row_features in length_features]

new_df.head()

In [None]:
# Advanced Features (Fuzzy Features)
from fuzzywuzzy import fuzz

def fetch_fuzzy_features(row):
    """
    Score one question pair with four fuzzywuzzy string-similarity measures.

    Args:
        row: Mapping with the two question texts under 'question1' and 'question2'.

    Returns:
        list: the results of fuzz.QRatio, fuzz.partial_ratio,
        fuzz.token_sort_ratio and fuzz.token_set_ratio, in that order.
    """
    first_question, second_question = row['question1'], row['question2']

    # Order matters: callers map these positions to the columns fuzz_ratio,
    # fuzz_partial_ratio, token_sort_ratio and token_set_ratio.
    scorers = (
        fuzz.QRatio,
        fuzz.partial_ratio,
        fuzz.token_sort_ratio,
        fuzz.token_set_ratio,
    )
    return [scorer(first_question, second_question) for scorer in scorers]

# One list of four fuzzy-matching scores per row of new_df.
fuzzy_features = new_df.apply(fetch_fuzzy_features, axis=1)

# Column names in the same order as the entries of each score list.
fuzzy_feature_columns = ('fuzz_ratio', 'fuzz_partial_ratio', 'token_sort_ratio', 'token_set_ratio')

for position, column_name in enumerate(fuzzy_feature_columns):
    new_df[column_name] = [row_scores[position] for row_scores in fuzzy_features]

print(new_df.shape)
new_df.head()

# Count Vectorizing

In [None]:
# Keep just the two cleaned question columns; they are the input of the
# bag-of-words vectorizer below.
ques_df = new_df.loc[:, ['question1', 'question2']]
ques_df.head()

In [None]:
# Identifier and raw-text columns are not used as model inputs; everything
# that remains is carried over into final_df.
non_feature_columns = ['id', 'qid1', 'qid2', 'question1', 'question2']
final_df = new_df.drop(columns=non_feature_columns)
print(final_df.shape)
final_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Stack all question1 texts followed by all question2 texts so that a single
# vocabulary is learned from both columns.
questions = ques_df['question1'].tolist() + ques_df['question2'].tolist()

# Bag-of-words counts limited to at most 3000 vocabulary terms.
cv = CountVectorizer(max_features=3000)
bow_counts = cv.fit_transform(questions).toarray()

# The first half of the rows belongs to question1, the second half to question2.
q1_arr, q2_arr = np.vsplit(bow_counts, 2)

In [None]:
# Wrap both count arrays in DataFrames that share ques_df's row index, then
# place them side by side (question1 counts first, question2 counts second).
row_index = ques_df.index
temp_df1 = pd.DataFrame(data=q1_arr, index=row_index)
temp_df2 = pd.DataFrame(data=q2_arr, index=row_index)
temp_df = pd.concat((temp_df1, temp_df2), axis='columns')
temp_df.shape

In [None]:
# Rebuild the feature frame from new_df instead of extending final_df in place:
# `final_df = pd.concat([final_df, temp_df])` appended the bag-of-words columns
# again every time this cell was re-run. On a fresh run the result is the same.
engineered_df = new_df.drop(columns=['id', 'qid1', 'qid2', 'question1', 'question2'])
final_df = pd.concat([engineered_df, temp_df], axis=1)
print(final_df.shape)
final_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from scipy.sparse import vstack
from tqdm.auto import tqdm # Import tqdm for the progress bar

# Merge all questions (question1 followed by question2) into a single list
# so the CountVectorizer vocabulary is built from both columns.
questions = list(ques_df['question1']) + list(ques_df['question2'])

# Initialize CountVectorizer with a maximum of 3000 features (most frequent words)
cv = CountVectorizer(max_features=3000)

# Fit on the entire corpus first to build the vocabulary.
print("Fitting the CountVectorizer...")
cv.fit(questions)

# Number of question1 rows; used below to split the stacked matrix again.
num_q1 = len(ques_df['question1'])

# Transform in batches rather than one question per call: the progress bar is
# kept, but the per-call overhead of transform() is paid once per batch
# instead of once per question.
BATCH_SIZE = 10_000
print("Transforming text to a sparse matrix...")
batches = [
    cv.transform(questions[start:start + BATCH_SIZE])
    for start in tqdm(range(0, len(questions), BATCH_SIZE), desc="Processing questions")
]

# Stack the per-batch sparse matrices into one matrix. CSR is requested
# explicitly because the row slicing below relies on it.
sparse_matrix = vstack(batches, format="csr")

# Split the single sparse matrix back into the question1 rows and the
# question2 rows; both parts stay sparse.
q1_sparse_arr = sparse_matrix[:num_q1]
q2_sparse_arr = sparse_matrix[num_q1:]

In [None]:
import pandas as pd
from scipy.sparse import hstack

# q1_sparse_arr and q2_sparse_arr are the sparse bag-of-words matrices built
# in the previous cell.

# Concatenate the two sparse matrices horizontally (side by side), giving one
# row per question pair with the question1 counts followed by the question2
# counts.
feature_matrix = hstack([q1_sparse_arr, q2_sparse_arr])

# Print the shape of the stacked matrix.
# Rows: one per question pair.
# Columns: twice the vocabulary size. The vectorizer was created with
# max_features=3000, so this is at most 6000 and smaller when the corpus
# contains fewer distinct words.
print(feature_matrix.shape)

In [None]:
import pandas as pd
from scipy.sparse import hstack, csr_matrix

# Build the engineered-feature part from new_df rather than final_df: by this
# point final_df also holds the dense bag-of-words columns and the
# 'is_duplicate' label, so converting it would duplicate the BoW features and
# leak the target into the feature matrix.
NON_FEATURE_COLUMNS = ['id', 'qid1', 'qid2', 'question1', 'question2', 'is_duplicate']
engineered_features = new_df.drop(columns=NON_FEATURE_COLUMNS, errors='ignore')

# 1. Convert the engineered features to a sparse matrix so they can be stacked
#    with the sparse BoW matrix.
engineered_features_sparse = csr_matrix(engineered_features.to_numpy(dtype=float))

# 2. Horizontally stack the engineered features with the BoW matrix.
#    CSR is requested because it supports the row indexing used when
#    splitting data into train and test sets.
combined_features_sparse = hstack([engineered_features_sparse, feature_matrix], format="csr")

# Shape: (number of question pairs,
#         number of engineered features + number of BoW columns).
print(combined_features_sparse.shape)

# combined_features_sparse holds features only; take the labels from
# new_df['is_duplicate'] when training on it.

# Note: there is no .head() on a sparse matrix. Converting it to a dense array
# or DataFrame just to inspect it is not advisable for large datasets because
# of the memory it needs.

# Training The Model

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into training and testing sets

# Select the label by name rather than by position, so the split stays correct
# even if the column order of final_df changes.
TARGET_COLUMN = 'is_duplicate'

# y = the target column
y = final_df[TARGET_COLUMN].values

# X = every other column of final_df
X = final_df.drop(columns=[TARGET_COLUMN]).values

# Perform 80/20 train-test split
# random_state ensures reproducibility of the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
import numpy as np

# Total number of estimators (trees) in the final forest
N_ESTIMATORS = 100

# Start with a single tree; warm_start=True lets later fit() calls keep the
# trees already built and only add the missing ones.
rf = RandomForestClassifier(n_estimators=1, warm_start=True, random_state=42)

# Placeholder for per-step accuracies (nothing is appended to it in this cell)
accuracies = []

print("Training Random Forest with a progress bar...")

# Grow the forest one tree per iteration so the progress bar advances once
# for every tree added.
for tree_count in tqdm(range(1, N_ESTIMATORS + 1), total=N_ESTIMATORS, desc="Building Random Forest"):
    # Raise the target size by one, then fit to add the new tree.
    rf.set_params(n_estimators=tree_count)
    rf.fit(X_train, y_train)

# The forest now contains N_ESTIMATORS trees.
print("Random Forest training complete.")

# Predict the labels for the test data
y_pred = rf.predict(X_test)

# Accuracy of the model on the held-out test data
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the model on test data: {accuracy:.4f}")

In [None]:
# Import the necessary libraries
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
import pandas as pd

# Uses y_test (true labels) and y_pred (model predictions) from the previous cells.

# 1. Calculate and display the overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Overall Accuracy: {accuracy:.4f}\n")

# 2. Generate and display the confusion matrix
# labels=[0, 1] forces a 2x2 matrix even when the (small) test split or the
# predictions contain only one class; without it the matrix can be 1x1 and
# the 2x2 labelling below raises a shape error.
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(conf_matrix)

# For better readability, display the confusion matrix as a labelled DataFrame
# (rows = actual class, columns = predicted class).
conf_df = pd.DataFrame(conf_matrix,
                       index=['Actual Negative', 'Actual Positive'],
                       columns=['Predicted Negative', 'Predicted Positive'])
print("\nConfusion Matrix (DataFrame):")
print(conf_df)

# 3. Generate and display the classification report for more detailed metrics
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

In [None]:
import pickle

# Write the trained model and the fitted vectorizer to disk. The `with` blocks
# close (and flush) each file as soon as it is written; passing a bare
# open(...) to pickle.dump left the handles open.
with open('model_final.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

with open('cv_final.pkl', 'wb') as vectorizer_file:
    pickle.dump(cv, vectorizer_file)

print("Vectorizer and model saved successfully!")
