In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Text Processing Libraries
import re
from cleantext import clean
from wordcloud import WordCloud

# NLTK for Natural Language Processing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sklearn Libraries for Preprocessing, Model Training, and Evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import roc_auc_score, roc_curve, auc, multilabel_confusion_matrix

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

# Boosting Algorithms
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

# NLTK Downloads
nltk.download('stopwords')
nltk.download('punkt')

# For saving the model
import joblib

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# DATA EXPLORATION

In [None]:
# Loading the dataset
df = pd.read_csv('../Dataset/OriginalDataset.csv')

In [None]:
# Displaying the first few rows of the dataset
print("Dataset Overview:")
print(df.head())

In [None]:
# Shape of the dataset
print("Shape of the Dataset:", df.shape)

In [None]:
# Basic Information about the Dataset
print("Dataset Info:")
print(df.info())

In [None]:
# Summary statistics of the dataset
print("Dataset Description:")
print(df.describe())

# Exploratory Data Analysis

In [None]:
# Class distribution visualization
sns.countplot(x='label', data=df, palette='Set2')
plt.title('CLASS DISTRIBUTION')
plt.xlabel('CATEGORIES')
plt.ylabel('COUNT')
plt.show()

In [None]:
# Pie chart for class distribution
df['label'].value_counts().plot.pie(autopct='%1.1f%%', startangle=90, colors=sns.color_palette('Set2', 5))
plt.title('PROPORTION OF DIFFERENT CLASSES')
plt.ylabel('')
plt.show()

In [None]:
# Length of tweets analysis
df['tweet_length'] = df['text'].apply(len)

sns.histplot(df['tweet_length'], kde=True, color='purple')
plt.title('TWEET DISTRIBUTION ANALYSIS')
plt.xlabel('TWEET LENGTH')
plt.ylabel('FREQUENCY')
plt.show()

In [None]:
# Tweet length distribution per class
sns.boxplot(x='label', y='tweet_length', data=df, palette='Set3')
plt.title('TWEET DISTRIBUTION PER CATEGORY')
plt.xlabel('CATEGORIES')
plt.ylabel('TWEET LENGTH')
plt.show()

In [None]:
# Word cloud visualization for each class in a 2x2 grid
def generate_wordcloud_2x2(df):
    """Draw one word cloud per distinct value of ``df['label']`` in a two-column grid.

    Args:
        df: DataFrame with a ``label`` column (the category) and a ``text`` column
            (the tweet text joined into each cloud).

    The grid keeps the 2x2 layout for up to four categories and grows by whole
    rows beyond that, so a dataset with more than four labels no longer raises
    IndexError. Panels that have no category are blanked out instead of being
    drawn as empty framed axes.
    """
    categories = df['label'].unique()
    n_cols = 2
    # Never fewer than two rows (the original layout); ceil(len / n_cols) otherwise.
    n_rows = max(2, (len(categories) + n_cols - 1) // n_cols)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(10, 3 * n_rows))

    axs = axs.flatten()  # Flattening to easily iterate over axes
    for ax, category in zip(axs, categories):
        category_data = df[df['label'] == category]['text']
        wc = WordCloud(width=800, height=400, max_words=200, background_color='white').generate(' '.join(category_data))
        ax.imshow(wc, interpolation='bilinear')
        ax.set_title(f'WORD CLOUD FOR {category}')
        ax.axis('off')

    # Hide any grid cells left over when the category count does not fill the grid
    for ax in axs[len(categories):]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()

# Generate word cloud for each category in 2x2 layout
generate_wordcloud_2x2(df)

In [None]:
# Most common words across ALL tweets (not split by class) using CountVectorizer.
# max_features=20 keeps only the 20 most frequent non-stopword terms in the corpus.
vectorizer = CountVectorizer(stop_words='english', max_features=20)
X = vectorizer.fit_transform(df['text'])

# Barplot of most common words
common_words = vectorizer.get_feature_names_out()
word_counts = X.sum(axis=0).A1
common_word_df = pd.DataFrame({'word': common_words, 'count': word_counts})

sns.barplot(x='count', y='word', data=common_word_df.sort_values(by='count', ascending=False), palette='viridis')
plt.title('TOP 20 MOST COMMON WORDS IN TWEETS')
plt.xlabel('COUNT')
plt.ylabel('WORDS')
plt.show()

# DATA PREPROCESSING

In [None]:
# Function to clean a given sentence by removing or modifying unwanted elements
def clean_sentence(text):
    """Normalise one tweet: drop @mentions, URLs, e-mails, numbers, punctuation; lowercase.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lower-cased, single-line string produced by ``cleantext.clean``.
    """
    # Strip @mentions BEFORE punctuation removal. clean(no_punct=True) deletes the
    # "@" character, so a mention filter that runs afterwards can no longer find
    # anything and the handle survives as an ordinary word. The look-behind leaves
    # e-mail addresses (word character before "@") for clean(no_emails=True).
    if isinstance(text, str):
        text = re.sub(r"(?<!\w)@\w+", "", text)

    # Use the clean-text library to perform various cleaning tasks on the text
    cleaned_text = clean(
        text,
        to_ascii=True,               # Convert Unicode characters to their closest ASCII equivalent
        lower=True,                  # Convert all characters in the text to lowercase
        no_line_breaks=True,         # Remove line breaks to make the text a single line
        no_urls=True,                # Remove any URLs from the text
        no_emails=True,              # Remove email addresses
        no_phone_numbers=True,       # Remove phone numbers
        no_numbers=True,             # Remove all numeric values (e.g., "123")
        no_digits=True,              # Remove digit characters (e.g., "1", "2")
        no_currency_symbols=True,    # Remove any currency symbols (e.g., "$", "€")
        no_punct=True,               # Remove punctuation marks (e.g., ".", ",", "!")
        replace_with_punct="",       # Specify what to replace punctuation marks with
        replace_with_url="",         # Specify what to replace URLs with
        replace_with_email="",       # Specify what to replace email addresses with
        replace_with_phone_number="",# Specify what to replace phone numbers with
        replace_with_number="",      # Specify what to replace numbers with
        replace_with_digit="",       # Specify what to replace digit characters with
        replace_with_currency_symbol="", # Specify what to replace currency symbols with
        lang="en"                    # Set the language to English for language-specific cleaning
    )
    return cleaned_text

In [None]:
# Character class matching emoji and pictographic symbols. Compiled once so that
# applying remove_emojis to every row does not rebuild the pattern each time.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons (smiley faces and other facial expressions)
    "\U0001F300-\U0001F5FF"  # Miscellaneous symbols and pictographs
    "\U0001F680-\U0001F6FF"  # Transport and map symbols
    "\U0001F1E0-\U0001F1FF"  # Regional indicator letters (paired to form flags)
    "\U00002500-\U00002BEF"  # Box drawing, shapes, misc symbols, dingbats, arrows (not CJK text)
    "\U00002702-\U000027B0"  # Dingbats
    "\U0001F926-\U0001F937"  # Human gesture emoji
    "\U00010000-\U0010FFFF"  # Catch-all: every code point outside the Basic Multilingual Plane
    "\u2640-\u2642"          # Gender symbols
    "\u2600-\u2B55"          # Miscellaneous symbols through the large circle
    "\u200D"                 # Zero-width joiner, used for combining emoji sequences
    "\u23CF"                 # Eject button symbol
    "\u23E9"                 # Fast forward button
    "\u231A"                 # Watch symbol
    "\uFE0F"                 # Variation selector, used to specify emoji styles
    "\u3030"                 # Wavy dash
    "\u303D"                 # Part alternation mark
    "\u3297\u3299"           # Circled ideograph "congratulation" / "secret"
    "\u24C2"                 # Circled M
    "]+"                     # One or more consecutive matches
)


def remove_emojis(data):
    """Return ``data`` with emoji and pictographic symbols removed.

    The previous pattern contained the range U+24C2-U+1F251, which spans almost
    the whole Basic Multilingual Plane above U+24C2 and therefore also deleted
    ordinary CJK, Hangul and kana text. That range is replaced by the individual
    emoji code points it was meant to cover, so non-emoji scripts are preserved.
    Code points outside the BMP are still removed wholesale, as before.

    Args:
        data: Input string.

    Returns:
        The string with every run of matching characters replaced by ''.
    """
    return _EMOJI_PATTERN.sub('', data)

In [None]:
# Stopword sets keyed by language, filled on first use. Reading the NLTK corpus
# once instead of once per row matters because this function is applied to every tweet.
_STOPWORD_CACHE = {}


# Remove stopwords
def remove_stopwords(text, language='english'):
    """Drop stopwords from a whitespace-separated string.

    Args:
        text: Input string; it is split on whitespace.
        language: NLTK stopword list to use. Defaults to 'english', which is
            what every existing caller relies on.

    Returns:
        The remaining words joined by single spaces. Matching is
        case-insensitive; the surviving words keep their original casing.
    """
    stop_words = _STOPWORD_CACHE.get(language)
    if stop_words is None:
        stop_words = frozenset(stopwords.words(language))
        _STOPWORD_CACHE[language] = stop_words

    return ' '.join(word for word in text.split() if word.lower() not in stop_words)

In [None]:
# Remove mentions like @hello
def remove_mentions(text):
    """Delete every ``@handle`` token ("@" followed by one or more word characters)."""
    handle_re = re.compile(r"@\w+")
    without_handles = handle_re.sub('', text)
    return without_handles

In [None]:
# Function to clean the text column
def clean_text(df, col):
    """Run the text-cleaning steps over ``df[col]`` in place and return ``df``.

    The steps are applied in this order: clean_sentence, remove_emojis,
    remove_stopwords, remove_mentions. The column of the frame that was passed
    in is overwritten after each step.
    """
    cleaning_steps = (clean_sentence, remove_emojis, remove_stopwords, remove_mentions)
    for step in cleaning_steps:
        df[col] = df[col].apply(func=step)
    return df

In [None]:
# Apply the cleaning function to the dataset
df = clean_text(df, 'text')

In [None]:
# Displaying the first few rows of the cleaned dataset
print("Cleaned Dataset Overview:")
print(df.head())

# DATA PREPARATION


In [None]:
# Encoding labels
label_enc = LabelEncoder()
df['label_encoded'] = label_enc.fit_transform(df['label'].values)

In [None]:
# Create a dictionary for original labels and their corresponding encoded values
label_mapping_dict = dict(zip(label_enc.classes_, range(len(label_enc.classes_))))

# Print the label mapping
print("Label Mapping:", label_mapping_dict)

In [None]:
# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label_encoded'], test_size=0.2, random_state=42)
print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

In [None]:
# Creating a TF-IDF Vectorizer
tfv = TfidfVectorizer(
    min_df=3,
    strip_accents='unicode',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    sublinear_tf=True
)

In [None]:
# Fit the TF-IDF vectorizer on the training data
tfv.fit(X_train)

In [None]:
# Transform the training and test data
X_train_tfv = tfv.transform(X_train)
X_test_tfv = tfv.transform(X_test)

In [None]:
# Display the shapes of the transformed datasets
print("TF-IDF Training set shape:", X_train_tfv.shape)
print("TF-IDF Testing set shape:", X_test_tfv.shape)

In [None]:
# Saving the TF-IDF Vectorizer to a .pkl file
joblib.dump(tfv, '../Artifacts/TFIDFVectorizer.pkl')

# MODELS

## Random Forest

In [None]:
# Initialize the Random Forest Classifier
model1 = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# Train the model
model1.fit(X_train_tfv, y_train)

In [None]:
# Make predictions on the test set
y_pred_rf = model1.predict(X_test_tfv)

In [None]:
# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred_rf, target_names=label_enc.classes_))

In [None]:
# Accuracy score
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy Score: {accuracy_rf:.4f}")

In [None]:
# Save the trained model
joblib.dump(model1, '../Artifacts/RandomForest.pkl')

## Naive Bayes

In [None]:
# Creating and fitting the Naive Bayes model
model2 = MultinomialNB()
model2.fit(X_train_tfv, y_train)

In [None]:
# Making predictions
y_pred_nb = model2.predict(X_test_tfv)

In [None]:
# Evaluating the model
print("Classification Report:")
print(classification_report(y_test, y_pred_nb, target_names=label_enc.classes_))

In [None]:
# Accuracy score
accuracy_nb = accuracy_score(y_test, y_pred_nb)
print(f"Accuracy Score: {accuracy_nb:.4f}")

In [None]:
# Save the trained model
joblib.dump(model2, '../Artifacts/NaiveBayes.pkl')

## SVM with OvO

In [None]:
# Function to train and evaluate SVM model with OvO strategy
def train_and_evaluate_svm_ovo(X_train, X_test, y_train, y_test):
    """Train a linear-kernel SVC with ``decision_function_shape='ovo'`` and report test metrics.

    Args:
        X_train, X_test: Feature matrices passed to the MaxAbsScaler + SVC pipeline.
        y_train, y_test: Encoded class labels for the two splits.

    Returns:
        (fitted pipeline, test accuracy, one-vs-rest ROC AUC computed from predict_proba).

    Prints the classification report and the accuracy. Class names in the report
    come from the notebook-level ``label_enc``, matching the other model sections.
    """
    # random_state fixes the internal shuffling used for the probability=True
    # calibration, so predict_proba (and the ROC AUC below) is repeatable across runs.
    model3 = make_pipeline(
        MaxAbsScaler(),
        svm.SVC(kernel='linear', decision_function_shape='ovo', probability=True, random_state=42),
    )

    # Train the model
    model3.fit(X_train, y_train)

    # Make predictions
    y_pred_ovo = model3.predict(X_test)

    # Evaluating the model
    print("Classification Report:")
    print(classification_report(y_test, y_pred_ovo, target_names=label_enc.classes_))

    # Accuracy score
    accuracy_ovo = accuracy_score(y_test, y_pred_ovo)
    print(f"Accuracy Score: {accuracy_ovo:.4f}\n")

    # Calculate ROC AUC score for multi-class classification using predict_proba
    roc_auc = roc_auc_score(y_test, model3.predict_proba(X_test), multi_class='ovr')

    # Return the trained model, its accuracy and its ROC AUC
    return model3, accuracy_ovo, roc_auc

# Train and evaluate the model, and get the accuracy
model3, accuracy_ovo, roc_auc = train_and_evaluate_svm_ovo(X_train_tfv, X_test_tfv, y_train, y_test)

In [None]:
# Save the trained model
joblib.dump(model3, '../Artifacts/SVM-OvO.pkl')

## SVM with OvR

In [None]:
# Function to train and evaluate SVM model with OvR strategy
def train_and_evaluate_svm_ovr(X_train, X_test, y_train, y_test):
    """Train a linear-kernel SVC with ``decision_function_shape='ovr'`` and report test metrics.

    Args:
        X_train, X_test: Feature matrices passed to the MaxAbsScaler + SVC pipeline.
        y_train, y_test: Encoded class labels for the two splits.

    Returns:
        (fitted pipeline, test accuracy, one-vs-rest ROC AUC computed from predict_proba).

    Prints the classification report (class names from the notebook-level
    ``label_enc``) and the accuracy.

    NOTE(review): per the scikit-learn SVC documentation, SVC always trains
    one-vs-one internally and ``decision_function_shape`` only changes the shape
    of ``decision_function`` output. Predictions are therefore expected to match
    the 'ovo' variant; a true one-vs-rest model would need OneVsRestClassifier.
    """
    # random_state fixes the internal shuffling used for the probability=True
    # calibration, so predict_proba (and the ROC AUC below) is repeatable across runs.
    model4 = make_pipeline(
        MaxAbsScaler(),
        svm.SVC(kernel='linear', decision_function_shape='ovr', probability=True, random_state=42),
    )

    # Train the model
    model4.fit(X_train, y_train)

    # Make predictions
    y_pred_ovr = model4.predict(X_test)

    # Evaluating the model
    print("Classification Report:")
    print(classification_report(y_test, y_pred_ovr, target_names=label_enc.classes_))

    # Accuracy score
    accuracy_ovr = accuracy_score(y_test, y_pred_ovr)
    print(f"Accuracy Score: {accuracy_ovr:.4f}\n")

    # Calculate ROC AUC score for multi-class classification using predict_proba
    roc_auc = roc_auc_score(y_test, model4.predict_proba(X_test), multi_class='ovr')

    # Return the trained model, its accuracy and its ROC AUC
    return model4, accuracy_ovr, roc_auc

# Train and evaluate the model, and get the accuracy
model4, accuracy_ovr, roc_auc = train_and_evaluate_svm_ovr(X_train_tfv, X_test_tfv, y_train, y_test)

In [None]:
# Save the trained model
joblib.dump(model4, '../Artifacts/SVM-OvR.pkl')

## XGBoost

In [None]:
# Create the DMatrix for training and testing
dtrain = xgb.DMatrix(X_train_tfv, label=y_train)
dtest = xgb.DMatrix(X_test_tfv, label=y_test)

In [None]:
# Set parameters for XGBoost
params = {
    'objective': 'multi:softmax',  # Specify multi-class classification
    'num_class': len(label_enc.classes_),  # Number of classes
    'max_depth': 6,  # Maximum tree depth
    'eta': 0.3,  # Learning rate
    'eval_metric': 'mlogloss'  # Evaluation metric
}

In [None]:
# Train the model
model5 = xgb.train(params, dtrain, num_boost_round=100)

In [None]:
# Make predictions
# With the 'multi:softmax' objective the booster returns the predicted class id
# as a float array; cast to int so the predictions have the same integer label
# type as y_test and can be used as label indices.
y_pred_xgb = model5.predict(dtest).astype(int)

In [None]:
# Evaluating the model
print("Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=label_enc.classes_))

In [None]:
# Accuracy score
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy Score: {accuracy_xgb:.4f}")

In [None]:
# Save the trained model
joblib.dump(model5, '../Artifacts/XGBoost.pkl')

## LightGBM

In [None]:
# Create the LightGBM dataset
lgb_train = lgb.Dataset(X_train_tfv, label=y_train)
lgb_test = lgb.Dataset(X_test_tfv, label=y_test)

In [None]:
# Set parameters for LightGBM
lgb_params = {
    'objective': 'multiclass',
    'num_class': len(label_enc.classes_),  # Number of classes
    'metric': 'multi_logloss',  # Evaluation metric
    'boosting_type': 'gbdt',  # Gradient Boosting Decision Tree
    'num_leaves': 31,  # Number of leaves in one tree
    'learning_rate': 0.05,  # Learning rate
    'feature_fraction': 0.9  # Fraction of features to use
}

In [None]:
# Train the model
model6 = lgb.train(lgb_params, lgb_train, num_boost_round=100)

In [None]:
# Make predictions
y_pred_lgb = model6.predict(X_test_tfv)
y_pred_lgb_classes = [np.argmax(x) for x in y_pred_lgb]  # Get the class with the highest probability

In [None]:
# Evaluating the model
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb_classes, target_names=label_enc.classes_))

In [None]:
# Accuracy score
accuracy_lgb = accuracy_score(y_test, y_pred_lgb_classes)
print(f"Accuracy Score: {accuracy_lgb:.4f}")

In [None]:
# Save the trained model
joblib.dump(model6, '../Artifacts/LightGBM.pkl')

## CatBoost

In [None]:
# Initialize CatBoost Classifier
model7 = CatBoostClassifier(
    iterations=100,
    depth=6,
    learning_rate=0.3,
    loss_function='MultiClass',
    eval_metric='MultiClass',
    random_seed=42
)

In [None]:
# Train the model
model7.fit(X_train_tfv, y_train)

In [None]:
# Make predictions
y_pred_catboost = model7.predict(X_test_tfv)

In [None]:
# Evaluating the model
print("Classification Report:")
print(classification_report(y_test, y_pred_catboost, target_names=label_enc.classes_))

In [None]:
# Accuracy score
accuracy_catboost = accuracy_score(y_test, y_pred_catboost)
print(f"Accuracy Score: {accuracy_catboost:.4f}")

In [None]:
# Save the trained model
joblib.dump(model7, '../Artifacts/CatBoost.pkl')

# MODEL COMPARISON


In [None]:
# Dictionary to store model names and their corresponding accuracies
model_accuracies = {
    'Random Forest': accuracy_rf,
    'Naive Bayes': accuracy_nb,
    'SVM (OvO)': accuracy_ovo,
    'SVM (OvR)': accuracy_ovr,
    'XGBoost': accuracy_xgb,
    'LightGBM': accuracy_lgb,
    'CatBoost': accuracy_catboost
}

In [None]:
# Extract model names and their accuracies
models = list(model_accuracies.keys())
accuracies = list(model_accuracies.values())

In [None]:
# Create the accuracy bar graph, annotate each bar, and render it in ONE cell.
# The inline backend shows and closes the current figure at the end of every
# cell, so annotations or plt.show() issued from a later cell would act on a
# new, empty figure instead of this chart.
fig, ax = plt.subplots()
sns.barplot(x=models, y=accuracies, palette='Set2', ax=ax)

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracies', pad=20)
ax.tick_params(axis='x', rotation=90)

# Annotate bars with accuracy values
for i, v in enumerate(accuracies):
    ax.text(i, v + 0.01, f"{v:.4f}", ha='center', va='bottom')

# Show the plot
fig.tight_layout()
plt.show()

In [None]:
# Find the model with the highest accuracy
best_model = max(model_accuracies, key=model_accuracies.get)
best_accuracy = model_accuracies[best_model]

In [None]:
# Print the best model and its accuracy
print(f"Best Model: {best_model} - Accuracy: {best_accuracy:.4f}")

# MODEL EVALUATION

In [None]:
# Load the model using joblib 
model = joblib.load('../Artifacts/SVM-OvO.pkl')

In [None]:
# Predict on the test set 
y_pred_best = model.predict(X_test_tfv) 

In [None]:
# Calculate evaluation metrics 
accuracy = accuracy_score(y_test, y_pred_best) 
precision = precision_score(y_test, y_pred_best, average='weighted') 
recall = recall_score(y_test, y_pred_best, average='weighted') 
f1 = f1_score(y_test, y_pred_best, average='weighted') 
roc_auc = roc_auc_score(y_test, model.predict_proba(X_test_tfv), multi_class='ovr') 

In [None]:
# Print metrics 
print(f'Accuracy: {accuracy:.4f}') 
print(f'Precision: {precision:.4f}') 
print(f'Recall: {recall:.4f}') 
print(f'F1 Score: {f1:.4f}') 
print(f'ROC AUC: {roc_auc:.4f}') 

In [None]:
# Function to display confusion matrix for multilabel classification with class names 
def plot_confusion_matrix(model, X_test, y_test, label_encoder):
    y_pred = model.predict(X_test_tfv) 
    cm = multilabel_confusion_matrix(y_test, y_pred) 
    class_names = label_encoder.classes_  # Get class names from the label encoder

    # Plot confusion matrix for each class 
    for i, matrix in enumerate(cm): 
        ax = sns.heatmap(matrix, annot=True, fmt='d', cmap='viridis') 
        plt.title(f'Confusion Matrix for {class_names[i]}') 
        plt.xlabel('Predicted') 
        plt.ylabel('Actual') 
        cbar = ax.collections[0].colorbar 
        cbar.ax.tick_params() 
        plt.show()

In [None]:
# Call the function to plot the confusion matrix with class names 
plot_confusion_matrix(model, X_test, y_test, label_enc)

In [None]:
# Binarize the output labels for multi-class ROC 
y_test_binarized = label_binarize(y_test, classes=np.unique(y_test))

In [None]:
# Get one score per class for the ROC curves.
# The loaded SVC was built with decision_function_shape='ovo', so decision_function
# returns one column per PAIR of classes, and indexing column i as "the score for
# class i" (as the ROC plot does) would be wrong. predict_proba returns one
# column per class, which is what the per-class ROC curves need.
y_score = model.predict_proba(X_test_tfv)

In [None]:
# Function to plot one ROC curve per class for multiclass classification
def plot_multiclass_roc_curve(y_test_binarized, y_score, class_names):
    """Draw a ROC curve (with its AUC in the legend) for each class on one figure.

    Args:
        y_test_binarized: 2-D array; column i is the 0/1 indicator for class i.
        y_score: 2-D array; column i is used as the score for class i.
        class_names: Names used in the legend, in column order.
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    # One curve per class, built from the matching indicator and score columns
    for col, name in enumerate(class_names):
        false_pos_rate, true_pos_rate, _ = roc_curve(y_test_binarized[:, col], y_score[:, col])
        class_auc = auc(false_pos_rate, true_pos_rate)
        ax.plot(false_pos_rate, true_pos_rate, label=f'ROC curve for {name} (area = {class_auc:.2f})')

    # Chance-level reference diagonal
    ax.plot([0, 1], [0, 1], color='navy', linestyle='--')

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Multiclass ROC Curve')
    ax.legend(loc='lower right')
    ax.grid()
    plt.show()

# Get decoded class names 
class_names = label_enc.inverse_transform(np.unique(y_test)) 

# Plot the ROC curve for multiclass classification 
plot_multiclass_roc_curve(y_test_binarized, y_score, class_names)

# SAMPLE PREDICTIONS

In [None]:
# Randomly select up to 30 samples from the training data (using X_train).
# A seeded generator keeps the selection identical across Restart & Run All;
# min() keeps replace=False valid if the training set has fewer than 30 rows.
rng = np.random.default_rng(42)
sample_indices = rng.choice(len(X_train), size=min(30, len(X_train)), replace=False)
sample_texts = X_train.iloc[sample_indices].tolist()
actual_labels = y_train.iloc[sample_indices].tolist()

In [None]:
# Clean the sample texts using the same cleaning functions applied earlier
cleaned_sample_texts = [clean_sentence(text) for text in sample_texts]
cleaned_sample_texts = [remove_emojis(text) for text in cleaned_sample_texts]
cleaned_sample_texts = [remove_stopwords(text) for text in cleaned_sample_texts]
cleaned_sample_texts = [remove_mentions(text) for text in cleaned_sample_texts]

In [None]:
# Transform the cleaned texts using the TF-IDF vectorizer
sample_tfv = tfv.transform(cleaned_sample_texts)

In [None]:
# Make predictions on the sample texts
predictions = model.predict(sample_tfv)

In [None]:
# Decode the predictions back to original labels
decoded_predictions = label_enc.inverse_transform(predictions)

In [None]:
# Display the predictions in the desired format
for i in range(len(sample_texts)):
    print(f"TEXT: {sample_texts[i]} \nACTUAL: {label_enc.inverse_transform([actual_labels[i]])[0]} \nPREDICTED: {decoded_predictions[i]}\n")

In [None]:
# Initialize a list to store the prediction history
prediction_history = []

In [None]:
# Function to display the menu
def display_menu():
    """Print the interactive menu: a blank line, the heading, then the three options."""
    menu_lines = (
        "\nMenu:",
        "1. Enter a sentence for prediction",
        "2. View prediction history",
        "0. Quit",
    )
    print("\n".join(menu_lines))

In [None]:
# Function to get user input for predictions
def get_user_input():
    """Prompt on stdin for a sentence to classify and return it exactly as typed."""
    prompt = "\nEnter a sentence for prediction: "
    sentence = input(prompt)
    return sentence

In [None]:
# Main program loop
while True:
    display_menu()
    choice = input("\nPlease select an option (0, 1, or 2): ").strip()

    if choice == '1':
        # Get user input for prediction
        user_input = get_user_input()

        # Clean the user-input text with the same steps, in the same order, as the training data
        cleaned_user_sentence = user_input
        for cleaning_step in (clean_sentence, remove_emojis, remove_stopwords, remove_mentions):
            cleaned_user_sentence = cleaning_step(cleaned_user_sentence)

        # If cleaning leaves nothing (blank input, or only stopwords / punctuation /
        # numbers), the TF-IDF vector would be all zeros and the model would still
        # emit a label that says nothing about the text. Skip it instead of
        # recording a meaningless prediction.
        if not cleaned_user_sentence.strip():
            print("\nNo classifiable text left after cleaning; please enter a different sentence")
            continue

        # Transform the cleaned text using the TF-IDF vectorizer
        user_tfv = tfv.transform([cleaned_user_sentence])

        # Make prediction on the user-input text
        prediction = model.predict(user_tfv)

        # Decode the prediction back to the original label
        decoded_prediction = label_enc.inverse_transform(prediction)[0]

        # Store the input and prediction in the history
        prediction_history.append({'Text': user_input, 'Predicted Label': decoded_prediction})

        # Display the prediction
        print(f"\nPrediction: The entered text is classified as '{decoded_prediction}'")

    elif choice == '2':
        # Display the prediction history
        if prediction_history:
            predicted_df = pd.DataFrame(prediction_history)
            print("\nPrediction History:")
            print(predicted_df)
        else:
            print("\nNo predictions made yet")

    elif choice == '0':
        print("\nGoodbye!")
        break

    else:
        print("\nInvalid choice")