In [None]:
from google.colab import drive
# Attach Google Drive so the dataset and the saved artifacts under MyDrive are reachable.
drive.mount("/content/drive")

In [None]:
! pip install emoji

In [None]:
import pandas as pd
import numpy as np
import re
import emoji
import spacy
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Only lemmas and stop-word flags are read from the pipeline in this notebook,
# so the dependency parser and the named-entity recogniser are switched off to
# make text cleaning faster.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])


In [None]:
def clean_text_spacy(text):
    """Normalise one raw message into a space-separated string of lemmas.

    Emojis are converted to their textual names, apostrophes are removed,
    every other non-letter character becomes a space, the text is lower-cased,
    and the module-level spaCy pipeline ``nlp`` is used to drop stop words and
    lemmatise what is left.

    Args:
        text: Raw message. Any value is accepted and stringified; missing
            values (``None`` / ``NaN``) are treated as an empty message.

    Returns:
        str: The lemmas joined by single spaces, or ``""`` when nothing usable
        remains.
    """
    if pd.isna(text):
        return ""

    text = emoji.demojize(str(text))
    # Drop apostrophes outright so contractions stay one word ("don't" -> "dont").
    text = re.sub(r"['\u2019]", '', text)
    # Replace (rather than delete) every other non-letter with a space so that
    # words separated only by punctuation, digits or emoji markers are not
    # fused together, e.g. "free!!!call" -> "free call" instead of "freecall".
    text = re.sub(r'[^a-zA-Z\s]', ' ', text)
    # Collapse the whitespace runs produced above before tokenising.
    text = re.sub(r'\s+', ' ', text).lower().strip()
    if not text:
        return ""

    doc = nlp(text)
    tokens = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_space
    ]
    return ' '.join(tokens)


In [None]:
file_path = "/content/drive/MyDrive/Python/shree/Text_emoji_label/data.new.xlsx"

# The reader is chosen from the file extension; compare it case-insensitively
# so that e.g. "DATA.XLSX" is accepted as well.
normalized_path = file_path.lower()

try:
    if normalized_path.endswith((".xlsx", ".xls")):
        df = pd.read_excel(file_path)
    elif normalized_path.endswith(".csv"):
        # Malformed CSV rows are skipped instead of aborting the load.
        df = pd.read_csv(file_path, delimiter=",", on_bad_lines="skip", encoding="utf-8")
    else:
        raise ValueError("Unsupported file format! Please provide a CSV or Excel file.")
except Exception as e:
    # Surface every load failure as ValueError, chained explicitly to the
    # underlying exception so its traceback is kept as the direct cause.
    raise ValueError(f"Error loading dataset: {str(e)}") from e


In [None]:
if "text" not in df.columns or "label" not in df.columns:
    raise ValueError("Dataset must contain 'text' and 'label' columns!")

# Rows without a label cannot be used for supervised training.
df = df.dropna(subset=["label"]).copy()

label_mapping = {"ham": 0, "spam": 1}
# Strip surrounding whitespace as well as lower-casing, so values such as
# " Spam " are recognised instead of being silently discarded.
normalized_labels = df["label"].astype(str).str.strip().str.lower()
df["label"] = normalized_labels.map(label_mapping)

# Anything that is neither "ham" nor "spam" maps to NaN. Report how many rows
# are removed for that reason rather than losing them without notice.
unmapped_count = int(df["label"].isna().sum())
if unmapped_count:
    print(f"Dropping {unmapped_count} row(s) with unrecognised labels.")

df = df.dropna(subset=["label"]).copy()
df["label"] = df["label"].astype(int)


In [None]:
# Spreadsheet/CSV cells may hold non-string values (e.g. numbers). Cast the raw
# text to str so the string operations applied to this column further down the
# notebook (len(), .count()) do not fail on them.
df["text"] = df["text"].fillna("").astype(str)
df["clean_text"] = df["text"].apply(clean_text_spacy)

# Keep only rows that still have content after cleaning. The explicit copy makes
# df an independent frame, so adding columns to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[df["clean_text"].str.strip() != ""].copy()

if df.empty:
    raise ValueError("No valid samples found after text cleaning. Please check your dataset.")


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Bar chart of how many messages fall into each class (0 = ham, 1 = spam).
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x=df["label"], palette=["blue", "red"], ax=ax)
ax.set_xticks([0, 1])
ax.set_xticklabels(["Ham (Not Spam)", "Spam"])
ax.set_xlabel("Message Type")
ax.set_ylabel("Count")
ax.set_title("Distribution of Spam vs. Ham Messages")
plt.show()


In [None]:
from wordcloud import WordCloud

# Join every cleaned spam message into one corpus and render it as a word cloud.
is_spam = df["label"] == 1
spam_words = " ".join(df.loc[is_spam, "clean_text"])
spam_wordcloud = WordCloud(
    width=600,
    height=400,
    background_color="grey",
).generate(spam_words)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(spam_wordcloud, interpolation="bilinear")
ax.set_title("Spam Word Cloud")
ax.axis("off")  # a word cloud has no meaningful axes
plt.show()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join every cleaned ham message into one corpus and render it as a word cloud.
is_ham = df["label"] == 0
ham_words = " ".join(df.loc[is_ham, "clean_text"])
ham_wordcloud = WordCloud(
    width=600,
    height=400,
    background_color="white",
).generate(ham_words)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(ham_wordcloud, interpolation="bilinear")
ax.set_title("Ham (Not Spam) Word Cloud")
ax.axis("off")  # a word cloud has no meaningful axes
plt.show()


In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def get_top_bigrams(texts, top_n=15):
    """Return the ``top_n`` most frequent bigrams found in ``texts``.

    Args:
        texts: Iterable of strings, one document per element.
        top_n: Maximum number of bigrams to return.

    Returns:
        pandas.DataFrame with the columns ``"Bigram"`` and ``"Count"``,
        ordered from the most to the least frequent bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english")
    X = vectorizer.fit_transform(texts)
    # Sum the sparse document-term matrix directly. Converting it to a dense
    # array first (X.toarray()) allocates documents x vocabulary cells, which
    # can exhaust memory on a large corpus.
    bigram_counts = np.asarray(X.sum(axis=0)).ravel()
    bigram_names = vectorizer.get_feature_names_out()

    return pd.DataFrame(sorted(zip(bigram_names, bigram_counts), key=lambda x: x[1], reverse=True)[:top_n],
                        columns=["Bigram", "Count"])

# Bigram frequency tables for each class.
spam_bigrams = get_top_bigrams(df.loc[df["label"] == 1, "clean_text"])
ham_bigrams = get_top_bigrams(df.loc[df["label"] == 0, "clean_text"])

# Horizontal bar chart of the most common spam bigrams.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=spam_bigrams["Count"],
    y=spam_bigrams["Bigram"],
    palette="Reds_r",
    ax=ax,
)
ax.set_title("Top 15 Bigrams in Spam Messages")
ax.set_xlabel("Count")

fig.tight_layout()
plt.show()


In [None]:
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def get_top_bigrams(texts, top_n=15):
    """Return the ``top_n`` most frequent bigrams found in ``texts``.

    Args:
        texts: Iterable of strings, one document per element.
        top_n: Maximum number of bigrams to return.

    Returns:
        pandas.DataFrame with the columns ``"Bigram"`` and ``"Count"``,
        ordered from the most to the least frequent bigram.
    """
    vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english")
    X = vectorizer.fit_transform(texts)
    # Sum the sparse document-term matrix directly. Converting it to a dense
    # array first (X.toarray()) allocates documents x vocabulary cells, which
    # can exhaust memory on a large corpus.
    bigram_counts = np.asarray(X.sum(axis=0)).ravel()
    bigram_names = vectorizer.get_feature_names_out()

    return pd.DataFrame(sorted(zip(bigram_names, bigram_counts), key=lambda x: x[1], reverse=True)[:top_n],
                        columns=["Bigram", "Count"])

# Bigram frequency tables for each class.
spam_bigrams = get_top_bigrams(df.loc[df["label"] == 1, "clean_text"])
ham_bigrams = get_top_bigrams(df.loc[df["label"] == 0, "clean_text"])

# Horizontal bar chart of the most common ham bigrams.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=ham_bigrams["Count"],
    y=ham_bigrams["Bigram"],
    palette="Blues_r",
    ax=ax,
)
ax.set_title("Top 15 Bigrams in Ham Messages")
ax.set_xlabel("Count")

fig.tight_layout()
plt.show()


In [None]:
# Character length of each raw (uncleaned) message.
df["text_length"] = df["text"].map(len)

# Histogram (with KDE overlay) of the spam message lengths.
spam_lengths = df.loc[df["label"] == 1, "text_length"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(spam_lengths, color="red", label="Spam", kde=True, bins=30, ax=ax)
ax.set_xlabel("Message Length")
ax.set_ylabel("Count")
ax.set_title("Message Length Distribution (Spam vs. Ham)")
ax.legend()
plt.show()


In [None]:
# Character length of each raw (uncleaned) message.
df["text_length"] = df["text"].map(len)

# Histogram (with KDE overlay) of the ham message lengths.
ham_lengths = df.loc[df["label"] == 0, "text_length"]
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(ham_lengths, color="blue", label="Ham", kde=True, bins=30, ax=ax)
ax.set_xlabel("Message Length")
ax.set_ylabel("Count")
ax.set_title("Message Length Distribution (Spam vs. Ham)")
ax.legend()
plt.show()


In [None]:
import string

# Count how often each listed character occurs in the raw message text.
special_chars = ["%"]
for char in special_chars:
    df[f"count_{char}"] = df["text"].apply(lambda x: x.count(char))

# Create the axes once and hand them to pandas. DataFrame.plot() without ax=
# opens its own figure, so a separate plt.figure() call beforehand would only
# leave an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(10, 6))
df.groupby("label")[["count_%"]].mean().T.plot(kind="bar", ax=ax, colormap="coolwarm")
ax.set_title("Average Special Character Count in Spam vs. Ham")
ax.set_ylabel("Average Count")
plt.xticks(rotation=0)
plt.show()

In [None]:
import string

# Count how often each listed character occurs in the raw message text.
special_chars = ["$"]
for char in special_chars:
    df[f"count_{char}"] = df["text"].apply(lambda x: x.count(char))

# Create the axes once and hand them to pandas. DataFrame.plot() without ax=
# opens its own figure, so a separate plt.figure() call beforehand would only
# leave an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(10, 6))
df.groupby("label")[["count_$"]].mean().T.plot(kind="bar", ax=ax, colormap="coolwarm")
ax.set_title("Average Special Character Count in Spam vs. Ham")
ax.set_ylabel("Average Count")
plt.xticks(rotation=0)
plt.show()


In [None]:
import string

# Count how often each listed character occurs in the raw message text.
special_chars = ["!"]
for char in special_chars:
    df[f"count_{char}"] = df["text"].apply(lambda x: x.count(char))

# Create the axes once and hand them to pandas. DataFrame.plot() without ax=
# opens its own figure, so a separate plt.figure() call beforehand would only
# leave an empty figure in the cell output.
fig, ax = plt.subplots(figsize=(10, 6))
df.groupby("label")[["count_!"]].mean().T.plot(kind="bar", ax=ax, colormap="coolwarm")
ax.set_title("Average Special Character Count in Spam vs. Ham")
ax.set_ylabel("Average Count")
plt.xticks(rotation=0)
plt.show()

In [None]:
from textblob import TextBlob

# TextBlob polarity score of every cleaned message.
df["sentiment"] = df["clean_text"].map(lambda message: TextBlob(message).sentiment.polarity)

# Histogram (with KDE overlay) of the spam sentiment scores.
spam_sentiment = df.loc[df["label"] == 1, "sentiment"]
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(spam_sentiment, color="red", label="Spam", kde=True, bins=30, ax=ax)
ax.set_xlabel("Sentiment Score")
ax.set_ylabel("Count")
ax.set_title("Sentiment Analysis of Spam vs. Ham")
ax.legend()
plt.show()


In [None]:
from textblob import TextBlob

# TextBlob polarity score of every cleaned message.
df["sentiment"] = df["clean_text"].map(lambda message: TextBlob(message).sentiment.polarity)

# Histogram (with KDE overlay) of the ham sentiment scores.
ham_sentiment = df.loc[df["label"] == 0, "sentiment"]
fig, ax = plt.subplots(figsize=(8,6))
sns.histplot(ham_sentiment, color="blue", label="Ham", kde=True, bins=30, ax=ax)
ax.set_xlabel("Sentiment Score")
ax.set_ylabel("Count")
ax.set_title("Sentiment Analysis of Spam vs. Ham")
ax.legend()
plt.show()


In [None]:
# Split Data
# Stratify on the label so the train and test sets keep the same ham/spam
# proportions as the full dataset; a purely random split can leave the
# minority class under-represented in the 20% test set.
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"],
    df["label"],
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)


In [None]:
# TF-IDF features over unigrams and bigrams, capped at 5000 terms. The
# vocabulary is learned from the training split only and then reused to
# encode the test split.
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [None]:
# Random Forest: fit on the TF-IDF training matrix and predict the test split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=200)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
rf_report = classification_report(y_test, y_pred_rf)
print("\nClassification Report:\n", rf_report)


In [None]:
# Multinomial Naive Bayes: fit on the TF-IDF training matrix and predict the test split.
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)
y_pred_nb = nb_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print("Naïve Bayes Accuracy:", nb_accuracy)
nb_report = classification_report(y_test, y_pred_nb)
print("\nClassification Report:\n", nb_report)


In [None]:
# Logistic Regression (up to 500 solver iterations): fit and predict the test split.
lr_model = LogisticRegression(max_iter=500)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
lr_report = classification_report(y_test, y_pred_lr)
print("\nClassification Report:\n", lr_report)


In [None]:
# k-nearest neighbours with k = 50: fit and predict the test split.
knn_model = KNeighborsClassifier(n_neighbors=50)
knn_model.fit(X_train_tfidf, y_train)
y_pred_knn = knn_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
knn_accuracy = accuracy_score(y_test, y_pred_knn)
print("KNN Accuracy:", knn_accuracy)
knn_report = classification_report(y_test, y_pred_knn)
print("\nClassification Report:\n", knn_report)


In [None]:
# XGBoost (log-loss evaluation metric): fit and predict the test split.
xgb_model = XGBClassifier(eval_metric='logloss', use_label_encoder=False)
xgb_model.fit(X_train_tfidf, y_train)
y_pred_xgb = xgb_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
xgb_accuracy = accuracy_score(y_test, y_pred_xgb)
print("XGBoost Accuracy:", xgb_accuracy)
xgb_report = classification_report(y_test, y_pred_xgb)
print("\nClassification Report:\n", xgb_report)


In [None]:
from sklearn.svm import SVC

# Linear-kernel SVM. probability=True enables predict_proba, which the
# soft-voting ensemble later in the notebook calls on its member models.
svm_model = SVC(random_state=42, probability=True, kernel='linear')
svm_model.fit(X_train_tfidf, y_train)
y_pred_svm = svm_model.predict(X_test_tfidf)

# Report test-set accuracy followed by the per-class report.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
svm_report = classification_report(y_test, y_pred_svm)
print("\nClassification Report:\n", svm_report)


In [None]:
# Test-set predictions of every fitted classifier, keyed by display name.
test_predictions = {
    "Random Forest": y_pred_rf,
    "Naïve Bayes": y_pred_nb,
    "Logistic Regression": y_pred_lr,
    "KNN": y_pred_knn,
    "XGBoost": y_pred_xgb,
    "SVM": y_pred_svm,
}

# Accuracy per model, in the same order as above.
results = {
    name: accuracy_score(y_test, predicted)
    for name, predicted in test_predictions.items()
}

# Side-by-side accuracy listing, four decimals each.
for model, acc in results.items():
    print(f"{model}: {acc:.4f}")


In [None]:
# Pick the (name, accuracy) pair with the highest accuracy; on ties the first
# entry in `results` wins.
best_model_name, best_accuracy = max(results.items(), key=lambda item: item[1])

# Announce the winner.
print(f"\n Best Model: {best_model_name}")
print(f" Accuracy: {best_accuracy:.4f}")


In [None]:
from sklearn.ensemble import VotingClassifier
import numpy as np

# Rank all models by test accuracy (best first) and keep the two leaders.
ranked_models = sorted(results.items(), key=lambda x: x[1], reverse=True)
top_2_models = ranked_models[:2]

# Names of the two leaders.
top_model_1_name, top_model_2_name = [entry[0] for entry in top_2_models]
print(f" Top 2 Models for Ensemble: {top_model_1_name} & {top_model_2_name}")

# Display name -> fitted estimator.
model_mapping = {
    "Random Forest": rf_model,
    "Naïve Bayes": nb_model,
    "Logistic Regression": lr_model,
    "KNN": knn_model,
    "XGBoost": xgb_model,
    "SVM": svm_model,
}

# Fitted estimators behind the two leading names.
model_1 = model_mapping[top_model_1_name]
model_2 = model_mapping[top_model_2_name]


# Soft-voting ensemble over two fitted classifiers
class EnsembleModel:
    """Average the class-1 probabilities of two fitted binary classifiers.

    Both wrapped models must expose ``predict_proba(X)`` returning a 2-D array
    whose column 1 holds the probability of class 1.
    """

    def __init__(self, model_1, model_2):
        """Store the two member models."""
        self.model_1, self.model_2 = model_1, model_2

    def predict_proba(self, X):
        """Return a two-column array ``[P(class 0), P(class 1)]`` per sample.

        ``P(class 1)`` is the mean of the two members' class-1 probabilities
        and ``P(class 0)`` is its complement.
        """
        members = (self.model_1, self.model_2)
        first_positive, second_positive = (m.predict_proba(X)[:, 1] for m in members)

        mean_positive = (first_positive + second_positive) / 2
        return np.column_stack((1 - mean_positive, mean_positive))

    def predict(self, X):
        """Return 1 where the averaged class-1 probability exceeds 0.5, else 0."""
        positive = self.predict_proba(X)[:, 1]
        return (positive > 0.5).astype(int)

# Create the ensemble model from the two best-scoring models selected above
# (model_1 / model_2) rather than a fixed pair, so the ensemble always matches
# the accuracy ranking computed from `results`.
final_model = EnsembleModel(model_1, model_2)

print(f" Ensemble Model ({top_model_1_name} + {top_model_2_name}) is ready!")

In [None]:
from sklearn.metrics import confusion_matrix

# Get predictions from ensemble model
y_pred_ensemble = final_model.predict(X_test_tfidf)

# Create confusion matrix (rows = actual class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred_ensemble)

# Name the ensemble members from the estimators actually stored in
# final_model, so the title cannot drift from the models being evaluated.
ensemble_members = " + ".join(
    type(member).__name__
    for member in (final_model.model_1, final_model.model_2)
)

# Plot heatmap
plt.figure(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title(f"Confusion Matrix - Ensemble Model ({ensemble_members})")
plt.show()


In [None]:
import pickle

model_path = "/content/drive/MyDrive/Python/shree/Text_emoji_label/ensemble_model.pkl"

# Serialise the fitted ensemble to Drive.
# NOTE: pickle stores classes by reference, so the EnsembleModel class must be
# defined/importable in whatever process later loads this file.
with open(model_path, mode="wb") as model_file:
    pickle.dump(final_model, model_file)

print(f" Model saved successfully at: {model_path}")


In [None]:
import pickle

# Persist the fitted TF-IDF vectorizer next to the model so new text can be
# encoded with the same vocabulary at prediction time.
vectorizer_path = "/content/drive/MyDrive/Python/shree/Text_emoji_label/tfidf_vectorizer.pkl"
with open(vectorizer_path, mode="wb") as vec_file:
    pickle.dump(vectorizer, vec_file)

print(f" TF-IDF Vectorizer saved successfully at: {vectorizer_path}")


In [None]:
# Function to predict user input
def predict_user_input(final_model, vectorizer):
    """Interactively classify messages typed by the user as Spam or Ham.

    Prompts in a loop until the user types ``exit`` (case-insensitive,
    surrounding whitespace ignored) or input ends. Each message is cleaned
    with ``clean_text_spacy``, encoded with ``vectorizer`` and classified with
    ``final_model``; the result is printed.

    Args:
        final_model: Fitted classifier exposing ``predict``; a prediction of
            1 is reported as Spam, anything else as Ham.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        None.
    """
    while True:
        try:
            user_text = input("\nEnter a message to classify (or type 'exit' to quit): ")
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the user interrupted: leave the loop
            # cleanly instead of surfacing a traceback.
            print("Goodbye! ")
            break

        # Ignore stray whitespace around the quit keyword.
        if user_text.strip().lower() == "exit":
            print("Goodbye! ")
            break

        # Clean the input text
        cleaned_text = clean_text_spacy(user_text)

        # Nothing left after cleaning (e.g. only punctuation or stop words):
        # the feature vector would be all zeros and the prediction arbitrary.
        if not cleaned_text.strip():
            print("No usable words found in the message. Please try again.")
            continue

        # Convert to TF-IDF
        text_tfidf = vectorizer.transform([cleaned_text])

        # Predict
        prediction = final_model.predict(text_tfidf)[0]

        # Convert numeric prediction to label
        label = "Spam" if prediction == 1 else "Ham"

        print(f"Prediction: {label} ")

# Start the interactive prompt with the fitted ensemble and its TF-IDF vectorizer.
predict_user_input(final_model=final_model, vectorizer=vectorizer)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Test-set predictions of each model, in the order the models should appear
# in the comparison charts.
model_predictions = {
    "Random Forest": y_pred_rf,
    "Naïve Bayes": y_pred_nb,
    "Logistic Regression": y_pred_lr,
    "KNN": y_pred_knn,
    "XGBoost": y_pred_xgb,
    "SVM": y_pred_svm,
}

# Store the model performance metrics
metrics = {}
for model_name, predicted in model_predictions.items():
    # Build the classification report once per model and read all three
    # class-1 (spam) scores from it, instead of recomputing it per score.
    spam_scores = classification_report(y_test, predicted, output_dict=True)['1']
    metrics[model_name] = {
        "Accuracy": accuracy_score(y_test, predicted),
        "Precision": spam_scores['precision'],
        "Recall": spam_scores['recall'],
        "F1-Score": spam_scores['f1-score'],
    }

# Convert to DataFrame for easier plotting (one row per model, one column per metric)
df_metrics = pd.DataFrame(metrics).T



In [None]:
# Bar chart of test accuracy per model, with each value printed above its bar.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=df_metrics.index, y=df_metrics["Accuracy"], palette="Blues_d", ax=ax)
ax.set_title("Model Accuracy Comparison")
ax.set_xlabel("Model")
ax.set_ylabel("Accuracy")
plt.xticks(rotation=45)

# Label every bar with its height (the accuracy), 4 decimals, centred
# horizontally and nudged 5 points above the top edge.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height:.4f}',
        (bar_centre, bar_height),
        ha='center',
        va='center',
        fontsize=12,
        color='black',
        xytext=(0, 5),
        textcoords='offset points',
    )

plt.show()


In [None]:
# Grouped bar chart of the class-1 precision, recall and F1-score per model.
score_columns = ["Precision", "Recall", "F1-Score"]
df_metrics[score_columns].plot(
    kind="bar",
    stacked=False,
    figsize=(12, 6),
    colormap="coolwarm",
)
plt.title("Model Performance Comparison")
plt.xlabel("Model")
plt.ylabel("Score")
plt.xticks(rotation=45)
# Place the legend outside the plotting area, to the right of the axes.
plt.legend(title="Metrics", loc='upper left', bbox_to_anchor=(1.05, 1))
plt.show()
