In [None]:
import numpy as np
import pandas as pd
import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Bidirectional, Dense, Dropout
from sklearn.preprocessing import LabelEncoder

# Download NLTK resources.
# 'punkt_tab' is the tokenizer data word_tokenize looks up on recent NLTK
# releases; 'punkt' is kept so older releases keep working.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Load dataset (assumed CSV format with 'comment' and 'quality' columns)
df = pd.read_csv("/content/drive/MyDrive/feedback_data.csv")  # Replace with actual file path

# Fail here, with a readable message, rather than with a KeyError further down.
missing_columns = {'comment', 'quality'} - set(df.columns)
assert not missing_columns, f"feedback CSV is missing columns: {sorted(missing_columns)}"

# Data Cleaning & Preprocessing
# Built once at definition time. Looking the stopword list up per token and
# constructing a lemmatizer per comment dominated the runtime of .apply().
_STOP_WORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalise one raw comment into a space-joined string of lemmas.

    Steps: lowercase, drop every character that is not an ASCII letter or
    whitespace, tokenize, drop English stopwords, lemmatize.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            the float NaN pandas uses for a missing cell) is treated as an
            empty comment.

    Returns:
        The cleaned comment as a single string; ``''`` when nothing survives
        the filtering or the input was not a string.
    """
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # One pass is enough: digits are outside [a-zA-Z\s], so this also strips
    # them and a separate \d+ substitution is not needed.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    tokens = word_tokenize(text)
    return ' '.join(
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _STOP_WORDS
    )

# Apply preprocessing.
# Empty CSV cells are read as NaN (a float), which has no .lower(); turn
# missing values into '' and force str so every row can be cleaned.
df['cleaned_comments'] = df['comment'].fillna('').astype(str).apply(preprocess_text)


In [None]:
# Splitting Data
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_comments'], df['quality'], test_size=0.2, random_state=42
)

# TF-IDF Vectorization: the vocabulary and IDF weights are fitted on the
# training split only, then reused to transform the test split.
tfidf = TfidfVectorizer(max_features=5000)
# Keep the matrices sparse. SVC, DecisionTreeClassifier and
# RandomForestClassifier all accept sparse input, while .toarray() would
# allocate a dense (n_samples x up-to-5000) float array for no benefit.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Initialize Models
models = {
    # probability=True is required for predict_proba on the SVM.
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42)
}


In [None]:
# Train and Evaluate Models
for model_name, model in models.items():
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_test_tfidf)
    print(f"{model_name} Classification Report:\n", classification_report(y_test, y_pred))

    # Pin the row/column order to the classes the model was trained on so the
    # tick labels below are guaranteed to line up with the matrix cells.
    class_labels = model.classes_
    cm = confusion_matrix(y_test, y_pred, labels=class_labels)

    fig, ax = plt.subplots()
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title(f'{model_name} Confusion Matrix')
    plt.show()

In [None]:
 # AUC-ROC Curve
# Micro-averaged ROC for the SVM only (example for one model).
# The classifier is fetched by key: this cell runs after the training loop has
# finished, so the loop variables no longer refer to the SVM.
# NOTE: run this before the deep-learning cell, which rebinds y_test.
svm_model = models["SVM"]
y_probs = svm_model.predict_proba(X_test_tfidf)

# One-hot encode y_test with columns in svm_model.classes_ order, i.e. the
# same column order predict_proba uses, even if a class is absent from y_test.
y_true_onehot = (np.asarray(y_test)[:, None] == svm_model.classes_[None, :]).astype(int)

# Flattening both matrices treats every (sample, class) pair as one binary
# decision, which yields the micro-averaged curve.
fpr, tpr, _ = roc_curve(y_true_onehot.ravel(), y_probs.ravel())
roc_auc = auc(fpr, tpr)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f'SVM (AUC = {roc_auc:.2f})')
ax.set_xlabel('False positive rate')
ax.set_ylabel('True positive rate')
ax.legend()
ax.set_title('AUC-ROC Curve')
plt.show()


In [None]:
# Deep learning models (GRU / Bi-GRU) on placeholder FastText-style embeddings

# Tokenization: map every cleaned comment to a list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['cleaned_comments'])
sequences = tokenizer.texts_to_sequences(df['cleaned_comments'])
word_index = tokenizer.word_index

# Padding: every sequence becomes exactly max_length ids, zero-filled at the end.
max_length = 100
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Encode Labels as consecutive integers (needed by sparse_categorical_crossentropy).
le = LabelEncoder()
y = le.fit_transform(df['quality'])

# Train-test split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the TF-IDF
# cells also use; re-run the TF-IDF split before re-running those cells.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# FastText Embedding
dim = 100
# Placeholder for actual FastText embedding: one random vector per word.
# A seeded generator is used because these vectors are frozen inside the
# models; unseeded, every kernel restart would train on different features.
embedding_rng = np.random.default_rng(42)
fasttext_model = {word: embedding_rng.standard_normal(dim) for word in word_index}

# Row 0 stays all-zero: word ids start at 1 and 0 is the padding id.
embedding_matrix = np.zeros((len(word_index) + 1, dim))
for word, i in word_index.items():
    vector = fasttext_model.get(word)
    if vector is not None:  # words missing from the embedding keep a zero row
        embedding_matrix[i] = vector

# Shared builder for both recurrent classifiers
def _build_recurrent_classifier(recurrent_layer, num_classes=None):
    """Assemble and compile Embedding -> recurrent layer -> Dropout -> softmax.

    Args:
        recurrent_layer: The Keras layer placed after the embedding; it must
            return one vector per sequence.
        num_classes: Width of the softmax output. ``None`` uses the number of
            classes the label encoder ``le`` was fitted on.

    Returns:
        A compiled ``Sequential`` model.
    """
    if num_classes is None:
        num_classes = len(le.classes_)

    model = Sequential()
    model.add(Embedding(
        input_dim=len(word_index) + 1,
        output_dim=dim,
        # A Constant initializer loads the pre-built matrix and, unlike the
        # `weights=` constructor argument, is accepted by both Keras 2 and 3.
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        input_length=max_length,
        trainable=False,
    ))
    # NOTE(review): padding id 0 is not masked, so the recurrent layer also
    # steps over the trailing zeros; consider mask_zero=True on the Embedding.
    model.add(recurrent_layer)
    model.add(Dropout(0.5))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


# Define GRU Model
def create_gru_model(num_classes=None):
    """Build the unidirectional GRU classifier.

    Args:
        num_classes: Number of output classes; defaults to the number of
            distinct labels seen by ``le`` instead of a hard-coded 5.

    Returns:
        A compiled ``Sequential`` model.
    """
    return _build_recurrent_classifier(GRU(128, return_sequences=False), num_classes)


# Define Bi-GRU Model
def create_bi_gru_model(num_classes=None):
    """Build the bidirectional GRU classifier.

    Args:
        num_classes: Number of output classes; defaults to the number of
            distinct labels seen by ``le`` instead of a hard-coded 5.

    Returns:
        A compiled ``Sequential`` model.
    """
    return _build_recurrent_classifier(
        Bidirectional(GRU(128, return_sequences=False)), num_classes
    )

# Both networks are trained with the same schedule and are validated on the
# held-out split after every epoch.
fit_options = dict(validation_data=(X_test, y_test), epochs=100, batch_size=32)

# Train GRU Model
gru_model = create_gru_model()
history_gru = gru_model.fit(X_train, y_train, **fit_options)

# Train Bi-GRU Model
bi_gru_model = create_bi_gru_model()
history_bi_gru = bi_gru_model.fit(X_train, y_train, **fit_options)

In [None]:
# Plot Training & Validation Accuracy, then Training & Validation Loss.
# Each entry is one figure: its title mapped to the curves drawn on it, given
# as (training history, key in history.history, legend label).
curve_specs = {
    'Accuracy Over Epochs': [
        (history_gru, 'accuracy', 'GRU Training Accuracy'),
        (history_gru, 'val_accuracy', 'GRU Validation Accuracy'),
        (history_bi_gru, 'accuracy', 'Bi-GRU Training Accuracy'),
        (history_bi_gru, 'val_accuracy', 'Bi-GRU Validation Accuracy'),
    ],
    'Loss Over Epochs': [
        (history_gru, 'loss', 'GRU Training Loss'),
        (history_gru, 'val_loss', 'GRU Validation Loss'),
        (history_bi_gru, 'loss', 'Bi-GRU Training Loss'),
        (history_bi_gru, 'val_loss', 'Bi-GRU Validation Loss'),
    ],
}

for figure_title, curves in curve_specs.items():
    for run, metric_key, curve_label in curves:
        plt.plot(run.history[metric_key], label=curve_label)
    plt.legend()
    plt.title(figure_title)
    plt.show()