In [1]:
# Import necessary libraries
import pickle
import re
import string

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical



In [2]:
def preprocess_text(text):
    """Normalise a petition description for vectorisation/tokenisation.

    The text is lower-cased, runs of digits are deleted, every character in
    ``string.punctuation`` is deleted (not replaced by a space, so
    ``"well-known"`` becomes ``"wellknown"``), surrounding whitespace is
    trimmed and any remaining whitespace run is collapsed to a single space.

    Args:
        text: Value to clean. Anything that is not a ``str`` (e.g. ``None``
            or a NaN float) is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a string.
    """
    # Guard clause: non-string input yields an empty document.
    if not isinstance(text, str):
        return ""

    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)
    trimmed = without_punctuation.strip()
    return re.sub(r'\s+', ' ', trimmed)

In [3]:
# Load the petition training data; the file is decoded as latin1.
df = pd.read_csv(
    filepath_or_buffer=r"petition_train_data_org.csv",
    encoding='latin1',
)
# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,Petition Department,Petition Description,Status,Unnamed: 3
0,Women & Child Welfare,Address complaints of harassment in hostel acc...,Urgent,109
1,Education Department,No emergency medical facilities in schools for...,Urgent,336
2,Women & Child Welfare,Victims of digital harassment find it difficul...,Urgent,444
3,Women & Child Welfare,Lack of proper childcare facilities,Important,1553
4,Education Department,Lack of functional science laboratories severe...,Urgent,192


In [4]:
# Count the missing values in each column before cleaning.
df.isna().sum(axis=0)

Petition Department     4
Petition Description    4
Status                  4
Unnamed: 3              0
dtype: int64

In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [6]:
# Re-count missing values per column to confirm the drop left none behind.
df.isna().sum(axis=0)

Petition Department     0
Petition Description    0
Status                  0
Unnamed: 3              0
dtype: int64

In [7]:
# Distribution of the "Status" column, which is later encoded as the urgency target.
df.loc[:, "Status"].value_counts()

Status
Urgent       1695
Important     885
Name: count, dtype: int64

In [8]:
# Clean every petition description with preprocess_text and store the result
# in a separate column so the raw text stays available.
df["clean_text"] = df["Petition Description"].map(preprocess_text)



In [9]:
# Encode department names as integer class ids; the fitted encoder is kept so
# predictions can be mapped back to names later.
label_encoder_dept = LabelEncoder()
label_encoder_dept.fit(df["Petition Department"])
df["department_encoded"] = label_encoder_dept.transform(df["Petition Department"])



In [10]:
# Encode the "Status" values as integer urgency class ids.
label_encoder_urgency = LabelEncoder()
label_encoder_urgency.fit(df['Status'])
df["urgency_encoded"] = label_encoder_urgency.transform(df['Status'])



In [11]:
# Model input (cleaned text) and the two integer-encoded targets.
X, y_dept, y_urgency = (
    df["clean_text"],
    df["department_encoded"],
    df["urgency_encoded"],
)



In [12]:
# Split the text and BOTH targets in one call. train_test_split applies the
# same shuffled indices to every array it is given, so the department and
# urgency labels are guaranteed to stay aligned with X_train / X_test. This
# replaces two separate calls that overwrote X_train / X_test and only agreed
# because they happened to share random_state=42.
(
    X_train, X_test,
    y_train_dept, y_test_dept,
    y_train_urgency, y_test_urgency,
) = train_test_split(X, y_dept, y_urgency, test_size=0.2, random_state=42)


In [13]:

# Candidate classical classifiers, keyed by the display name that is written
# into the results table.
models = {
    "Random Forest": RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        max_depth=8,
        max_leaf_nodes=5,  # NOTE(review): caps every tree at five leaves — confirm this is intended
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        n_estimators=100,
        random_state=42,
    ),
    "SVM": SVC(
        kernel='linear',
        random_state=42,
    ),
    "Naive Bayes": MultinomialNB(),
}


In [14]:

# Build TF-IDF features. The vocabulary and IDF weights are learned from the
# training texts only and then reused for the test texts.
# (The best-model trackers are initialised in the training cell, right before
# they are first used, so they are not duplicated here.)
vectorizer = TfidfVectorizer()

X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)



In [15]:
# Rows of [task, model name, accuracy, precision, recall, f1].
results = []

# Best fitted estimator and its test accuracy, tracked per task.
best_model_dept, best_acc_dept = None, 0
best_model_urgency, best_acc_urgency = None, 0

# Train and evaluate ML models.
# Each task is fitted on its own clone of the configured estimator. Fitting
# the shared `model` object twice (department, then urgency) would make the
# object stored in best_model_dept silently become an urgency classifier,
# because the second fit() overwrites the first one in place.
for name, model in models.items():
    # Department Classification
    dept_model = clone(model).fit(X_train_tfidf, y_train_dept)
    y_pred_dept = dept_model.predict(X_test_tfidf)

    acc_dept = accuracy_score(y_test_dept, y_pred_dept)
    precision_dept, recall_dept, f1_dept, _ = precision_recall_fscore_support(
        y_test_dept, y_pred_dept, average='weighted'
    )

    results.append(["Department", name, acc_dept, precision_dept, recall_dept, f1_dept])

    if acc_dept > best_acc_dept:
        best_acc_dept = acc_dept
        best_model_dept = dept_model

    # Urgency Classification
    urgency_model = clone(model).fit(X_train_tfidf, y_train_urgency)
    y_pred_urgency = urgency_model.predict(X_test_tfidf)

    acc_urgency = accuracy_score(y_test_urgency, y_pred_urgency)
    precision_urgency, recall_urgency, f1_urgency, _ = precision_recall_fscore_support(
        y_test_urgency, y_pred_urgency, average='weighted'
    )

    results.append(["Urgency", name, acc_urgency, precision_urgency, recall_urgency, f1_urgency])

    if acc_urgency > best_acc_urgency:
        best_acc_urgency = acc_urgency
        best_model_urgency = urgency_model

In [16]:
# LSTM Model - Deep Learning
max_words = 5000  # vocabulary size passed to the tokenizer and the embedding layers
max_len = 50      # every sequence is padded/truncated to this many tokens

# Fit the word index on the training texts only; unseen words map to "<OOV>".
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=max_len)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=max_len)

# One-hot encode the targets with an explicit class count. Without
# num_classes, to_categorical sizes the matrix from the largest label present,
# so a split that happens to miss the highest class id would produce a
# narrower matrix than the network's softmax output.
num_dept_classes = len(label_encoder_dept.classes_)
num_urgency_classes = len(label_encoder_urgency.classes_)

y_train_dept_cat = to_categorical(y_train_dept, num_classes=num_dept_classes)
y_test_dept_cat = to_categorical(y_test_dept, num_classes=num_dept_classes)
y_train_urgency_cat = to_categorical(y_train_urgency, num_classes=num_urgency_classes)
y_test_urgency_cat = to_categorical(y_test_urgency, num_classes=num_urgency_classes)


In [17]:
# LSTM classifier for the department task: embedding -> dropout -> LSTM ->
# softmax over the department classes.
# `input_length` is no longer passed to Embedding: it is deprecated in Keras 3
# and the layer takes the sequence length from the data it is given.
lstm_dept = Sequential([
    Embedding(max_words, 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(label_encoder_dept.classes_), activation='softmax')
])

lstm_dept.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the val_*
# numbers are computed on the same rows used for the final evaluation.
lstm_dept.fit(X_train_seq, y_train_dept_cat, epochs=5, batch_size=16, validation_data=(X_test_seq, y_test_dept_cat))

Epoch 1/5




[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 26ms/step - accuracy: 0.7650 - loss: 0.5102 - val_accuracy: 0.9438 - val_loss: 0.1554
Epoch 2/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9604 - loss: 0.1114 - val_accuracy: 0.9690 - val_loss: 0.0878
Epoch 3/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9775 - loss: 0.0578 - val_accuracy: 0.9729 - val_loss: 0.0674
Epoch 4/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9859 - loss: 0.0403 - val_accuracy: 0.9787 - val_loss: 0.0633
Epoch 5/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 23ms/step - accuracy: 0.9767 - loss: 0.0573 - val_accuracy: 0.9845 - val_loss: 0.0502


<keras.src.callbacks.history.History at 0x1e39c6b6f90>

In [18]:
# Evaluate LSTM for Department
# evaluate() returns [loss, accuracy] for this model; keep the accuracy.
dept_test_metrics = lstm_dept.evaluate(X_test_seq, y_test_dept_cat, verbose=0)
lstm_acc_dept = dept_test_metrics[1]

# Turn the per-class probabilities into class ids for the weighted P/R/F1.
dept_class_probs = lstm_dept.predict(X_test_seq)
y_pred_lstm_dept = np.argmax(dept_class_probs, axis=1)
precision_lstm_dept, recall_lstm_dept, f1_lstm_dept, _ = precision_recall_fscore_support(
    y_test_dept,
    y_pred_lstm_dept,
    average='weighted',
)

dept_lstm_row = ["Department", "LSTM", lstm_acc_dept, precision_lstm_dept, recall_lstm_dept, f1_lstm_dept]
results.append(dept_lstm_row)

[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step


In [19]:
# LSTM Model for Urgency Classification: same architecture as the department
# model, with a softmax over the urgency classes.
# `input_length` is no longer passed to Embedding: it is deprecated in Keras 3
# and the layer takes the sequence length from the data it is given.
lstm_urgency = Sequential([
    Embedding(max_words, 128),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(len(label_encoder_urgency.classes_), activation='softmax')
])

lstm_urgency.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# NOTE(review): the test split doubles as validation data here, so the val_*
# numbers are computed on the same rows used for the final evaluation.
lstm_urgency.fit(X_train_seq, y_train_urgency_cat, epochs=5, batch_size=16, validation_data=(X_test_seq, y_test_urgency_cat))

Epoch 1/5




[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 28ms/step - accuracy: 0.7062 - loss: 0.5621 - val_accuracy: 0.8992 - val_loss: 0.2575
Epoch 2/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 25ms/step - accuracy: 0.9434 - loss: 0.1708 - val_accuracy: 0.9516 - val_loss: 0.1455
Epoch 3/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.9745 - loss: 0.0712 - val_accuracy: 0.9632 - val_loss: 0.0856
Epoch 4/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.9762 - loss: 0.0578 - val_accuracy: 0.9729 - val_loss: 0.0573
Epoch 5/5
[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 29ms/step - accuracy: 0.9843 - loss: 0.0383 - val_accuracy: 0.9826 - val_loss: 0.0366


<keras.src.callbacks.history.History at 0x1e3a5f6b6e0>

In [20]:
# Evaluate LSTM for Urgency
# evaluate() returns [loss, accuracy] for this model; keep the accuracy.
urgency_test_metrics = lstm_urgency.evaluate(X_test_seq, y_test_urgency_cat, verbose=0)
lstm_acc_urgency = urgency_test_metrics[1]

# Turn the per-class probabilities into class ids for the weighted P/R/F1.
urgency_class_probs = lstm_urgency.predict(X_test_seq)
y_pred_lstm_urgency = np.argmax(urgency_class_probs, axis=1)
precision_lstm_urgency, recall_lstm_urgency, f1_lstm_urgency, _ = precision_recall_fscore_support(
    y_test_urgency,
    y_pred_lstm_urgency,
    average='weighted',
)

urgency_lstm_row = ["Urgency", "LSTM", lstm_acc_urgency, precision_lstm_urgency, recall_lstm_urgency, f1_lstm_urgency]
results.append(urgency_lstm_row)


[1m17/17[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step


In [21]:
# Collect the per-task, per-model metric rows into one comparison table.
result_columns = ["Task", "Model", "Accuracy", "Precision", "Recall", "F1-Score"]
df_results = pd.DataFrame(data=results, columns=result_columns)
print(df_results)

         Task              Model  Accuracy  Precision    Recall  F1-Score
0  Department      Random Forest  0.722868   0.802606  0.722868  0.636678
1     Urgency      Random Forest  0.678295   0.782598  0.678295  0.555958
2  Department  Gradient Boosting  0.974806   0.974919  0.974806  0.974651
3     Urgency  Gradient Boosting  0.885659   0.885836  0.885659  0.882874
4  Department                SVM  0.978682   0.978655  0.978682  0.978664
5     Urgency                SVM  0.963178   0.963324  0.963178  0.962914
6  Department        Naive Bayes  0.941860   0.941799  0.941860  0.941227
7     Urgency        Naive Bayes  0.918605   0.920411  0.918605  0.916709
8  Department               LSTM  0.984496   0.984528  0.984496  0.984443
9     Urgency               LSTM  0.982558   0.982540  0.982558  0.982545


In [22]:
# Select LSTM as the best model for both tasks
best_model_dept, best_model_urgency = lstm_dept, lstm_urgency

# Save models and tokenizer for Flask Deployment.
# Maps each output file to the object pickled into it; files are written in
# this order.
artifacts = {
    "department_model.pkl": best_model_dept,
    "urgency_model.pkl": best_model_urgency,
    "tokenizer.pkl": tokenizer,
    "label_encoder_dept.pkl": label_encoder_dept,
    "label_encoder_urgency.pkl": label_encoder_urgency,
}
for artifact_path, artifact in artifacts.items():
    with open(artifact_path, "wb") as f:
        pickle.dump(artifact, f)

# Save evaluation results to CSV
df_results.to_csv("model_evaluation_results.csv", index=False)

print("LSTM models and required files saved successfully!")


LSTM models and required files saved successfully!


In [23]:
def classify_petition(text):
    """Predict the department and urgency for one petition text.

    The text is cleaned with ``preprocess_text``, converted to a token
    sequence with the fitted ``tokenizer`` and padded to ``max_len`` (the same
    length used to build the training sequences), then scored by
    ``best_model_dept`` and ``best_model_urgency``.

    Args:
        text: Raw petition description.

    Returns:
        A ``(department, urgency)`` tuple of labels decoded with
        ``label_encoder_dept`` and ``label_encoder_urgency``.
    """
    text = preprocess_text(text)

    # Tokenize and pad input text. Reuse max_len rather than a literal 50 so
    # inference cannot drift from the length the models were trained on.
    text_seq = tokenizer.texts_to_sequences([text])
    text_padded = pad_sequences(text_seq, maxlen=max_len)

    # Predict department using LSTM (verbose=0 avoids a progress bar per call).
    dept_pred = np.argmax(best_model_dept.predict(text_padded, verbose=0), axis=1)[0]
    department = label_encoder_dept.inverse_transform([dept_pred])[0]

    # Predict urgency using LSTM
    urgency_pred = np.argmax(best_model_urgency.predict(text_padded, verbose=0), axis=1)[0]
    urgency = label_encoder_urgency.inverse_transform([urgency_pred])[0]

    return department, urgency


In [24]:
# Try the classifier on one short free-text petition and show both labels.
new_petition = "food poision"
prediction = classify_petition(new_petition)
dept, urgency = prediction
print(f"Predicted Department: {dept}, Urgency: {urgency}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Predicted Department: Education Department, Urgency: Urgent
