In [44]:
import pandas as pd

df = pd.read_csv('D:\\SIH\\railmadad_complaints.csv')
df.sample(10)

Unnamed: 0,Category,Text
691,Water Availability,This is a complaint regarding water availabili...
745,Coach - Maintenance,This is a complaint regarding coach - maintena...
672,Water Availability,This is a complaint regarding water availabili...
1170,Bedroll,This is a complaint regarding bedroll. The iss...
1099,Corruption/Bribery,This is a complaint regarding corruption/bribe...
433,Coach - Cleanliness,This is a complaint regarding coach - cleanlin...
779,Coach - Maintenance,This is a complaint regarding coach - maintena...
197,Security,This is a complaint regarding security. The is...
719,Coach - Maintenance,This is a complaint regarding coach - maintena...
405,Coach - Cleanliness,This is a complaint regarding coach - cleanlin...


In [45]:
df.Category.unique()

array(['Medical Assistance', 'Security',
       'Facilities for Women with Special needs', 'Electrical Equipment',
       'Coach - Cleanliness', 'Punctuality', 'Water Availability',
       'Coach - Maintenance', 'Catering & Vending Services',
       'Staff Behaviour', 'Corruption/Bribery', 'Bedroll',
       'Miscellaneous'], dtype=object)

In [46]:
df.describe()

Unnamed: 0,Category,Text
count,1300,1300
unique,13,965
top,Medical Assistance,This is a complaint regarding punctuality. The...
freq,100,4


In [47]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [48]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['Text'], df['Category'], test_size=0.2, random_state=42)

In [49]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

In [50]:
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

XGB

In [51]:
xgb_model = XGBClassifier(eval_metric='mlogloss')
xgb_model.fit(X_train_tfidf, y_train_encoded)

In [52]:
# Predict and evaluate XGBoost
xgb_preds = xgb_model.predict(X_test_tfidf)
xgb_accuracy = accuracy_score(y_test_encoded, xgb_preds)
xgb_report = classification_report(y_test_encoded, xgb_preds, target_names=label_encoder.classes_)

print("XGBoost Accuracy:", xgb_accuracy)
print("XGBoost Classification Report:\n", xgb_report)

XGBoost Accuracy: 0.9884615384615385
XGBoost Classification Report:
                                          precision    recall  f1-score   support

                                Bedroll       0.93      1.00      0.96        13
            Catering & Vending Services       1.00      1.00      1.00        26
                    Coach - Cleanliness       1.00      0.91      0.95        23
                    Coach - Maintenance       0.88      1.00      0.94        15
                     Corruption/Bribery       1.00      1.00      1.00        18
                   Electrical Equipment       1.00      0.94      0.97        17
Facilities for Women with Special needs       1.00      1.00      1.00        26
                     Medical Assistance       1.00      1.00      1.00        19
                          Miscellaneous       1.00      1.00      1.00        21
                            Punctuality       1.00      1.00      1.00        19
                               Security

Neural Network

In [53]:
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=100, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=100, padding='post')

In [54]:
# Model 2: Neural Network (Simple LSTM)
vocab_size = 5000
embedding_dim = 64
lstm_units = 128

In [55]:
nn_model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=100),
    LSTM(lstm_units, return_sequences=True),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])



In [56]:
nn_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
nn_model.fit(X_train_pad, y_train_encoded, epochs=5, batch_size=32, validation_split=0.1)

Epoch 1/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 67ms/step - accuracy: 0.0611 - loss: 2.5678 - val_accuracy: 0.0673 - val_loss: 2.5678
Epoch 2/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.0877 - loss: 2.5655 - val_accuracy: 0.1058 - val_loss: 2.5656
Epoch 3/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.0857 - loss: 2.5643 - val_accuracy: 0.0385 - val_loss: 2.5641
Epoch 4/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 54ms/step - accuracy: 0.0797 - loss: 2.5624 - val_accuracy: 0.0865 - val_loss: 2.5607
Epoch 5/5
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.0934 - loss: 2.5558 - val_accuracy: 0.0385 - val_loss: 2.5603


<keras.src.callbacks.history.History at 0x1d2ef46b9d0>

In [57]:
# Predict and evaluate Neural Network
nn_preds = nn_model.predict(X_test_pad)
nn_preds_labels = nn_preds.argmax(axis=1)
nn_accuracy = accuracy_score(y_test_encoded, nn_preds_labels)
nn_report = classification_report(y_test_encoded, nn_preds_labels, target_names=label_encoder.classes_)

print("Neural Network Accuracy:", nn_accuracy)
print("Neural Network Classification Report:\n", nn_report)

[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Neural Network Accuracy: 0.06923076923076923
Neural Network Classification Report:
                                          precision    recall  f1-score   support

                                Bedroll       0.00      0.00      0.00        13
            Catering & Vending Services       0.00      0.00      0.00        26
                    Coach - Cleanliness       0.00      0.00      0.00        23
                    Coach - Maintenance       0.00      0.00      0.00        15
                     Corruption/Bribery       0.00      0.00      0.00        18
                   Electrical Equipment       0.00      0.00      0.00        17
Facilities for Women with Special needs       0.00      0.00      0.00        26
                     Medical Assistance       0.00      0.00      0.00        19
                          Miscellaneous       0.00      0.00      0.00        21
                            Punct

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [61]:
import numpy as np

# Function to preprocess the input text
def preprocess_input(text):
    # Convert text to sequences
    sequences = tokenizer.texts_to_sequences([text])
    # Pad sequences to the same length as used in training
    padded_sequence = pad_sequences(sequences, maxlen=100, padding='post')
    return padded_sequence

# Function to predict the category
def predict_category(text):
    # Preprocess the input text
    processed_text = preprocess_input(text)
    # Predict the category using the trained model
    prediction = nn_model.predict(processed_text)
    # Get the index of the category with the highest probability
    predicted_index = np.argmax(prediction)
    # Convert index back to category label
    predicted_category = label_encoder.inverse_transform([predicted_index])
    return predicted_category[0]

# Example of getting user input and predicting the category
user_input = input("Enter your complaint: ")
predicted_category = predict_category(user_input)
print(f"Predicted Category: {predicted_category}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
Predicted Category: Staff Behaviour
