In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

data = pd.read_csv('Data.csv')


In [30]:
from nltk.corpus import stopwords
import re

In [3]:
data.drop('Unnamed: 0', inplace=True, axis=1)

In [4]:
data

Unnamed: 0,message,target
0,I'm writing to raise a serious concern about t...,1
1,"Hello, there's an ongoing issue with the water...",1
2,"Greetings, I'd like to report a problem with t...",1
3,"Hi, we've been facing a persistent problem wit...",1
4,I'm concerned about the water quality in our h...,1
...,...,...
2009,Residents have expressed growing frustration o...,0
2010,The roads in our region have fallen into a sta...,0
2011,The road conditions are an issue of paramount ...,0
2012,I would appreciate your prompt attention to th...,0


In [35]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    
    text = text.lower() # lowercase text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
    
    return text

In [36]:
data['message']=data['message'].apply(clean_text)

In [53]:
X = data['message']
y = data['target']

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
X_train

261     deteriorating cleanliness pond causing spread ...
746     drains good  cant take away water rains  thats...
1380    potholes posing significant challenges commute...
240     unclean water pond unfit animals drink  posing...
1644    hope listen concerns lack street lights making...
                              ...                        
1130    attention matter greatly appreciated  hope wor...
1294    venting  finding way end neverending cycle roa...
860     urgently implore authorities epedite clean gan...
1459    find stuck dilemma daily routine revolves arou...
1126    please provide clear transparent eplanation re...
Name: message, Length: 1611, dtype: object

In [77]:
# Create a CountVectorizer to convert text data into numerical features.
vectorizer = CountVectorizer(ngram_range=(1,3))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [80]:
X_test_vec

<403x27884 sparse matrix of type '<class 'numpy.int64'>'
	with 12011 stored elements in Compressed Sparse Row format>

In [55]:
X_train_vec

<1611x27884 sparse matrix of type '<class 'numpy.int64'>'
	with 67473 stored elements in Compressed Sparse Row format>

In [56]:
len(vectorizer.vocabulary_)

27884

In [57]:
X_train_vec.shape

(1611, 27884)

In [115]:
import tensorflow as tf
#from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D,Bidirectional
from sklearn.model_selection import train_test_split
#from np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from nltk import word_tokenize


In [58]:
# Create and train a Multinomial Naive Bayes classifier.
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)


In [62]:
y_test

1198    0
526     1
393     1
1407    0
433     1
       ..
1793    0
1111    0
693     1
1492    0
921     1
Name: target, Length: 403, dtype: int64

In [59]:
y_pred = classifier.predict(X_test_vec)

In [61]:
len(y_pred)

403

In [47]:
accuracy = accuracy_score(y_test, y_pred)

In [48]:
accuracy

1.0

In [63]:
y_pred

array([0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1,
       0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1,

In [78]:
vectorizer.vocabulary_

{'deteriorating': 6442,
 'cleanliness': 3559,
 'pond': 17557,
 'causing': 2924,
 'spread': 23179,
 'waterborne': 27014,
 'diseases': 6805,
 'affecting': 699,
 'health': 10442,
 'residents': 20278,
 'deteriorating cleanliness': 6443,
 'cleanliness pond': 3590,
 'pond causing': 17571,
 'causing spread': 3079,
 'spread waterborne': 23185,
 'waterborne diseases': 27015,
 'diseases affecting': 6806,
 'affecting health': 748,
 'health residents': 10518,
 'deteriorating cleanliness pond': 6450,
 'cleanliness pond causing': 3593,
 'pond causing spread': 17573,
 'causing spread waterborne': 3081,
 'spread waterborne diseases': 23186,
 'waterborne diseases affecting': 27016,
 'diseases affecting health': 6807,
 'affecting health residents': 750,
 'drains': 7119,
 'good': 9778,
 'cant': 2752,
 'take': 24260,
 'away': 2009,
 'water': 26647,
 'rains': 19098,
 'thats': 24482,
 'big': 2362,
 'problem': 18274,
 'drains good': 7126,
 'good cant': 9779,
 'cant take': 2780,
 'take away': 24275,
 'away wa

In [113]:
def predict():
    message=input("enter your Grievance: ")
    df=pd.DataFrame({'message':[message]})
    df['message']=df['message'].apply(clean_text)
    message=df['message']
    message=vectorizer.transform(message)
    y=classifier.predict(message)
    #L=['Dept of road, transport and highways','Dept of Jal shakti']
    return y[0]
    #return y
    

In [96]:
predict()

enter your Grievance: clean river ganga project is very slow, please clean the river asap


'Dept of Jal shakti'

In [114]:
predict()

enter your Grievance: the toll prices are very high, please reduce the toll prices


0

In [98]:
predict()

enter your Grievance: there are too many potholes on the road, please fix these potholes asap


'Dept of road, transport and highways'

In [99]:
predict()

enter your Grievance: the area around water storage places is very unclean and untidy, please clean those areas


'Dept of Jal shakti'

In [100]:
predict()

enter your Grievance: the toll prices are very high


'Dept of road, transport and highways'

In [101]:
predict()


enter your Grievance: i was wrongfully charged , i did not break any traffic signals


'Dept of road, transport and highways'

In [102]:
predict()

enter your Grievance: the water supply to my house is very less, my house requires more water supply


'Dept of Jal shakti'

In [103]:
predict()

enter your Grievance: I hope you'll take our concerns about the missing street lights seriously. It's affecting our safety and quality of life.


'Dept of road, transport and highways'

In [104]:
classifier

In [106]:
import pickle

In [107]:
with open("dept_MODEL_NB.pkl",'wb') as f:
    pickle.dump(classifier,f)

In [108]:
model=pickle.load(open("dept_MODEL_NB.pkl",'rb'))

In [109]:
model

In [111]:
vectorizer

In [112]:
with open("vectorizer.pkl",'wb') as file:
    pickle.dump(vectorizer,file)