In [26]:
#importing Modules

import re
import pandas as pd
import json
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#For Data cleansing of HTML text

def clean_text(text):    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)
    text = re.sub(r"\xa0", "", text)
    
    # convert text to lowercase
    text = text.strip().lower()
    
    # replace punctuation characters with spaces
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, " ") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)

    return text

In [3]:
#Reading and Deviding data into train and test

df = pd.read_json("train.json")
X_train, X_test, y_train, y_test = train_test_split(df['petition_description'],df['petition_category'] , test_size=0.10, random_state=42)

In [42]:
# Transform each text into a vector of word counts

vectorizer = CountVectorizer(stop_words="english",
                             preprocessor=clean_text)

In [43]:
# Generating Vectors from text
training_features = vectorizer.fit_transform(X_train)    
test_features = vectorizer.transform(X_test)


In [59]:
# Training with Linear support vector Machine
model = LinearSVC(random_state=42)
model

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [60]:
#Training model with training features

model.fit(training_features, y_train)
y_pred = model.predict(test_features)



In [106]:
# Evaluation for accuracy
acc = accuracy_score(y_test, y_pred)
print("Accuracy with Linear SVM is ", acc*100)


Accuracy with Linear SVM is  92.24397590361446


In [62]:
#Choosing another approach XGBoost Classifier with different parameters

model_Boost = XGBClassifier(random_state=32,learning_rate=0.05,max_depth=3,booster='gbtree',objective='multi:softmax')


In [63]:
#Training model with training features

model_Boost.fit(training_features, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='multi:softprob', random_state=32,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [107]:
y_pred = model_Boost.predict(test_features)
acc = accuracy_score(y_test, y_pred)
print("Accuracy with XGBoost Classifier is ", acc*100)


Accuracy with XGBoost Classifier is  92.24397590361446


In [88]:
#Checking algorithm with live real data from website

sample = """Hello,

We at F-Block Rama Vihar Street No-13 Near Omkar Dham Mandir Delhi 110081, almost 1000+ members of middle-classs families living in apartments developed by a builder. These apartment flats were bought by these families from their lifetime savings from their hardship earning. However, life here is very challenging because of lack of proper roads and sewage disposal facilities.

There is no sewage or drainage system in the locality. Sewage water stagnation near our homes is a big threat to the health of children and elders in these colonies. We need both storm water drainage and sewage drainage systems to be put in place.

Please Provide us drinking water, drainage, and road facility in F-Block Rama Vihar Street No-13 Near Omkar Dham Mandir Delhi 110081

We request the corporation commissioner to take immediate action on this issue and provide us roads, drinking water and drainage.

Sincerely,
Neeraj Yadav

"""

In [91]:
#Predicting with XGBoost Classifier
sampleOutput = model_Boost.predict(vectorizer.transform([sample]))
sampleOutput

array(['health care'], dtype=object)

In [92]:
#Predicting with Linear SVM

sampleOutput1 = model.predict(vectorizer.transform([sample]))
sampleOutput1

#Here XGBoost Algo is giving more accuracy and correct predictions

array(['environment issue'], dtype=object)

In [93]:
#Working with validation set using XGBoost Classifier

valid=pd.read_json('validation.json')
validation=valid['petition_description']   
valid_features = vectorizer.transform(validation)
valOutput = model_Boost.predict(valid_features)
valOutput




array(['tax', 'tax', 'tax', ..., 'infrastructure', 'infrastructure',
       'infrastructure'], dtype=object)

In [104]:
output['petition_category']=pd.DataFrame(valOutput, columns=['petition_category'])

output['petition_id']=valid['petition_id']

output.to_csv('Cout1.csv', sep=',',index=False)