In [1]:
import pandas as pd
import random
import nltk
import string
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.corpus import wordnet
import re

In [2]:
df = pd.read_csv('data.csv')
df.drop(columns='Unnamed: 0', inplace = True)
df

Unnamed: 0,Text,Category,Time,Location
0,"Shri. Adesh Gupta ji President, BJP Delhi has ...",accident,2018-01-01,Laxmi Nagar
1,"Murders, rapes, cyber crime: How Covid affecte...",cyber crime,2018-01-01,Bhajanpura
2,The father killed his 10-year-old son along wi...,murder,2018-01-01,Mundka
3,RT @bainsindian: Madam Shiv Senni ho gayi ho🤔\...,accident,2018-01-01,Sadar Bazar
4,» Average 77 #rape cases daily reported in #In...,crime against women,2018-01-01,Shakarpur
...,...,...,...,...
24510,Respected @narendramodi ji we consider u as ou...,accident,2018-12-31,Nihal Vihar
24511,RT @matrixxmedia: An Indian woman allegedly as...,murder,2018-12-31,Nihal Vihar
24512,Headless Body of Man Found in Plastic Bag in N...,murder,2018-12-31,Bawana
24513,@DigitalShakti @NCWIndia @Facebook @AutobotInf...,cyber crime,2018-12-31,Bhajanpura


In [3]:
wordnet_lemmatizer = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
stopwords+=list(string.punctuation)
stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [4]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def preprocess(sent):
    clean_text = []
    text = sent.split(" ")
    for i,s in enumerate(text):
        if s.startswith('http') or s.startswith('@'):
            text[i] = "";
    if(text[0]=='RT'):
        text[0]=""
        text[1]=""
    sent = " ".join(text)
    sent.strip()
#     print(sent)
    words = word_tokenize(sent)
    pos = pos_tag(words)
    for i,word in enumerate(words):
        if word not in stopwords or word.lower() not in stopwords:
            clean_word = wordnet_lemmatizer.lemmatize(word,pos=get_simple_pos(pos[i][1]))
            clean_text.append(clean_word.lower())
    return " ".join(clean_text)

In [5]:
df['clean_text'] = df['Text'].apply(lambda x:preprocess(x))
df

Unnamed: 0,Text,Category,Time,Location,clean_text
0,"Shri. Adesh Gupta ji President, BJP Delhi has ...",accident,2018-01-01,Laxmi Nagar,shri adesh gupta ji president bjp delhi presen...
1,"Murders, rapes, cyber crime: How Covid affecte...",cyber crime,2018-01-01,Bhajanpura,murders rape cyber crime how covid affect crim...
2,The father killed his 10-year-old son along wi...,murder,2018-01-01,Mundka,the father kill 10-year-old son along wife gir...
3,RT @bainsindian: Madam Shiv Senni ho gayi ho🤔\...,accident,2018-01-01,Sadar Bazar,madam shiv senni ho gayi ho🤔 mea already reach...
4,» Average 77 #rape cases daily reported in #In...,crime against women,2018-01-01,Shakarpur,» average 77 rape case daily report india 2020...
...,...,...,...,...,...
24510,Respected @narendramodi ji we consider u as ou...,accident,2018-12-31,Nihal Vihar,respected ji consider u head head family i ’ a...
24511,RT @matrixxmedia: An Indian woman allegedly as...,murder,2018-12-31,Nihal Vihar,an indian woman allegedly assault rap mumbai f...
24512,Headless Body of Man Found in Plastic Bag in N...,murder,2018-12-31,Bawana,headless body man found plastic bag navi mumba...
24513,@DigitalShakti @NCWIndia @Facebook @AutobotInf...,cyber crime,2018-12-31,Bhajanpura,


In [7]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['Category'], random_state = 0)
tfidf_transformer = TfidfVectorizer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train)

# Naive Bayes

In [9]:
nb = MultinomialNB().fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_transformer.transform(X_test)
print('Training Accuracy: ',nb.score(X_train_tfidf, y_train),' Testing Accuracy: ',nb.score(X_test_tfidf, y_test))
# y_pred = clf.predict(X_test_tfidf)
# plot_confusion_matrix(clf,y_test,y_pred)
# plt.show()

Training Accuracy:  0.9607309909713913  Testing Accuracy:  0.957741882852015


# Logistic Regression

In [15]:
lr = LogisticRegression(solver='liblinear', C=10.0).fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_transformer.transform(X_test)
print('Training Accuracy: ',lr.score(X_train_tfidf, y_train),' Testing Accuracy: ',lr.score(X_test_tfidf, y_test))

Training Accuracy:  0.9781355379092788  Testing Accuracy:  0.9751998694729972


# Decision Tree

In [13]:
dt = DecisionTreeClassifier().fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_transformer.transform(X_test)
print('Training Accuracy: ',dt.score(X_train_tfidf, y_train),' Testing Accuracy: ',dt.score(X_test_tfidf, y_test))

Training Accuracy:  0.9781355379092788  Testing Accuracy:  0.973568281938326


# Random Forest

In [11]:
rf = RandomForestClassifier().fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_transformer.transform(X_test)
print('Training Accuracy: ',rf.score(X_train_tfidf, y_train),' Testing Accuracy: ',rf.score(X_test_tfidf, y_test))

Training Accuracy:  0.9781355379092788  Testing Accuracy:  0.9745472344591287


# SVM

In [12]:
svm = SVC().fit(X_train_tfidf, y_train)
X_test_tfidf = tfidf_transformer.transform(X_test)
print('Training Accuracy: ',svm.score(X_train_tfidf, y_train),' Testing Accuracy: ',svm.score(X_test_tfidf, y_test))

Training Accuracy:  0.9780811487000979  Testing Accuracy:  0.9755261869799314


In [16]:
# trend basis - All except murder
# location basis = murder