In [141]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english')

In [142]:
df=pd.read_csv("/home/ghost/Jupyter notebook/NLP project/news_dump.csv")
Tags=pd.read_csv("/home/ghost/Jupyter notebook/NLP project/tag_list.csv")

In [143]:
df.dropna(inplace=True)

In [144]:
 pd.options.display.max_colwidth=100

# Removing hindi data

In [145]:
from langdetect import detect
df['lang']=df['title'].apply(detect)

In [146]:
df=df[df['lang']!='hi']

In [147]:
#convert content column to lower case
df['title']=df['title'].apply(lambda x: x.lower())

In [148]:
#converting tags in df as list (format=[tag1,tag2,...])
tg=[]
for i in df['tags']:
    tg.append(i.split("|"))

tags=[]
for i in tg:
    tags.extend(i)

tags=pd.unique(tags).tolist()

In [149]:
for i in range(len(tags)):
    tags[i]=tags[i].lower()

In [150]:
tg=[[j.lower() for j in i] for i in tg]                    #converting tags in lowercase

In [151]:
df['tags']=tg

In [152]:
#split sentence by space
tokenizer = RegexpTokenizer(r'\w+')
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

In [153]:
#removing stopwords
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop])

# remove numbers
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: [item for item in x if not item.isdigit()])

#to join tokens together as one sentence
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: " ".join(x))

## One Hot Coding

In [154]:
from sklearn.preprocessing import MultiLabelBinarizer

In [155]:
MLB=MultiLabelBinarizer(classes=tags)

In [156]:
OHC_array=MLB.fit_transform(df['tags'])

In [157]:
OHC=pd.DataFrame(OHC_array,columns=tags)

In [158]:
print(OHC.shape)

print(df.shape)

(24698, 397)
(24698, 7)


In [159]:
print(df.index[-1])

print(OHC.index[-1])

# both Dataframe has different indexes

47627
24697


In [160]:
Data=pd.concat([df.set_index(OHC.index), OHC], axis=1, sort=False)   # ".set_index function change or set index of a Dataframe"

In [161]:
Data.shape

(24698, 404)

In [162]:
Data

Unnamed: 0,newsID,title,date,media_house,tags,lang,tokenized_sents,earthquake,hospital,afghanistan,...,nitin gadakari,venkaih naidu,crime by minors,indian stock market,foreign relations,samajwadi party (sp),bahujan samajwadi party (bsp),crime against childrens,barrak valley,dynastic politics
0,47707,japan rocked by magnitude 6.7 earthquake,2018-09-06,Tehelka,[earthquake],en,japan rocked magnitude earthquake,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47708,veteran actor dilip kumar admitted to hospital due to chest infection,2018-09-06,Tehelka,[hospital],en,veteran actor dilip kumar admitted hospital due chest infection,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,47706,peshawar-lahore-london: this novel follows four sisters who have to shake off past family glory ...,2018-09-06,scroll.in,"[afghanistan, london]",en,peshawar lahore london novel follows four sisters shake past family glory najma yusufi first nov...,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,47700,"if pakistan stops terrorism, we too will act like neeraj chopra, says army chief bipin rawat 7 m...",2018-09-06,scroll.in,"[terrorism, pakistan, terror]",en,pakistan stops terrorism act like neeraj chopra says army chief bipin rawat minutes ago,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,47701,‘motive was to get gallantry medal’: cbi indicts manipur police for ‘cold blooded murder’,2018-09-06,scroll.in,"[police, murder]",en,motive get gallantry medal cbi indicts manipur police cold blooded murder,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,47702,interview: ‘the government is saying that anyone who supports dalits will be jailed’,2018-09-06,scroll.in,"[dalit, government]",en,interview government saying anyone supports dalits jailed,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,47704,"‘i’m an incurable romantic’: shefali shah on tackling love at all ages in ‘once again’ the film,...",2018-09-06,scroll.in,[netflix],en,incurable romantic shefali shah tackling love ages film co starring neeraj kabi released netflix...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,47705,"in ‘happy bubble’ of sonipat’s elite universities, concerns persist about free speech and divers...",2018-09-06,scroll.in,[student],en,happy bubble sonipat elite universities concerns persist free speech diversity overwhelming majo...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,47697,cover story in the spotlight the daily fix: delhi rally by farmers and workers shines light on t...,2018-09-06,scroll.in,"[delhi, farmer]",en,cover story spotlight daily fix delhi rally farmers workers shines light need minimum wage shoai...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,47698,"the big news: supreme court to decide on validity of section 377 today, and 9 other top stories ...",2018-09-06,scroll.in,"[supreme court, 377]",en,big news supreme court decide validity section today top stories pinned,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Splitting train and test data

In [163]:
from sklearn.model_selection import train_test_split

In [164]:
x_train,x_test,y_train,y_test=train_test_split(Data['title'],Data.drop(['newsID','title','tokenized_sents','tags','date','media_house','lang'],axis=1),test_size=0.33, random_state=42)

## building model

In [165]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

## Model 1

In [166]:
from sklearn.linear_model import LogisticRegression


In [167]:
classifier1 = Pipeline([
('vectorizer', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(LogisticRegression()))])

In [168]:
classifier1.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [169]:
predicted1 = classifier1.predict(x_test)

In [170]:
print(accuracy_score(y_test, predicted1))

0.42043920991289413


## Model 2

In [171]:
from sklearn.tree import DecisionTreeClassifier

In [172]:
classifier2 = Pipeline([
('vectorizer', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(DecisionTreeClassifier()))])

In [173]:
classifier2.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...n_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))])

In [174]:
predicted2 = classifier2.predict(x_test)

In [175]:
print(accuracy_score(y_test, predicted2))

0.9520304257146363


## Model 3

In [176]:
from sklearn.ensemble import RandomForestClassifier

In [177]:
classifier3 = Pipeline([
('vectorizer', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(DecisionTreeClassifier()))])

In [178]:
classifier3.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...n_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))])

In [179]:
predicted3 = classifier3.predict(x_test)

In [180]:
print(accuracy_score(y_test, predicted3))

0.9504355293828978


## Model 4

In [186]:
from sklearn.svm import LinearSVC

In [187]:
classifier4 = Pipeline([
('vectorizer', CountVectorizer()),
('tfidf', TfidfTransformer()),
('clf', OneVsRestClassifier(LinearSVC()))])

In [188]:
classifier4.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

## Predictions and metrics

In [189]:
predicted4 = classifier4.predict(x_test)

In [190]:
print(accuracy_score(y_test, predicted4))

0.9228315544105018


# Decision tree algorithm has highest accuracy