In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
#nltk.download('stopwords')
stop = stopwords.words('english')

In [2]:
df=pd.read_csv("/home/ghost/Jupyter notebook/NLP project/news_dump.csv")
Tags=pd.read_csv("/home/ghost/Jupyter notebook/NLP project/tag_list.csv")

In [3]:
df.head()

Unnamed: 0,newsID,title,date,media_house,tags
0,47709,[email protected],2018-09-06,Tehelka,
1,47707,Japan rocked by magnitude 6.7 earthquake,2018-09-06,Tehelka,Earthquake
2,47708,Veteran actor Dilip Kumar admitted to hospital...,2018-09-06,Tehelka,Hospital
3,47706,Peshawar-Lahore-London: This novel follows fou...,2018-09-06,scroll.in,Afghanistan|London
4,47700,"If Pakistan stops terrorism, we too will act l...",2018-09-06,scroll.in,Terrorism|Pakistan|Terror


In [4]:
df.shape

(47628, 5)

In [5]:
df.dropna(inplace=True)

In [6]:
df.shape

(24841, 5)

In [7]:
# to see more title length
#pd.options.display.max_colwidth=100        

# Removing hindi data

In [8]:
from langdetect import detect
df['lang']=df['title'].apply(detect)

In [9]:
df=df[df['lang']!='hi']

In [10]:
#convert content column to lower case
df['title']=df['title'].apply(lambda x: x.lower())

In [11]:
#converting tags in df as list (format=[tag1,tag2,...])
tg=[]
for i in df['tags']:
    tg.append(i.split("|"))

tags=[]
for i in tg:
    tags.extend(i)

tags=pd.unique(tags).tolist()

In [12]:
# converting tag names in lower letters
for i in range(len(tags)):
    tags[i]=tags[i].lower()

In [13]:
tg=[[j.lower() for j in i] for i in tg]                    #converting tags in lowercase

In [14]:
df['tags']=tg

In [15]:
df.head()

Unnamed: 0,newsID,title,date,media_house,tags,lang
1,47707,japan rocked by magnitude 6.7 earthquake,2018-09-06,Tehelka,[earthquake],en
2,47708,veteran actor dilip kumar admitted to hospital...,2018-09-06,Tehelka,[hospital],en
3,47706,peshawar-lahore-london: this novel follows fou...,2018-09-06,scroll.in,"[afghanistan, london]",en
4,47700,"if pakistan stops terrorism, we too will act l...",2018-09-06,scroll.in,"[terrorism, pakistan, terror]",en
5,47701,‘motive was to get gallantry medal’: cbi indic...,2018-09-06,scroll.in,"[police, murder]",en


In [16]:
#split sentence by space
tokenizer = RegexpTokenizer(r'\w+')
df['tokenized_sents'] = df.apply(lambda row: tokenizer.tokenize(row['title']), axis=1)

In [17]:
#removing stopwords
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: [item for item in x if item not in stop])

# removing numbers
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: [item for item in x if not item.isdigit()])

#to join tokens together as one sentence
df['tokenized_sents'] = df['tokenized_sents'].apply(lambda x: " ".join(x))

In [18]:
df.head()

Unnamed: 0,newsID,title,date,media_house,tags,lang,tokenized_sents
1,47707,japan rocked by magnitude 6.7 earthquake,2018-09-06,Tehelka,[earthquake],en,japan rocked magnitude earthquake
2,47708,veteran actor dilip kumar admitted to hospital...,2018-09-06,Tehelka,[hospital],en,veteran actor dilip kumar admitted hospital du...
3,47706,peshawar-lahore-london: this novel follows fou...,2018-09-06,scroll.in,"[afghanistan, london]",en,peshawar lahore london novel follows four sist...
4,47700,"if pakistan stops terrorism, we too will act l...",2018-09-06,scroll.in,"[terrorism, pakistan, terror]",en,pakistan stops terrorism act like neeraj chopr...
5,47701,‘motive was to get gallantry medal’: cbi indic...,2018-09-06,scroll.in,"[police, murder]",en,motive get gallantry medal cbi indicts manipur...


## One Hot Coding

In [19]:
from sklearn.preprocessing import MultiLabelBinarizer

In [20]:
MLB=MultiLabelBinarizer(classes=tags)

In [21]:
OHC_array=MLB.fit_transform(df['tags'])

In [22]:
OHC=pd.DataFrame(OHC_array,columns=tags)

In [23]:
print(OHC.shape)

print(df.shape)

(24698, 397)
(24698, 7)


In [24]:
print(df.index[-1])

print(OHC.index[-1])

# both Dataframe has different indexes

47627
24697


In [25]:
# merging both dataframes
Data=pd.concat([df.set_index(OHC.index), OHC], axis=1, sort=False)   # ".set_index function change or set index of a Dataframe"

In [26]:
Data.shape

(24698, 404)

In [27]:
Data

Unnamed: 0,newsID,title,date,media_house,tags,lang,tokenized_sents,earthquake,hospital,afghanistan,...,nitin gadakari,venkaih naidu,crime by minors,indian stock market,foreign relations,samajwadi party (sp),bahujan samajwadi party (bsp),crime against childrens,barrak valley,dynastic politics
0,47707,japan rocked by magnitude 6.7 earthquake,2018-09-06,Tehelka,[earthquake],en,japan rocked magnitude earthquake,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,47708,veteran actor dilip kumar admitted to hospital...,2018-09-06,Tehelka,[hospital],en,veteran actor dilip kumar admitted hospital du...,0,1,0,...,0,0,0,0,0,0,0,0,0,0
2,47706,peshawar-lahore-london: this novel follows fou...,2018-09-06,scroll.in,"[afghanistan, london]",en,peshawar lahore london novel follows four sist...,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,47700,"if pakistan stops terrorism, we too will act l...",2018-09-06,scroll.in,"[terrorism, pakistan, terror]",en,pakistan stops terrorism act like neeraj chopr...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,47701,‘motive was to get gallantry medal’: cbi indic...,2018-09-06,scroll.in,"[police, murder]",en,motive get gallantry medal cbi indicts manipur...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,47702,interview: ‘the government is saying that anyo...,2018-09-06,scroll.in,"[dalit, government]",en,interview government saying anyone supports da...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,47704,‘i’m an incurable romantic’: shefali shah on t...,2018-09-06,scroll.in,[netflix],en,incurable romantic shefali shah tackling love ...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,47705,in ‘happy bubble’ of sonipat’s elite universit...,2018-09-06,scroll.in,[student],en,happy bubble sonipat elite universities concer...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,47697,cover story in the spotlight the daily fix: de...,2018-09-06,scroll.in,"[delhi, farmer]",en,cover story spotlight daily fix delhi rally fa...,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,47698,the big news: supreme court to decide on valid...,2018-09-06,scroll.in,"[supreme court, 377]",en,big news supreme court decide validity section...,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Splitting train and test data

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
x_train,x_test,y_train,y_test=train_test_split(Data['title'],Data.drop(['newsID','title','tokenized_sents','tags','date','media_house','lang'],axis=1),test_size=0.33, random_state=42)

## building model

In [33]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score

## Model 1

In [34]:
from sklearn.linear_model import LogisticRegression


In [35]:
classifier1 = Pipeline([('vectorizer', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', OneVsRestClassifier(LogisticRegression()))])

In [36]:
classifier1.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...te=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False),
          n_jobs=None))])

In [37]:
predicted1 = classifier1.predict(x_test)

In [38]:
print(accuracy_score(y_test, predicted1))

0.42043920991289413


## Model 2

In [39]:
from sklearn.tree import DecisionTreeClassifier

In [40]:
classifier2 = Pipeline([('vectorizer', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', OneVsRestClassifier(DecisionTreeClassifier()))])

In [41]:
classifier2.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...n_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))])

In [42]:
predicted2 = classifier2.predict(x_test)

In [43]:
print(accuracy_score(y_test, predicted2))

0.9528892160471107


## Model 3

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
classifier3 = Pipeline([('vectorizer', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', OneVsRestClassifier(DecisionTreeClassifier()))])

In [46]:
classifier3.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...n_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          n_jobs=None))])

In [47]:
predicted3 = classifier3.predict(x_test)

In [48]:
print(accuracy_score(y_test, predicted3))

0.9530119003803215


## Model 4

In [49]:
from sklearn.svm import LinearSVC

In [50]:
classifier4 = Pipeline([('vectorizer', CountVectorizer()),('tfidf', TfidfTransformer()),('clf', OneVsRestClassifier(LinearSVC()))])

In [51]:
classifier4.fit(x_train,y_train)

  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))
  str(classes[c]))


Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ..._class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
          n_jobs=None))])

## Predictions and metrics

In [52]:
predicted4 = classifier4.predict(x_test)

In [53]:
print(accuracy_score(y_test, predicted4))

0.9228315544105018


# Decision tree algorithm has highest accuracy