In [46]:
import pandas as pd
import nltk

In [47]:
# Load the labelled news dataset from the CSV one directory above the notebook
news = pd.read_csv(filepath_or_buffer='../news.csv')

In [48]:
# Exploratory data analysis: peek at the first five rows
news.head(n=5)

Unnamed: 0,Category,News
0,ca,"The company put in an aggressive bid of Rs 6,0..."
1,notca,Reliance Capital today announced the appointme...
2,notca,Reports have emerged that Deloitte - the statu...
3,ca,HG Infra Engineering today fixed a price band ...
4,ca,US private equity firm Blackstone Group LP is ...


In [49]:
# Exploratory data analysis: summary statistics for the DataFrame's columns
news.describe()

Unnamed: 0,Category,News
count,1061,1061
unique,2,940
top,notca,The private sector bank posted a net loss of R...
freq,632,4


In [50]:
# Exploratory data analysis: the same summary statistics, broken down per Category label
news.groupby(by='Category').describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ca,429,411,"The record date has been fixed at October 7 , ...",4
notca,632,529,The private sector bank posted a net loss of R...,4


In [51]:
import string
from nltk.corpus import stopwords

In [52]:
#Function to clean the text and pre processing it
# Cache for the stop-word set so the NLTK corpus is read once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, building it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(text, stop_words=None):
    """Strip punctuation and stop words from ``text`` and return the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stop-word list
        is used (loaded lazily and cached after the first call).

    Returns
    -------
    list of str
        Whitespace-separated tokens of ``text`` with every character in
        ``string.punctuation`` removed and stop words filtered out. Tokens keep
        their original casing; only the stop-word comparison is case-insensitive.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    # Remove all ASCII punctuation characters in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Membership is tested against a set (or the caller's collection), so the
    # stop-word list is no longer rebuilt and linearly scanned for every token.
    return [word for word in text.split() if word.lower() not in stop_words]

In [53]:
# Sanity check: run the cleaning function over the first five news items
news['News'].iloc[:5].apply(text_process)

0    [company, put, aggressive, bid, Rs, 6000, cror...
1    [Reliance, Capital, today, announced, appointm...
2    [Reports, emerged, Deloitte, statutory, audito...
3    [HG, Infra, Engineering, today, fixed, price, ...
4    [US, private, equity, firm, Blackstone, Group,...
Name: News, dtype: object

In [54]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [55]:
# Build the three-stage text classification pipeline
pipeline = Pipeline(
    steps=[
        # raw strings -> token counts, tokenised by text_process
        ('bow', CountVectorizer(analyzer=text_process)),
        # token counts -> TF-IDF weighted scores
        ('tfidf', TfidfTransformer()),
        # Naive Bayes classifier fitted on the TF-IDF vectors
        ('classifier', MultinomialNB()),
    ]
)

In [56]:
#Training and Test split
# 60% of the rows go to training and 40% to testing. A fixed random_state makes
# the shuffle deterministic, so the split (and every metric computed from it)
# is the same after each Restart Kernel -> Run All.
news_train, news_test, category_train, category_test = train_test_split(
    news['News'],
    news['Category'],
    test_size=0.4,
    random_state=42,
)
# Sizes of both partitions and their total, as a quick check that no rows were lost
print(len(news_train), len(news_test), len(news_train) + len(news_test))

636 425 1061


In [57]:
# Fit every pipeline stage on the training texts and their category labels
pipeline.fit(X=news_train, y=category_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x12206c670>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [58]:
# Inspect the container type of the held-out test inputs produced by train_test_split
type(news_test)

pandas.core.series.Series

In [59]:
# Classify a single hand-written news snippet (wrapped in a one-element Series)
predictions = pipeline.predict(
    pd.Series([
        'The government may soon ban PlayerUnknowns Battlegrounds (PUBG) in India. '
        'Reports suggesting a ban on 275 apps have started to surface online a month '
        'after the Indian government banned 59 Chinese apps, including popular ones '
        'like TikTok, UC Browser, citing data security concerns.'
    ])
)

In [60]:
# Show the label(s) returned by the most recent pipeline.predict call
print(predictions)

['notca']


In [61]:
# Classify a second hand-written snippet, this one describing a dividend announcement
predictions = pipeline.predict(
    pd.Series([
        'The board of Pfizer on Monday declared a special dividend of ₹320 a share '
        'of ₹10 each (3,200 per cent ) for the financial year ended March 31, 2020. '
        'The dividend will be paid to the eligible members of the company (whose '
        'names appear in the register of members as on the record date May 8) '
        'electronically through bank transfer on May 19.'
    ])
)

In [62]:
# Show the label(s) returned by the most recent pipeline.predict call
print(predictions)

['ca']


In [63]:
# Predict a category for every document in the held-out test set
predictions = pipeline.predict(X=news_test)

In [64]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 of the test-set predictions against the true labels
print(classification_report(y_true=category_test, y_pred=predictions))

precision    recall  f1-score   support

          ca       0.98      0.93      0.96       168
       notca       0.96      0.99      0.97       257

    accuracy                           0.97       425
   macro avg       0.97      0.96      0.97       425
weighted avg       0.97      0.97      0.97       425

