In [35]:
import pandas as pd
import nltk

In [36]:
#Importing the csv file
news=pd.read_csv('news.csv')

In [37]:
#Exploratory Data Analysis
news.head()

Unnamed: 0,Category,News
0,ca,"The company put in an aggressive bid of Rs 6,0..."
1,notca,Reliance Capital today announced the appointme...
2,notca,Reports have emerged that Deloitte - the statu...
3,ca,HG Infra Engineering today fixed a price band ...
4,ca,US private equity firm Blackstone Group LP is ...


In [38]:
#Exploratory Data Analysis
news.describe()

Unnamed: 0,Category,News
count,125,125
unique,2,125
top,ca,Man Industries (India) has bought Zero Coupon ...
freq,93,1


In [39]:
#Exploratory Data Analysis
news.groupby('Category').describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ca,93,93,Man Industries (India) has bought Zero Coupon ...,1
notca,32,32,"Ashok Haldia, MD & CEO, PTC India Financial Se...",1


In [40]:
import string
from nltk.corpus import stopwords

In [41]:
#Function to clean the text and pre processing it
def text_process(text):
    #Remove the punctuation
    #Remove the stopwords
    #Return list of clean text
    text=''.join([char for char in text if char not in string.punctuation])
    return [word for word in text.split() if word.lower() not in stopwords.words('english')]

In [42]:
#Verifying if the text processing is working
news['News'].head(5).apply(text_process)

0    [company, put, aggressive, bid, Rs, 6000, cror...
1    [Reliance, Capital, today, announced, appointm...
2    [Reports, emerged, Deloitte, statutory, audito...
3    [HG, Infra, Engineering, today, fixed, price, ...
4    [US, private, equity, firm, Blackstone, Group,...
Name: News, dtype: object

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [44]:
#Setting up the pipeline
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [69]:
#Training and Test split
news_train,news_test,category_train,category_test=train_test_split(news['News'],news['Category'],test_size=0.4)
print(len(news_train), len(news_test), len(news_train) + len(news_test))

75 50 125


In [70]:
pipeline.fit(news_train,category_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x12533cca0>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [71]:
predictions = pipeline.predict(news_test)

In [72]:
from sklearn.metrics import classification_report
print(classification_report(category_test,predictions,))

precision    recall  f1-score   support

          ca       0.70      1.00      0.82        35
       notca       0.00      0.00      0.00        15

    accuracy                           0.70        50
   macro avg       0.35      0.50      0.41        50
weighted avg       0.49      0.70      0.58        50

