In [1]:
import pandas as pd
import nltk

In [2]:
# Load the labelled news articles from the CSV file into a DataFrame
news = pd.read_csv('news.csv')

In [3]:
#Exploratory Data Analysis: preview the first rows and the available columns
news.head()

Unnamed: 0,Category,News
0,ca,"The company put in an aggressive bid of Rs 6,0..."
1,notca,Reliance Capital today announced the appointme...
2,notca,Reports have emerged that Deloitte - the statu...
3,ca,HG Infra Engineering today fixed a price band ...
4,ca,US private equity firm Blackstone Group LP is ...


In [4]:
#Exploratory Data Analysis: per-column summary (count, unique, top, freq)
news.describe()

Unnamed: 0,Category,News
count,175,175
unique,2,175
top,ca,Ashok Leyland is quoting ex-bonus today. The c...
freq,93,1


In [5]:
#Exploratory Data Analysis: the same summary, computed separately for each Category value
news.groupby('Category').describe()

Unnamed: 0_level_0,News,News,News,News
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ca,93,93,Zylog Systems is quoting ex-split today. The c...,1
notca,82,82,RBI has approved the restructuring proposal of...,1


In [6]:
import string
from nltk.corpus import stopwords

In [7]:
#Function that cleans a piece of text and tokenises it
def text_process(text):
    """Strip punctuation from ``text`` and return its non-stop-word tokens.

    Steps:
      1. Drop every character that appears in ``string.punctuation``.
      2. Split the remaining text on whitespace.
      3. Discard tokens whose lower-cased form is an NLTK English stop word.

    Args:
        text: The raw text to clean (an iterable of characters, normally a str).

    Returns:
        A list of the surviving tokens in their original order. Tokens keep
        their original casing; lower-casing is only used for the stop-word test.

    Note:
        Requires the NLTK ``stopwords`` corpus to be available locally.
    """
    no_punctuation = ''.join(char for char in text if char not in string.punctuation)
    # Look the stop words up once per call and keep them in a set: the lookup
    # is loop-invariant, so repeating it for every token only wastes time, and
    # set membership is O(1) instead of a linear scan of a list.
    stop_words = set(stopwords.words('english'))
    return [word for word in no_punctuation.split() if word.lower() not in stop_words]

In [8]:
#Sanity check: run text_process on the first five articles and inspect the resulting tokens
news['News'].head(5).apply(text_process)

0    [company, put, aggressive, bid, Rs, 6000, cror...
1    [Reliance, Capital, today, announced, appointm...
2    [Reports, emerged, Deloitte, statutory, audito...
3    [HG, Infra, Engineering, today, fixed, price, ...
4    [US, private, equity, firm, Blackstone, Group,...
Name: News, dtype: object

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [10]:
# Build the classification pipeline; the steps run in the order listed
pipeline = Pipeline(
    steps=[
        # strings to token integer counts, tokenised by text_process
        ('bow', CountVectorizer(analyzer=text_process)),
        # integer counts to weighted TF-IDF scores
        ('tfidf', TfidfTransformer()),
        # train on TF-IDF vectors w/ Naive Bayes classifier
        ('classifier', MultinomialNB()),
    ]
)

In [11]:
#Training and Test split
# Hold out 40% of the articles for testing. random_state is fixed so the same
# rows land in each split on every run; without it the split, and every metric
# computed from it, changes each time the notebook is re-executed.
news_train, news_test, category_train, category_test = train_test_split(
    news['News'],
    news['Category'],
    test_size=0.4,
    random_state=42,
)
# Print the train size, test size and their total as a quick check that no rows were lost
print(len(news_train), len(news_test), len(news_train) + len(news_test))

105 70 175


In [12]:
# Fit all pipeline steps on the training split; the fitted pipeline is displayed as the cell output
pipeline.fit(news_train, category_train)

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x123841f70>)),
                ('tfidf', TfidfTransformer()),
                ('classifier', MultinomialNB())])

In [13]:
# Predict a category for every article in the held-out test split
predictions = pipeline.predict(news_test)

In [14]:
from sklearn.metrics import classification_report
# Compare the true test labels with the predictions: precision, recall and f1-score per category
print(classification_report(category_test, predictions))

precision    recall  f1-score   support

          ca       0.76      1.00      0.86        37
       notca       1.00      0.64      0.78        33

    accuracy                           0.83        70
   macro avg       0.88      0.82      0.82        70
weighted avg       0.87      0.83      0.82        70

