# News Category Hackathon Solution

In [17]:
import pandas as pd

In [18]:
train_data = pd.read_excel("Data_Train.xlsx")

## Exploring Data

In [19]:
train_data.head(10)

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ‘...",1
4,"In global markets, gold prices edged up today ...",3
5,BEIJING: Chinese tech giant Huawei has announc...,1
6,Mumbai: India Inc's external commercial borrow...,3
7,"On Wednesday, Federal Reserve Chairman Jerome ...",3
8,What more can you give to the audience? I have...,2
9,"com, Arbaaz Khan spoke about getting back to D...",2


In [20]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
STORY      7628 non-null object
SECTION    7628 non-null int64
dtypes: int64(1), object(1)
memory usage: 119.3+ KB


In [21]:
train_data.SECTION = train_data.SECTION.astype('str')

In [22]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7628 entries, 0 to 7627
Data columns (total 2 columns):
STORY      7628 non-null object
SECTION    7628 non-null object
dtypes: object(2)
memory usage: 119.3+ KB


In [25]:
train_data.describe()

Unnamed: 0,STORY,SECTION
count,7628,7628
unique,7548,4
top,This story has been published from a wire agen...,1
freq,28,2772


In [27]:
train_data.drop_duplicates(inplace = True)

In [34]:
train_data.shape

(7551, 2)

In [36]:
train_data.groupby("SECTION").describe()

Unnamed: 0_level_0,STORY,STORY,STORY,STORY
Unnamed: 0_level_1,count,unique,top,freq
SECTION,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,1673,1673,Residents of Bhopal say industrial growth and ...,1
1,2731,2731,Amazon Echo integration is also present in Aud...,1
2,1914,1914,The 37-year-old actor said working on the thre...,1
3,1233,1233,Demand for shorter-maturity bonds is also high...,1


## Data Preprocessing

In [32]:
import nltk

In [51]:
from nltk.corpus import stopwords
import string

In [266]:
#stopwords.words('english')

In [110]:
all_punctuations = string.punctuation + '‘’,:”][],'

In [111]:
def punc_remover(raw_text):
    no_punct = "".join([i for i in raw_text if i not in all_punctuations])
    return no_punct
    

In [145]:
def stopword_remover(no_punc_text):
    words = no_punc_text.split()
    no_stp_words = " ".join([i for i in words if i not in stopwords.words('english')])
    
    return no_stp_words

In [163]:
lemmer = nltk.stem.WordNetLemmatizer()
def lem(words):
    return " ".join([lemmer.lemmatize(word,'v') for word in words.split()])

In [164]:
def text_cleaner(raw):
    cleaned_text = stopword_remover(punc_remover(raw))
    return lem(cleaned_text)


In [114]:
#text_cleaner("Hi! I am Amal. If you dont know about me. Look me up on LinkedIn")

In [267]:
#train_data['STORY'].head(5).apply(text_cleaner).values

In [167]:
train_data['CLEAN_STORY'] = train_data['STORY'].apply(text_cleaner)

In [268]:
#train_data.values

## Count Vectorizing

In [170]:
from sklearn.feature_extraction.text import CountVectorizer

In [195]:
bow_dictionary = CountVectorizer().fit(train_data['CLEAN_STORY'])

In [196]:
len(bow_dictionary.vocabulary_)

35189

In [197]:
bow_dictionary.vocabulary_

{'but': 7721,
 'painful': 23789,
 'huge': 16266,
 'reversal': 27207,
 'fee': 13456,
 'income': 16799,
 'unheard': 32978,
 'among': 4877,
 'private': 25482,
 'sector': 28421,
 'lenders': 19378,
 'essentially': 12752,
 'mean': 20746,
 'yes': 34967,
 'bank': 6228,
 'take': 31197,
 'grant': 15054,
 'structure': 30581,
 'loan': 19661,
 'deal': 10502,
 'pay': 24168,
 'account': 3976,
 'upfront': 33210,
 'book': 7230,
 'as': 5517,
 'borrowers': 7271,
 'turn': 32567,
 'defaulters': 10634,
 'tie': 31924,
 'fell': 13474,
 'crack': 9905,
 'gill': 14729,
 'vow': 34028,
 'shift': 28928,
 'safer': 27802,
 'practice': 25141,
 'amortize': 4884,
 'rather': 26406,
 'gills': 14734,
 'move': 21696,
 'mend': 20897,
 'past': 24070,
 'ways': 34275,
 'nasty': 22199,
 'surprise': 30965,
 'future': 14308,
 'this': 31792,
 'good': 14910,
 'news': 22490,
 'consider': 9493,
 'investors': 17459,
 'love': 19826,
 'clean': 8820,
 'image': 16601,
 'loathe': 19665,
 'uncertainties': 32829,
 'gain': 14392,
 'without': 3

In [184]:
train_data['CLEAN_STORY'][0]

'But painful huge reversal fee income unheard among private sector lenders Essentially mean Yes Bank take grant fee structure loan deal pay account upfront book As borrowers turn defaulters fee tie loan deal fell crack Gill vow shift safer account practice amortize fee income rather book upfront Gills move mend past ways mean nasty surprise future This good news consider investors love clean image loathe uncertainties But gain without pain promise strong stable balance sheet come sacrifice well Investors give hop phenomenal growth promise make Kapoor'

In [190]:
print(bow.transform([train_data['CLEAN_STORY'][0]]).shape)

(1, 35189)


In [192]:
print(bow.transform([train_data['CLEAN_STORY'][0]]))

  (0, 3976)	2
  (0, 4877)	1
  (0, 4884)	1
  (0, 5517)	1
  (0, 6134)	1
  (0, 6228)	1
  (0, 7230)	2
  (0, 7271)	1
  (0, 7721)	2
  (0, 8820)	1
  (0, 9129)	1
  (0, 9493)	1
  (0, 9905)	1
  (0, 10502)	2
  (0, 10634)	1
  (0, 12752)	1
  (0, 13456)	4
  (0, 13474)	1
  (0, 14308)	1
  (0, 14392)	1
  (0, 14729)	1
  (0, 14734)	1
  (0, 14760)	1
  (0, 14910)	1
  (0, 15054)	1
  :	:
  (0, 25482)	1
  (0, 25627)	2
  (0, 26406)	1
  (0, 27207)	1
  (0, 27774)	1
  (0, 27802)	1
  (0, 28421)	1
  (0, 28887)	1
  (0, 28928)	1
  (0, 30191)	1
  (0, 30571)	1
  (0, 30581)	1
  (0, 30965)	1
  (0, 31197)	1
  (0, 31792)	1
  (0, 31924)	1
  (0, 32567)	1
  (0, 32829)	1
  (0, 32978)	1
  (0, 33210)	2
  (0, 34028)	1
  (0, 34275)	1
  (0, 34373)	1
  (0, 34626)	1
  (0, 34967)	1


In [269]:
#bow_dictionary.get_feature_names()[25627]

In [201]:
bow = bow_dictionary.transform(train_data['CLEAN_STORY'])

In [205]:
print(bow.shape)

(7551, 35189)


In [207]:
bow.nnz

414966

In [211]:
sparsity = bow.nnz/(bow.shape[0] * bow.shape[1])
print(sparsity)

0.0015617126171266116


## TFIDF

In [213]:
from sklearn.feature_extraction.text import TfidfTransformer

In [215]:
tfidf_transformer = TfidfTransformer().fit(bow)

In [221]:
tfidf_transformer.idf_[bow_dictionary.vocabulary_['university']]

5.560119855358315

In [222]:
storytfidf = tfidf_transformer.transform(bow)

In [224]:
from sklearn.naive_bayes import MultinomialNB
classfier = MultinomialNB().fit(storytfidf, train_data['SECTION'])

In [264]:
train_p = classfier.predict(storytfidf)

In [265]:
from sklearn.metrics import classification_report

print(classification_report(train_data['SECTION'],train_p))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97      1673
           1       0.94      1.00      0.97      2731
           2       1.00      0.94      0.97      1914
           3       0.99      0.94      0.97      1233

   micro avg       0.97      0.97      0.97      7551
   macro avg       0.97      0.96      0.97      7551
weighted avg       0.97      0.97      0.97      7551



In [232]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(train_p,train_data['SECTION'])

In [239]:
cm

array([[1621,    7,   43,    5],
       [  46, 2718,   66,   67],
       [   2,    4, 1805,    0],
       [   4,    2,    0, 1161]])

In [237]:
acc = cm.diagonal().sum()/cm.sum()

In [240]:
acc

0.9674215335717123

## Predicting For Test Set

In [241]:
test_data = pd.read_excel("Data_Test.xlsx")

In [243]:
test_data.head(10)

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...
5,"""Imagine if every message you sent was kept wi..."
6,Positioned along the four sides of the Asus RO...
7,"In fact, when I applied to USC film school the..."
8,"As spotted by Android Police, Netflix is testi..."
9,Her moves were immaculately choreographed as s...


In [245]:
test_data['CLEAN_STORY'] = test_data['STORY'].apply(text_cleaner)

In [248]:
#test_data.values

In [249]:
from sklearn.pipeline import Pipeline

In [253]:
pipe = Pipeline([
    ('bow', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', MultinomialNB())
])

In [254]:
pipe.fit(train_data['CLEAN_STORY'], train_data['SECTION'])

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [256]:
test_preds = pipe.predict(test_data['CLEAN_STORY'])

In [270]:
test_preds

array(['1', '2', '1', ..., '1', '0', '1'], dtype='<U1')

In [262]:
pd.DataFrame(test_preds, columns = ['SECTION']).to_excel('News_category_soln1.xlsx', index = False)