## AG NEWs classification
This classification problem aims at classifyign news articles- titles and decription, into 4 categories namely:
- 1 - World
- 2- Sports
- 3- Business
- 4 - Sci/Tech
---
classification will use bidirectional LTSM.

In [52]:
# Importing libraries

import pandas as pd
import numpy as np
import tensorflow as tf
import re
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Conv1D, MaxPool1D, Dropout, Dense, GlobalMaxPool1D, Embedding, Activation

In [29]:
# Loading the data
col_names=["class", "Title", "Description"]

df_train = pd.read_csv('data/train.csv', names=col_names)
df_test = pd.read_csv('data/test.csv', names=col_names)

In [30]:
df_train.head()

Unnamed: 0,class,Title,Description
0,3,Wall St. Bears Claw Back Into the Black (Reuters),"Reuters - Short-sellers, Wall Street's dwindli..."
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...,Reuters - Private investment firm Carlyle Grou...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters),Reuters - Soaring crude prices plus worries\ab...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...,Reuters - Authorities have halted oil export\f...
4,3,"Oil prices soar to all-time record, posing new...","AFP - Tearaway world oil prices, toppling reco..."


In [31]:
df_test.head()

Unnamed: 0,class,Title,Description
0,3,Fears for T N pension after talks,Unions representing workers at Turner Newall...
1,4,The Race is On: Second Private Team Sets Launc...,"SPACE.com - TORONTO, Canada -- A second\team o..."
2,4,Ky. Company Wins Grant to Study Peptides (AP),AP - A company founded by a chemistry research...
3,4,Prediction Unit Helps Forecast Wildfires (AP),AP - It's barely dawn when Mike Fitzpatrick st...
4,4,Calif. Aims to Limit Farm-Related Smog (AP),AP - Southern California's smog-fighting agenc...


In [32]:
df_train['summary'] = df_train['Title'] + ' ' + df_train['Description']
df_test['summary'] = df_test['Title'] + ' ' + df_test['Description']

df_train = df_train.drop(columns=['Title', 'Description'])
df_test = df_test.drop(columns=['Title', 'Description'])

labels = {1:'World News', 2:'Sports News', 3:'Business News', 4:'Science-Technology News'}

df_train['label'] = df_train['class'].map(labels)
df_test['label'] = df_test['class'].map(labels)

In [33]:
df_train = df_train.drop(columns=['class'])
df_test = df_test.drop(columns=['class'])
df_train.head()

Unnamed: 0,summary,label
0,Wall St. Bears Claw Back Into the Black (Reute...,Business News
1,Carlyle Looks Toward Commercial Aerospace (Reu...,Business News
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,Business News
3,Iraq Halts Oil Exports from Main Southern Pipe...,Business News
4,"Oil prices soar to all-time record, posing new...",Business News


In [34]:
# remove punctuation

def remove_punc(text):
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

df_train['summary'] = df_train['summary'].apply(lambda x: remove_punc(x))
df_test['summary'] = df_test['summary'].apply(lambda x: remove_punc(x))

In [35]:
df_train.head(10)

Unnamed: 0,summary,label
0,Wall St Bears Claw Back Into the Black Reuters...,Business News
1,Carlyle Looks Toward Commercial Aerospace Reut...,Business News
2,Oil and Economy Cloud Stocks Outlook Reuters R...,Business News
3,Iraq Halts Oil Exports from Main Southern Pipe...,Business News
4,Oil prices soar to alltime record posing new m...,Business News
5,Stocks End Up But Near Year Lows Reuters Reute...,Business News
6,Money Funds Fell in Latest Week AP AP Assets ...,Business News
7,Fed minutes show dissent over inflation USATOD...,Business News
8,Safety Net Forbescom Forbescom After earning ...,Business News
9,Wall St Bears Claw Back Into the Black NEW YO...,Business News


In [36]:
# data cleaning and remove stopwords

def data_cleaner(text):        
    lower_case = text.lower()
    tokens=word_tokenize(lower_case)
    return (" ".join(tokens)).strip()

def remove_stopwords (text):        
    list1=[word for word in text.split() if word not in stopwords.words('english')]
    return " ".join(list1)

df_train['summary'] = df_train['summary'].apply(lambda x: data_cleaner(x))
df_test['summary'] = df_test['summary'].apply(lambda x: data_cleaner(x))

df_train['summary'] = df_train['summary'].apply(lambda x: remove_stopwords(x))
df_test['summary'] = df_test['summary'].apply(lambda x: remove_stopwords(x))

In [37]:
# split the data into train and test data

X_train, X_validation, y_train, y_validation = train_test_split(df_train['summary'], df_train['label'],
                                                                test_size=0.2, random_state=1)

In [38]:
t_d = []
for i in df_train['summary']:
    t_d.append(i.split())
print(t_d[:2])

[['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'shortsellers', 'wall', 'streets', 'dwindlingband', 'ultracynics', 'seeing', 'green'], ['carlyle', 'looks', 'toward', 'commercial', 'aerospace', 'reuters', 'reuters', 'private', 'investment', 'firm', 'carlyle', 'groupwhich', 'reputation', 'making', 'welltimed', 'occasionallycontroversial', 'plays', 'defense', 'industry', 'quietly', 'placedits', 'bets', 'another', 'part', 'market']]


In [39]:
# initiate word2vec model

w2v_model = Word2Vec(t_d, size=50, workers=32, min_count=1, window=3)
print(w2v_model)

Word2Vec(vocab=89749, size=50, alpha=0.025)


In [40]:
# tokenize the data

token = Tokenizer(89740)
token.fit_on_texts(df_train['summary'])
token_text = token.texts_to_sequences(df_train['summary'])
token_text = pad_sequences(token_text)

In [41]:
la = preprocessing.LabelEncoder()
y = la.fit_transform(df_train['label'])
y = to_categorical(y)
print(y[:5])

[[1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]


In [42]:
# spilt the data into training and testing data

X_train, X_test, y_train, y_test = train_test_split(np.array(token_text), y, test_size=0.2)


In [43]:
# build the model

keras_model = Sequential()
keras_model.add(w2v_model.wv.get_keras_embedding(True))
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(50, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(100, 3, activation='relu', padding='same', strides=1))
keras_model.add(MaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Conv1D(200, 3, activation='relu', padding='same', strides=1))
keras_model.add(GlobalMaxPool1D())
keras_model.add(Dropout(0.2))
keras_model.add(Dense(200))
keras_model.add(Activation('relu'))
keras_model.add(Dropout(0.2))
keras_model.add(Dense(4))
keras_model.add(Activation('softmax'))
keras_model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
keras_model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, None, 50)          4487450   
                                                                 
 dropout_5 (Dropout)         (None, None, 50)          0         
                                                                 
 conv1d_3 (Conv1D)           (None, None, 50)          7550      
                                                                 
 max_pooling1d_2 (MaxPooling  (None, None, 50)         0         
 1D)                                                             
                                                                 
 dropout_6 (Dropout)         (None, None, 50)          0         
                                                                 
 conv1d_4 (Conv1D)           (None, None, 100)         15100     
                                                      

In [44]:
# train the model
keras_model.fit(X_train, y_train, batch_size=256, epochs=20, validation_data=(X_test, y_test))

Epoch 1/20


2021-12-23 10:23:06.125220: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 38400000 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7fc602a8cf70>

In [45]:
labels = la.classes_
print(labels)

['Business News' 'Science-Technology News' 'Sports News' 'World News']


In [46]:
# check prediction
predicted = keras_model.predict(X_test)

In [53]:
for i in range(2,100,3):
    print(df_train['summary'].iloc[i][:25], "--")
    print("--Original category: ", labels[np.argmax(y_test[i])])
    print("--predicted category: ", labels[np.argmax(predicted[i])])

oil economy cloud stocks  --
--Original category:  Sports News
--predicted category:  Sports News
stocks end near year lows --
--Original category:  Science-Technology News
--predicted category:  Science-Technology News
safety net forbescom forb --
--Original category:  World News
--predicted category:  World News
need opec pump moreiran g --
--Original category:  Science-Technology News
--predicted category:  Science-Technology News
dollar falls broadly reco --
--Original category:  Sports News
--predicted category:  Sports News
market head toward value  --
--Original category:  World News
--predicted category:  World News
google ipo faces playboy  --
--Original category:  World News
--predicted category:  World News
rand falls shock sa rate  --
--Original category:  Science-Technology News
--predicted category:  Science-Technology News
google auction begins fri --
--Original category:  Science-Technology News
--predicted category:  Science-Technology News
chad seeks refugee aid im --