<a href="https://colab.research.google.com/github/Anpuann/NLP/blob/main/BBC_news.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
##import libraries
import pandas as pd
from textblob import TextBlob       ##NLP related library
import nltk
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df=pd.read_csv('/content/BBC News Train.csv')

In [3]:
df.head(2)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from nltk.tokenize import word_tokenize, sent_tokenize

In [6]:
target_category = df['Category'].unique()

In [7]:
target_category

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [8]:
df['CategoryID']=df['Category'].factorize()[0]

In [9]:
df.head(10)

Unnamed: 0,ArticleId,Text,Category,CategoryID
0,1833,worldcom ex-boss launches defence lawyers defe...,business,0
1,154,german business confidence slides german busin...,business,0
2,1101,bbc poll indicates economic gloom citizens in ...,business,0
3,1976,lifestyle governs mobile choice faster bett...,tech,1
4,917,enron bosses in $168m payout eighteen former e...,business,0
5,1582,howard truanted to play snooker conservative...,politics,2
6,651,wales silent on grand slam talk rhys williams ...,sport,3
7,1797,french honour for director parker british film...,entertainment,4
8,2034,car giant hit by mercedes slump a slump in pro...,business,0
9,1866,fockers fuel festive film chart comedy meet th...,entertainment,4


In [10]:
category=df[["Category","CategoryID"]].drop_duplicates().sort_values('CategoryID')

In [11]:
category

Unnamed: 0,Category,CategoryID
0,business,0
3,tech,1
5,politics,2
6,sport,3
7,entertainment,4


In [12]:
df.groupby('Category').CategoryID.count()

Category
business         336
entertainment    273
politics         274
sport            346
tech             261
Name: CategoryID, dtype: int64

In [13]:
text = df["Text"]

In [14]:
text.head()

0    worldcom ex-boss launches defence lawyers defe...
1    german business confidence slides german busin...
2    bbc poll indicates economic gloom citizens in ...
3    lifestyle  governs mobile choice  faster  bett...
4    enron bosses in $168m payout eighteen former e...
Name: Text, dtype: object

In [15]:
##Extract number of words
def add_num_words(df):
    df['number_of_words'] = df['Text'].apply(lambda x : len(TextBlob(str(x)).words))
    return df

In [16]:
add_num_words(df)['number_of_words']

0       300
1       324
2       513
3       632
4       353
       ... 
1485    222
1486    557
1487    235
1488    557
1489    295
Name: number_of_words, Length: 1490, dtype: int64

In [17]:
wh_words = set(['why', 'who', 'which', 'what', 'where', 'when', 'how'])

In [18]:
from tensorflow import keras
from keras.preprocessing.text import text_to_word_sequence
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download ('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
def preprocessDataset(df_text):
       
#word tokenization using text-to-word-sequence
    df_text= str(df_text)
    tokenized_train_set = text_to_word_sequence(df_text,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',lower=True,split=" ")

#stop word removal
    stop_words = set(stopwords.words('english'))
    stopwordremove = [i for i in tokenized_train_set if not i in stop_words]

#join words into sentence
    stopwordremove_text = ' '.join(stopwordremove)

#remove numbers
    numberremove_text = ''.join(c for c in stopwordremove_text if not c.isdigit())

#Stemming
    stemmer= PorterStemmer()

    stem_input=nltk.word_tokenize(numberremove_text)
    stem_text=' '.join([stemmer.stem(word) for word in stem_input])
        
        
    lemmatizer = WordNetLemmatizer()
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

        return tag_dict.get(tag, wordnet.NOUN)

    lem_input = nltk.word_tokenize(stem_text)
    lem_text= ' '.join([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in lem_input])
        
    return lem_text


In [20]:
df['Text'] = df['Text'].apply(preprocessDataset)
text = df['Text']
category = df['Category']
text.head()

0    worldcom ex bos launch defenc lawyer defend fo...
1    german busi confid slide german busi confid fe...
2    bbc poll indic econom gloom citizen major nati...
3    lifestyl govern mobil choic faster well funkie...
4    enron bos m payout eighteen former enron direc...
Name: Text, dtype: object

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(text,category, test_size = 0.3, random_state = 60,shuffle=True, stratify=category)

print(len(X_train))
print(len(X_test))

1043
447


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [23]:
rfc = Pipeline([('tfidf', TfidfVectorizer()),
                ('rfc', RandomForestClassifier(n_estimators=100)),
               ])

rfc.fit(X_train, Y_train)

test_predict = rfc.predict(X_test)

train_accuracy = round(rfc.score(X_train,Y_train)*100)
test_accuracy =round(accuracy_score(test_predict, Y_test)*100)

print("K-Nearest Neighbour Train Accuracy Score : {}% ".format(train_accuracy ))
print("K-Nearest Neighbour Test Accuracy Score  : {}% ".format(test_accuracy ))
print()
print(classification_report(test_predict, Y_test, target_names=target_category))

K-Nearest Neighbour Train Accuracy Score : 100% 
K-Nearest Neighbour Test Accuracy Score  : 96% 

               precision    recall  f1-score   support

     business       0.98      0.93      0.96       106
         tech       0.93      1.00      0.96        76
     politics       0.93      0.93      0.93        82
        sport       1.00      0.96      0.98       108
entertainment       0.94      0.97      0.95        75

     accuracy                           0.96       447
    macro avg       0.95      0.96      0.96       447
 weighted avg       0.96      0.96      0.96       447

