# Preprocessing Phase

In [None]:
import nltk

# Fetch only the NLTK resources the cells in this notebook rely on (the English
# stop-word list and the WordNet data behind WordNetLemmatizer) instead of the
# multi-gigabyte 'all' collection.
# NOTE(review): 'omw-1.4' is included because some NLTK releases need it to load
# WordNet; add further resource ids here if other corpora are used elsewhere.
for resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     C:\Users\guill\AppData\Roaming\nltk_data...
[nltk_data]    | 

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string as st
import re
from nltk import PorterStemmer, WordNetLemmatizer

# Input data files are available in the read-only "./input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk ./input recursively and print the path of every file found beneath it.
for root, _subdirs, files in os.walk('./input'):
    file_paths = [os.path.join(root, name) for name in files]
    for file_path in file_paths:
        print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the data. Here it is already in .csv format.
# NOTE(review): these two paths are absolute ('/dataset/...') while the sample-solution
# file is read from the relative 'dataset/...' further down; confirm which location is intended.
train_data = pd.read_csv('/dataset/BBC News Train.csv')
test_data = pd.read_csv('/dataset/BBC News Test.csv')
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(10), test_data.head(10))

In [None]:
# Emit both shapes as one tuple; as separate statements only the last one is rendered.
train_data.shape, test_data.shape

# Text cleaning and processing steps
* Remove punctuation
* Convert text to lower-case tokens
* Remove tokens of length less than or equal to 3
* Remove stopwords by matching tokens against the NLTK English stopword list
* Apply stemming (optional — the cells below skip this step and go straight to lemmatization)
* Apply lemmatization
* Convert words to feature vectors

In [None]:
# Remove all punctuations from the text

def remove_punct(text):
    """Return `text` with every character listed in `string.punctuation` dropped.

    All other characters are kept in their original order.
    """
    kept_chars = []
    for char in text:
        if char in st.punctuation:
            continue
        kept_chars.append(char)
    return "".join(kept_chars)

In [None]:
# NOTE(review): the BBC News CSVs appear to name the article body column 'Text'
# (alongside 'ArticleId' and 'Category'); the original 'Message' key is kept as a
# fallback in case a local copy was renamed. Confirm against train_data.columns.
text_column = 'Text' if 'Text' in train_data.columns else 'Message'

train_data['removed_punc'] = train_data[text_column].apply(remove_punct)
test_data['removed_punc'] = test_data[text_column].apply(remove_punct)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

In [None]:
def tokenize(text):
    """Convert text to a list of lower-case tokens.

    The text is split on runs of whitespace. The same approach could split on
    special characters, tabs or any other separator by changing the pattern.
    Empty strings produced by leading or trailing whitespace are discarded, so
    every returned token contains at least one character.

    Parameters
    ----------
    text : str
        The text to split.

    Returns
    -------
    list of str
        Lower-cased, non-empty tokens in their original order.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    pieces = re.split(r'\s+', text)
    return [piece.lower() for piece in pieces if piece]

In [None]:
# Tokenize the punctuation-free text of both frames.
train_data['tokens'] = train_data['removed_punc'].apply(tokenize)
test_data['tokens'] = test_data['removed_punc'].apply(tokenize)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

In [None]:
# Remove tokens of length less than or equal to 3

def remove_small_words(text, min_length=4):
    """Keep only the tokens that are at least `min_length` characters long.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    min_length : int, optional
        Smallest token length that is kept. The default of 4 drops every
        token of length 3 or less.

    Returns
    -------
    list of str
        The surviving tokens in their original order.
    """
    return [token for token in text if len(token) >= min_length]

In [None]:
# Drop the short tokens from both frames.
train_data['larger_tokens'] = train_data['tokens'].apply(remove_small_words)
test_data['larger_tokens'] = test_data['tokens'].apply(remove_small_words)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

In [None]:
# Cache for the NLTK English stop-word list, filled on first use so the corpus
# file is read once rather than once per token.
_ENGLISH_STOPWORDS = None


def remove_stopwords(text, stopwords=None):
    """Remove stopwords from a list of tokens.

    By default the NLTK English stopword list is used for the match. A
    customized, user-defined collection can be passed instead to control which
    tokens are removed.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.
    stopwords : iterable of str, optional
        Words to remove. When omitted, `nltk.corpus.stopwords.words('english')`
        is used.

    Returns
    -------
    list of str
        Tokens that are not stopwords, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stopwords is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
        stopwords = _ENGLISH_STOPWORDS
    elif not isinstance(stopwords, (set, frozenset)):
        # Hash-based lookup keeps the membership test cheap for long lists.
        stopwords = frozenset(stopwords)
    return [word for word in text if word not in stopwords]

In [None]:
# Strip stopwords from the remaining tokens of both frames.
train_data['clean_tokens'] = train_data['larger_tokens'].apply(remove_stopwords)
test_data['clean_tokens'] = test_data['larger_tokens'].apply(remove_stopwords)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

### Lemmatization converts a word to its dictionary base form. This process takes the language's grammar and vocabulary into account during conversion. Hence, it differs from stemming, which merely truncates suffixes to obtain a root form.


In [None]:
# Apply lemmatization on tokens
def lemmatize(text):
    """Return the WordNet lemma of every token in `text`, as a list.

    A new `WordNetLemmatizer` is created on each call and applied to the
    tokens one by one with its default settings.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in text:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

In [None]:
# Lemmatize the cleaned tokens of both frames.
train_data['lemma_words'] = train_data['clean_tokens'].apply(lemmatize)
test_data['lemma_words'] = test_data['clean_tokens'].apply(lemmatize)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

In [None]:
# Create sentences to get clean text as input for vectors

def return_sentences(tokens):
    """Join the tokens into one string, separated by single spaces."""
    separator = " "
    return separator.join(tokens)

In [None]:
# Rebuild one cleaned text string per row from the lemmatized tokens.
train_data['clean_text'] = train_data['lemma_words'].apply(return_sentences)
test_data['clean_text'] = test_data['lemma_words'].apply(return_sentences)
# A notebook cell renders only its last expression, so show both previews explicitly.
display(train_data.head(), test_data.head())

# Model and Evaluation Phase

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
#X_train, X_test, y_train, y_test = train_test_split(data['text'], data['category'], test_size=0.2, random_state=42)

# Feed the models the cleaned text only. Handing the whole DataFrame to a
# vectorizer would make it iterate over the column names instead of the articles.
# Columns are selected rather than popped, so train_data / test_data are left
# intact and this cell can be re-run without a KeyError.
X_train = train_data["clean_text"]
X_test = test_data["clean_text"]
y_train = train_data["Category"]

# NOTE(review): assumes the sample-solution file has a 'Category' column whose rows
# are in the same order as the test file, and that it holds real labels rather than
# placeholder submission values; confirm before trusting the reported scores.
y_test = pd.read_csv("dataset/BBC News Sample Solution.csv")["Category"]

X_train.head()

In [None]:
# Alternative feature extraction, currently disabled: CountVectorizer in place of TF-IDF.
#vectorizer = CountVectorizer()
#X_train_vec = vectorizer.fit_transform(X_train)
#X_test_vec = vectorizer.transform(X_test)

# OR

# TF-IDF features: the vectorizer is fitted on the training input only and then
# reused, already fitted, to transform the test input.
# NOTE(review): the vectorizer iterates over its input and expects one document
# string per item; iterating a pandas DataFrame yields its column names, so confirm
# X_train / X_test are 1-D collections of text at this point.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
def train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test):
    """Fit `classifier`, predict on the test features and print its metrics.

    Prints, in order: the classifier's class name, the accuracy, the
    classification report and the confusion matrix, followed by a blank
    separator. Nothing is returned.

    Parameters
    ----------
    classifier : object exposing `fit(X, y)` and `predict(X)`
    X_train_vec, y_train : training features and labels passed to `fit`
    X_test_vec : test features passed to `predict`
    y_test : reference labels the predictions are scored against
    """
    classifier.fit(X_train_vec, y_train)
    predictions = classifier.predict(X_test_vec)
    score = accuracy_score(y_test, predictions)

    model_name = classifier.__class__.__name__
    print(model_name)
    print("Accuracy:", score)

    report = classification_report(y_test, predictions)
    print("Classification Report:\n", report)

    matrix = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:\n", matrix)
    print("\n")

In [None]:
# Fixed seed so the estimators that use randomness give the same scores on every run.
SEED = 42

classifiers = [
    MultinomialNB(),
    LogisticRegression(),
    LinearSVC(random_state=SEED),
    RandomForestClassifier(random_state=SEED),
    KNeighborsClassifier(),
]

# Train each model on the same features and print its evaluation report.
for classifier in classifiers:
    train_and_evaluate(classifier, X_train_vec, y_train, X_test_vec, y_test)