In [None]:
# general packages
import pandas as pd
import re, string
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# sklearn packages
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from skopt import BayesSearchCV
from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline as imbPipeline
from imblearn.combine import SMOTETomek

#natural language packages
from string import punctuation
import unidecode
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK resources this notebook relies on: the tokenizer models, the POS
# tagger, WordNet, and the stop-word corpus read by `stopwords.words('english')`.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('stopwords')

# Loading datasets

In [None]:
# Read the training data and the set we will generate predictions for.
df_train = pd.read_csv("../input/declutter20v2/train_set_0520.csv")
df_pred = pd.read_csv("../input/declutter20v2/test_set_0520.csv")

Now that we have the data loaded let us take a look at it and see what columns we don't really need

In [None]:
# Display the raw training frame to see which columns it contains.
df_train

I would guess that we don't need most of these columns, so I am going to drop `ID`, `path_to_file`, and `link_to_comment`, but I will make sure to save the result to a new variable just in case.

In [None]:
# Work on trimmed copies; the untouched originals remain available as df_train / df_pred.
df_train_copy = df_train.drop(columns=['ID', 'path_to_file', 'link_to_comment'])
df_pred_copy = df_pred.drop(columns=['ID', 'path_to_file', 'link_to_comment'])

# Data Cleaning & Text Pre-processing

### Make all text lower case

In [None]:
# Lower-case `type` and `comment` in BOTH frames so the prediction set goes through
# the same normalisation as the training set: the `type` mapping applied later uses
# lower-case keys, and the stop-word count feature compares tokens against
# lower-case stop words.
df_train_copy['type'] = df_train_copy['type'].str.lower()
df_train_copy['comment'] = df_train_copy['comment'].str.lower()
df_pred_copy['type'] = df_pred_copy['type'].str.lower()
df_pred_copy['comment'] = df_pred_copy['comment'].str.lower()

### Convert `type` column to a numerical categorical

In [None]:
# Frequency of each (lower-cased) comment type in the training frame.
df_train_copy['type'].value_counts()

In [None]:
# Encode the comment type as an integer code; any value not listed in the mapping becomes NaN.
df_train_copy['type'] = df_train_copy['type'].map({'line': 0, 'javadoc': 1, 'block':2})
# Check the distribution of the encoded values.
df_train_copy['type'].value_counts()

In [None]:
# Apply the same integer encoding to the prediction frame.
# NOTE(review): the training frame's `type` was lower-cased before mapping; any value here
# that is not exactly 'line', 'javadoc' or 'block' becomes NaN.
df_pred_copy['type'] = df_pred_copy['type'].map({'line': 0, 'javadoc': 1, 'block':2})

### Delete duplicates

In [None]:
# Number of rows that duplicate an earlier row across all columns.
df_train_copy.duplicated().sum()

In [None]:
# Keep only the first row for each distinct comment text.
df_train_copy = df_train_copy.drop_duplicates(subset=['comment'], keep='first')

In [None]:
# Re-count full-row duplicates now that rows were de-duplicated on `comment`.
df_train_copy.duplicated().sum()

### Convert `non-information` column of training set to numerical categorical

In [None]:
# Encode the target as 0 ('no') / 1 ('yes'); any other value becomes NaN.
df_train_copy['non-information'] = df_train_copy['non-information'].map({'no': 0, 'yes': 1})

In [None]:
# Class balance of the encoded target.
df_train_copy['non-information'].value_counts()

So it appears that the training set is a little imbalanced towards non-informative comments. SMOTE sampling is one way to deal with this, although the SMOTE step is currently commented out in the pipeline below.

### Drop any rows with null values

In [None]:
# Null count per column in the training frame.
df_train_copy.isna().sum()

In [None]:
# Null count per column in the prediction frame.
df_pred_copy.isna().sum()

Looks like `df_train_copy` has some null values, let's drop those.\
The null values in `df_pred_copy` come from the `expected` column which will later be replaced by our predictions so that isn't an issue.

In [None]:
# Discard every training row that has a null in any column.
df_train_copy = df_train_copy.dropna()

In [None]:
# Confirm that no nulls remain in the training frame.
df_train_copy.isna().sum()

No more null values in `df_train_copy`.\
Let's see what our dataframes look like now.

In [None]:
# Display the cleaned training frame.
df_train_copy

In [None]:
# Display the prediction frame after the same column drop and type encoding.
df_pred_copy

Looking good so far. The only remaining thing to do is extract some simple features out of the `comment` column, like word count, character count, etc.\
Then we need to process the `comment` column into word vectors.

In [None]:
def count_chars(text):
    """Return the number of characters in ``text``."""
    n_chars = len(text)
    return n_chars

def count_words(text):
    """Return how many whitespace-separated tokens ``text`` contains."""
    tokens = text.split()
    return len(tokens)

# count of stopwords
def count_stopwords(text):
    """Return how many tokens of ``text`` are NLTK English stop words.

    The text is tokenized with ``word_tokenize`` and every token is compared
    as-is (case-sensitively) against the stop-word list.
    """
    # Load the stop-word corpus once and cache it on the function object: this
    # helper is applied to every row, and rebuilding the set per call is wasted work.
    stop_words = getattr(count_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        count_stopwords._stop_words = stop_words
    return sum(1 for token in word_tokenize(text) if token in stop_words)

def contains_annotation(text):
    """Return 1 when ``text`` contains an ``@`` character, otherwise 0."""
    return int('@' in text)

In [None]:
# Derive the simple numeric features from the training comments.
df_train_copy['char_count'] = df_train_copy['comment'].apply(count_chars)
df_train_copy['word_count'] = df_train_copy['comment'].apply(count_words)
df_train_copy['stop_word_count'] = df_train_copy['comment'].apply(count_stopwords)
df_train_copy['contains_annotation'] = df_train_copy['comment'].apply(contains_annotation)

In [None]:
# Derive the same numeric features for the prediction comments.
df_pred_copy['char_count'] = df_pred_copy['comment'].apply(count_chars)
df_pred_copy['word_count'] = df_pred_copy['comment'].apply(count_words)
df_pred_copy['stop_word_count'] = df_pred_copy['comment'].apply(count_stopwords)
df_pred_copy['contains_annotation'] = df_pred_copy['comment'].apply(contains_annotation)

In [None]:
# Show the first rows of both frames, including the newly added feature columns.
display(df_train_copy.head(), df_pred_copy.head())

Now that we have generated some more features from the `comment` column, it's time to get our pre-processing pipeline set up.
# Set up helper functions and pipeline

In [None]:
# NLTK English stop words; tokens found in this list are dropped by LemmaTokenizer.
stop_words_nltk = stopwords.words('english')

# Penn Treebank POS tags (nouns and adjectives) to keep in the text; the adverb and
# verb tags in the trailing comment are currently excluded.
defTags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJS', 'JJR']#, 'RB', 'RBS', 'RBR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# functions to determine the type of a word
def is_noun(tag):
    """Return True when ``tag`` is one of the noun tags NN, NNS, NNP or NNPS."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True when ``tag`` is one of the verb tags VB, VBD, VBG, VBN, VBP or VBZ."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True when ``tag`` is one of the adverb tags RB, RBR or RBS."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True when ``tag`` is one of the adjective tags JJ, JJR or JJS."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

# transform tag forms
def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the matching WordNet POS constant.

    Tags that are not an adjective, noun, adverb or verb fall back to NOUN.
    """
    wn = nltk.stem.wordnet.wordnet
    # Checked in order; the first matching predicate decides the WordNet POS.
    dispatch = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for matches, wn_pos in dispatch:
        if matches(tag):
            return wn_pos
    return wn.NOUN

# lemmatizer + tokenizer (+ optional stemming) class
class LemmaTokenizer(object):
    """Callable tokenizer that turns a document into a list of lemmas.

    Calling an instance tokenizes the document, drops stop words, strips
    numbers / punctuation / one- and two-character words, keeps only tokens
    whose POS tag is listed in ``defTags`` and returns their WordNet lemmas.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()
        # we define (but do not use) a stemmer; uncomment the stemming line in __call__ to enable it
        self.stemmer = nltk.stem.SnowballStemmer('english')
        # Built once per instance rather than once per document.
        # pattern for numbers | words of length=2 | punctuations | words of length=1
        self.pattern = re.compile(r'[0-9]+|\b[\w]{2,2}\b|[%.,_`!"&?\')({~@;:#}+-]+|\b[\w]{1,1}\b')
        # frozenset gives constant-time membership tests; snapshot taken at construction time
        self.stop_words = frozenset(stop_words_nltk)

    def __call__(self, doc):
        """Return the list of lemmas extracted from the string ``doc``."""
        # tokenize the document and drop stop words
        tokens = [tok for tok in word_tokenize(doc) if tok not in self.stop_words]
        # strip the noise pattern out of every remaining token
        tokens = [self.pattern.sub('', tok) for tok in tokens]
        # get rid of anything left with length <= 1
        tokens = [tok for tok in tokens if len(tok) > 1]
        # part-of-speech tagging, keeping only the tags listed in defTags
        tagged = [(word, tag) for word, tag in nltk.pos_tag(tokens) if tag in defTags]
        # lemmatize each word with the WordNet POS matching its Penn tag
        lemmas = [self.wnl.lemmatize(word, penn_to_wn(tag)) for word, tag in tagged]
        # uncomment if you want stemming as well
        #lemmas = [self.stemmer.stem(x) for x in lemmas]
        return lemmas

In [None]:
# TF-IDF over single words, L2-normalised, tokenized by the custom LemmaTokenizer.
vec_tdidf = TfidfVectorizer(
    tokenizer=LemmaTokenizer(),
    analyzer='word',
    ngram_range=(1, 1),
    norm='l2',
)

In [None]:
# Base classifier; the grid search below overrides `n_estimators` through `clf__n_estimators`.
# NOTE(review): both `random_state=42` and `seed=2` are passed — confirm which one XGBoost
# actually uses and drop the other.
clf = XGBClassifier(random_state=42, seed=2, n_estimators=300, use_label_encoder=False, eval_metric='logloss')

In [None]:
#sm = SMOTETomek(random_state=42, n_jobs = -1)

In [None]:
class TextSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column, identified by ``key``, from the input.
    Intended for text columns: the column is returned as ``X[key]`` so a text
    vectorizer can consume it directly.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None, *parg, **kwarg):
        # stateless selector: nothing is learned from the data
        return self

    def transform(self, X):
        # single-key lookup, i.e. the bare column rather than a one-column frame
        column = X[self.key]
        return column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Pipeline step that extracts one column, identified by ``key``, from the input.
    Intended for numeric columns: the column is returned as ``X[[key]]``, i.e.
    selected with a one-element list of keys.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # stateless selector: nothing is learned from the data
        return self

    def transform(self, X):
        # list-of-keys lookup keeps the result two-dimensional for a DataFrame input
        wanted = [self.key]
        return X[wanted]

In [None]:
def print_stats(preds, target, labels, sep='-', sep_len=40, fig_size=(10,8)):
    """Print accuracy and a classification report, then plot a row-normalised
    confusion matrix as a heatmap.

    Parameters
    ----------
    preds : predicted labels.
    target : ground-truth labels.
    labels : class labels; fixes the row/column order of the confusion matrix
        and is used for the heatmap tick labels.
    sep, sep_len : separator character and how many times it is repeated.
    fig_size : (width, height) stored in the ``figure.figsize`` rc setting.
    """
    divider = sep * sep_len
    print('Accuracy = %.3f' % metrics.accuracy_score(target, preds))
    print(divider)
    print('Classification report:')
    print(metrics.classification_report(target, preds))
    print(divider)
    print('Confusion matrix')
    # Pin the label order so rows/columns always line up with the tick labels, and
    # let sklearn normalise over each true-label row (a row without samples
    # becomes 0 instead of NaN from a division by zero).
    cm = metrics.confusion_matrix(target, preds, labels=labels, normalize='true')
    sns.set(rc={'figure.figsize':fig_size})
    ax = sns.heatmap(cm,
        xticklabels=labels,
        yticklabels=labels,
        annot=True, cmap = 'YlGnBu')
    # confusion_matrix puts true labels on rows and predicted labels on columns
    ax.set(xlabel='Predicted label', ylabel='True label')
    # NOTE(review): presumably used to make the figure render right after the printed text — confirm
    plt.pause(0.05)

Now that we have all of our helpers set up, it's time to get the pipeline set up.\
First we will start with our feature selector pipelines.\
`text` is a pipeline to select textual features. In our case that is only the `comment` column.

In [None]:
# Select the `comment` column, then vectorize it with TF-IDF.
text = Pipeline(steps=[
    ('selector', TextSelector(key='comment')),
    ('vectorizer', vec_tdidf),
])

Next we will create one pipeline for each of our numerical features:
* `type`
* `begin_line`
* `char_count`
* `word_count`
* `stop_word_count`
* `contains_annotation`

In [None]:
# One single-column selector pipeline per numeric feature; each selector key must
# name the column that the pipeline is meant to provide.
types =  Pipeline([('selector', NumberSelector(key='type')),])

line_num = Pipeline([('selector', NumberSelector(key='begin_line')),])

chars = Pipeline([('selector', NumberSelector(key='char_count')),])

words = Pipeline([('selector', NumberSelector(key='word_count')),])

stop_words =  Pipeline([('selector', NumberSelector(key='stop_word_count')),])

annotation = Pipeline([('selector', NumberSelector(key='contains_annotation')),])

Here comes the tricky part. To combine all features, we use the `FeatureUnion` object. That makes sure there will not be any errors from combining text-based and number-based inputs.

In [None]:
# Concatenate the TF-IDF text features with the numeric feature columns.
feats = FeatureUnion(transformer_list=[
    ('comment', text),
    ('type', types),
    ('begin_line', line_num),
    ('char_count', chars),
    ('word_count', words),
    ('stop_word_count', stop_words),
    ('contains_annotation', annotation),
])

Now we can combine the input features and the classifier into a single pipeline.

In [None]:
# SMOTE variant, currently disabled:
#pipe = imbPipeline([('feats', feats),('smote', sm),('clf',clf)])
pipe = Pipeline(steps=[
    ('feats', feats),
    ('clf', clf),
])

Now that our pipeline is built, we can start hyperparameter tuning.
# Hyper parameter tuning
First we need to split the training data into a training set and a test set.\
This is to determine if our hyper parameter tuning is actually improving our performance.

In [None]:
# split the data into train and test
combined_features = [
    'comment', 'type', 'begin_line', 'char_count',
    'word_count', 'stop_word_count', 'contains_annotation',
]
target = 'non-information'

# Hold out 33% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df_train_copy[combined_features],
    df_train_copy[target],
    test_size=0.33,
    random_state=42,
    stratify=df_train_copy[target],
)

Next we have to define the parameters we are going to tune. This is done below.

In [None]:
# Parameter grid for the search; the `clf__` prefix routes each entry to the
# pipeline's 'clf' step. Only `eta` takes more than one candidate value.
param_space = dict(
    clf__colsample_bytree=[1],
    clf__n_estimators=[50],
    clf__subsample=[0.6],
    clf__eta=[0.01, 0.15, 0.2, 0.25, 0.3, 0.35],
)


Now it's time to do the grid search through our parameters.

In [None]:
# Exhaustive search over param_space: 5-fold CV scored on accuracy, run on all
# cores, keeping the training-fold scores as well.
grid_search = GridSearchCV(
    estimator=pipe,
    param_grid=param_space,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
    return_train_score=True,
)

In [None]:
# Run the cross-validated search on the training split.
grid_search.fit(X_train, y_train)

In [None]:
# Mean training-fold accuracy per parameter combination (recorded because return_train_score=True).
grid_search.cv_results_['mean_train_score']

In [None]:
# Mean validation-fold accuracy per parameter combination.
grid_search.cv_results_['mean_test_score']

In [None]:
# Parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Keep the best pipeline found by the search for evaluation and final predictions.
clf_test = grid_search.best_estimator_

In [None]:
# Predict on the held-out test split.
preds = clf_test.predict(X_test)

In [None]:
# print_stats expects (preds, target, labels): predictions first, ground truth second.
print_stats(preds, y_test, clf_test.classes_)

# Predicting if comments are informative or not on new data (`df_pred_copy`)

In [None]:
# Same column list as the one used for the train/test split.
combined_features = ['comment', 'type', 'begin_line', 'char_count', 'word_count', 'stop_word_count', 'contains_annotation']
X_pred = df_pred_copy[combined_features]
# Display the feature frame that will be passed to the model.
X_pred

In [None]:
# Predict the encoded `non-information` label for the new data.
new_preds = clf_test.predict(X_pred)

In [None]:
# Decode 0/1 back to 'no'/'yes' (the inverse of the mapping applied to `non-information`).
pred_series = pd.Series(new_preds).map({0: 'no', 1: 'yes'})

In [None]:
# Pair each original ID with its decoded prediction and write the submission file.
# NOTE(review): pred_series carries a default 0..n-1 index; this assumes df_pred['ID'] has
# the same index so the two columns align row by row — confirm.
d = {'ID': df_pred['ID'], 'Predicted': pred_series}
submission = pd.DataFrame(data=d)
# index=False keeps the DataFrame index out of the CSV.
submission.to_csv('submission.csv', index=False)

References:
* https://www.kaggle.com/diveki/classification-with-nlp-xgboost-and-pipelines
* https://www.analyticsvidhya.com/blog/2021/04/a-guide-to-feature-engineering-in-nlp/
    * https://github.com/mohdahmad242/Feature-Engineering-in-NLP/blob/main/Feature_engineering_NLP.ipynb
* https://github.com/scikit-learn-contrib/imbalanced-learn