# 20 Newsgroups NLP Classification

In [None]:
import pandas as pd 
import numpy as np 
from scipy import stats
import matplotlib.pyplot as plt
import os
from os.path import isfile, join
import string
import time
import re
from string import punctuation
import sys

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder , StandardScaler , MaxAbsScaler 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer
import itertools

import tensorflow as tf
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Input, Dropout
from keras import Sequential
from keras import metrics

from keras.models import load_model
import joblib

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map with pyplot.

    Args:
        cm: 2-D array of counts; rows are true labels, columns predictions.
        classes: Tick labels, used on both axes.
        normalize: When true, each row is divided by its row sum and the
            cells are printed with two decimals instead of as integers.
        title: Title placed above the plot.
        cmap: Matplotlib colour map passed to ``imshow``.
    """
    if not normalize:
        print('Confusion matrix, without normalization')
    else:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=90)
    plt.yticks(positions, classes)

    cell_format = '.2f' if normalize else 'd'
    # Cells above half of the maximum get white text so they stay readable.
    half_max = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, cell_format),
                     horizontalalignment="center",
                     color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

def timing(f):
    """Decorator that prints how long each call to ``f`` takes, in milliseconds.

    The wrapper forwards all positional and keyword arguments and returns
    the wrapped function's result unchanged.

    Args:
        f: Any callable.

    Returns:
        A wrapper that keeps ``f``'s name and docstring when it has them.
    """
    import functools  # local import: the file does not import functools at the top

    # Callables such as functools.partial have no __name__; fall back to repr.
    label = getattr(f, '__name__', repr(f))

    @functools.wraps(f)
    def wrap(*args, **kwargs):
        # perf_counter is monotonic, so clock adjustments cannot skew the result.
        start = time.perf_counter()
        ret = f(*args, **kwargs)
        elapsed_ms = (time.perf_counter() - start) * 1000.0
        print('{:s} function took {:.3f} ms'.format(label, elapsed_ms))

        return ret
    return wrap

def create_model(optimizer="adam",
                 dense_layer_sizes = False,
                 dropout=0.1, init='uniform',
                 features=3000,neurons=20,
                 n_classes = 8 ):
    """Build and compile a one-hidden-layer softmax classifier.

    Args:
        optimizer: Optimizer passed to ``compile``.
        dense_layer_sizes: Accepted for interface compatibility; not used.
        dropout: Dropout rate applied after the hidden layer.
        init: Kernel initializer of the hidden layer.
        features: Width of the input vectors.
        neurons: Number of units in the hidden layer.
        n_classes: Number of output units (one probability per class).

    Returns:
        The compiled ``Sequential`` model.
    """
    layer_stack = (
        Dense(neurons, activation='relu', input_shape=(features,), kernel_initializer=init),
        Dropout(dropout),
        # Softmax output: one probability per class.
        Dense(n_classes, activation='softmax'),
    )

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    # The sparse loss takes integer class ids as targets, not one-hot rows.
    network.compile(loss='sparse_categorical_crossentropy',
                    optimizer=optimizer,
                    metrics=['accuracy'])
    return network

def downsample(df, label_col='label', random_state=None):
    """Balance a frame by sampling every class down to the rarest class size.

    Args:
        df: DataFrame holding a class column.
        label_col: Name of the class column (defaults to ``'label'``).
        random_state: Optional seed forwarded to every ``sample`` call so
            the result can be reproduced; ``None`` keeps it random.

    Returns:
        A new, shuffled DataFrame with a fresh 0..n-1 index in which each
        class present in ``label_col`` has the same number of rows. Rows
        whose label is missing are left out, because ``value_counts``
        ignores them. An empty input yields an empty copy.
    """
    counts = df[label_col].value_counts()
    if counts.empty:
        return df.copy().reset_index(drop=True)

    # value_counts sorts descending, so the last entry is the rarest class.
    # .iloc is required: plain [-1] is a label lookup on integer indexes.
    minority_frequency = int(counts.iloc[-1])

    sampled_parts = [
        df.loc[df[label_col] == label].sample(n=minority_frequency,
                                              random_state=random_state)
        for label in counts.index
    ]
    df_balanced = pd.concat(sampled_parts, axis=0, ignore_index=True)

    # Shuffle so the classes are not laid out in consecutive runs.
    return df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

class DenseTransformer(TransformerMixin):
    """Pipeline step that turns a sparse matrix into a dense NumPy array."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        ``toarray()`` is used instead of ``todense()`` because the latter
        returns ``numpy.matrix``, which recent scikit-learn releases reject.
        Input without a ``toarray`` method is passed through ``np.asarray``.
        """
        if hasattr(X, "toarray"):
            return X.toarray()
        return np.asarray(X)
    
# Cache for the NLTK English stop words; filled on first use so the corpus
# file is read once instead of once per token.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def spacy_preprocessing(text_format):
    """Reduce a raw text to a space-separated string of content lemmas.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    Punctuation, whitespace and non-alphabetic tokens are dropped, as are
    tokens whose stripped, lower-cased lemma is an NLTK English stop word.

    Args:
        text_format: The raw text of one document.

    Returns:
        The surviving lemmas joined by single spaces (``''`` if none survive).
    """
    stop_words = _english_stopwords()
    doc = nlp(text_format)
    lemmas = [
        token.lemma_
        for token in doc
        if not token.is_punct and not token.is_space and token.is_alpha
        and token.lemma_.strip().lower() not in stop_words
    ]
    return ' '.join(lemmas)

def word_freq(label):
    """Count how often each token text occurs in one category's vocabulary string.

    Reads the module-level ``grouped_vocabolary`` mapping (category -> text)
    and tokenises the entry with the module-level spaCy pipeline ``nlp``.

    Args:
        label: Key of the category in ``grouped_vocabolary``.

    Returns:
        A Series indexed by token text holding occurrence counts, most
        frequent first.
    """
    doc = nlp(grouped_vocabolary[label])
    # Count the token *text*: spaCy Token objects differ per position, so
    # counting the objects themselves would give every entry a count of 1.
    return pd.Series([token.text for token in doc], dtype=object).value_counts()

# Element-wise version of spacy_preprocessing. Declaring the output type keeps
# NumPy from calling the function an extra time on the first text to infer it,
# and lets the transformer accept empty input.
vec_prop = np.vectorize(spacy_preprocessing, otypes=[str])
pipe_spacy_preprocessing = FunctionTransformer(vec_prop)
# Single-step pipeline: raw texts in, cleaned lemma strings out.
prep_pipeline = Pipeline([
    ('text_preprocessing', pipe_spacy_preprocessing),
])


def make_custom_predictions(fitted_pipe):
    """Ask for a text on stdin and return the label predicted for it.

    The text goes through the module-level ``pipe_spacy_preprocessing``
    transformer, then through ``fitted_pipe.predict``; the predicted ids
    are mapped back to names with the module-level ``label_encoder``.

    Args:
        fitted_pipe: Fitted estimator exposing ``predict``.

    Returns:
        The result of ``label_encoder.inverse_transform`` on the prediction.
    """
    raw_text = input('Input-Text:')
    cleaned = pipe_spacy_preprocessing.transform(pd.Series(raw_text))
    predicted_ids = fitted_pipe.predict(cleaned)
    return label_encoder.inverse_transform(predicted_ids)


from wordcloud import WordCloud, ImageColorGenerator
from collections import Counter

In [None]:
#______________________________________________________ DATA INGESTION ______________________________________________________
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
df = pd.DataFrame({'text': dataset.data})
# Replace the integer target of every post with its newsgroup name.
df['label'] = [dataset.target_names[i] for i in dataset.target]

#++++++++++++++++++++++++++++++++++++++++ Macro categories ++++++++++++++++++++++++++++++++++++++++
key_categories = ['politics','sport','religion','computer','sales','automobile','science','medicine']
cat_dict = {
    **dict.fromkeys(['talk.politics.misc','talk.politics.guns','talk.politics.mideast'],'politics'),
    **dict.fromkeys(['rec.sport.hockey','rec.sport.baseball'],'sport'),
    **dict.fromkeys(['soc.religion.christian','talk.religion.misc'],'religion'),
    **dict.fromkeys(['comp.windows.x','comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','comp.graphics','comp.sys.mac.hardware'],'computer'),
    **dict.fromkeys(['misc.forsale'],'sales'),
    **dict.fromkeys(['rec.autos','rec.motorcycles'],'automobile'),
    **dict.fromkeys(['sci.crypt','sci.electronics','sci.space'],'science'),
    **dict.fromkeys(['sci.med'],'medicine'),
}
# Newsgroups missing from cat_dict map to NaN here.
df['label'] = df['label'].map(cat_dict)

# Drop the unmapped rows BEFORE encoding, so the encoder only ever sees the
# eight macro categories and never a NaN label.
df = df.dropna().copy()

# Encode the macro categories as integer class ids.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['label'])

# Shuffle (seeded so the notebook is reproducible).
df = df.sample(frac=1, random_state=42)

# Independent and dependent variable.
X = df['text']
y = df['target']
#____________________________________________________________________________________________________________________________
df['label'].value_counts()

In [None]:
# Sanity check: print the frame's dimensions and show its first rows.
print(df.shape)
df.head()

# Data Exploration

In [None]:
# Downsample so every category has the same number of rows; this keeps the
# exploration balanced and faster.
df_balanced = downsample(df)

# Work on a copy that gets the token and text-statistics columns.
df_s = df_balanced.copy()

#+++++++++++++++++++++++ Create tokens and clean them +++++++++++++++++++++++
# Load the stop words once: calling stopwords.words('english') inside the
# comprehension re-reads the list and scans it linearly for every token.
_eda_stop_words = frozenset(stopwords.words('english'))

df_s['tokens'] = df_s['text'].apply(
    lambda text: [
        token for token in nlp(text)
        if not token.is_punct and not token.is_space and token.is_alpha
        and token.lemma_.strip().lower() not in _eda_stop_words
    ]
)
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

df_s['label'].value_counts()

In [None]:
# Stripped lemma of every kept token.
df_s['lemma_'] = df_s['tokens'].apply(lambda tokens: [token.lemma_.strip() for token in tokens])

# Number of lemmas in the text.
df_s['len_lemma_text'] = df_s['lemma_'].apply(len)
# Distinct lemmas of the text.
df_s['lemma_unique'] = df_s['lemma_'].apply(lambda lemmas: list(set(lemmas)))
# Number of distinct lemmas.
df_s['len_lemma_unique_text'] = df_s['lemma_unique'].apply(lambda lemmas: len(set(lemmas)))
# Share of distinct lemmas among all lemmas: the lower the value, the more
# the text repeats its words.
df_s['richness_text'] = df_s['len_lemma_unique_text'].div(df_s['len_lemma_text'])

# Remove texts that ended up with no lemma after tokenisation.
has_lemmas = df_s['len_lemma_text'] != 0
df_s = df_s.loc[has_lemmas, :]

# Removing rows may have unbalanced the classes, so balance again.
df_s = downsample(df_s)

In [None]:
# One histogram of text richness per category, titled with its skewness.
fig = plt.figure(figsize=(15,15))
fig.suptitle('Distribution of Text Richness per category',fontsize=25)
for position, category in enumerate(df_s['label'].unique(), start=1):
    ax = fig.add_subplot(3, 4, position)
    richness = df_s.loc[df_s['label'] == category, 'richness_text']
    richness.hist(ax=ax)
    skewness = round(stats.skew(richness), 3)
    ax.set_title(f'{category.upper()} skewness: {skewness} ')

In [None]:
from itertools import chain
# Per category: the list of per-document unique-lemma lists ...
group_class_docs = df_s.groupby('label')['lemma_unique'].apply(list)
# ... flattened into one list of lemmas per category (duplicates across
# documents are kept).
group_class_uniqe_texts = group_class_docs.apply(lambda docs: list(chain.from_iterable(docs)))

In [None]:
# Number of distinct lemmas per category. Each category's lemma list is
# de-duplicated once; the earlier form rebuilt the de-duplicated Series for
# all categories on every loop iteration.
vocabolary_len_per_category = pd.Series({
    cat: len(set(group_class_uniqe_texts[cat]))
    for cat in df_s.label.unique()
})
vocabolary_len_per_category.sort_values().plot(kind = 'barh')

In [None]:
# Size of the smallest per-category vocabulary.
min(vocabolary_len_per_category)

In [None]:
# One space-separated string of lemmas per category.
grouped_vocabolary = group_class_uniqe_texts.apply(lambda lemmas: ' '.join(lemmas))
# Number of entries word_freq reports for each category.
vocabulary_len_category = {
    category: len(word_freq(category).index)
    for category in grouped_vocabolary.index
}

# Word cloud per category on a 4 x 2 grid.
fig = plt.figure(figsize=(10,10))
for position, category in enumerate(grouped_vocabolary.index, start=1):
    ax = fig.add_subplot(4, 2, position)
    cloud = WordCloud(max_words=3000, background_color="white")
    wordcloud = cloud.generate(str(grouped_vocabolary[category]))
    ax.set_title(category.upper())
    ax.axis('off')
    ax.imshow(wordcloud, interpolation="bilinear")
plt.show()

In [None]:
# Spot check: show the tokens kept for the row labelled 0 in df_s, i.e. the
# alphabetic, non-punctuation, non-space tokens whose lemma is not an
# English stop word.
[ token for token in nlp(df_s.loc[0,'text']) if ( not token.is_punct and \
                                          not token.is_space  and \
                                          token.is_alpha and \
                                          str.lower(str.strip(token.lemma_)) not in stopwords.words('english')  ) ] 

# Preprocessing Pipeline 

In [None]:
# Element-wise version of spacy_preprocessing. Declaring the output type keeps
# NumPy from calling the function an extra time on the first text to infer it,
# and lets the transformer accept empty input.
vec_prop = np.vectorize(spacy_preprocessing, otypes=[str])
pipe_spacy_preprocessing = FunctionTransformer(vec_prop)

# Single-step pipeline: raw texts in, cleaned lemma strings out.
prep_pipeline = Pipeline([
    ('text_preprocessing', pipe_spacy_preprocessing),
])

# Labels in one-hot format (one column per class).
yc = tf.keras.utils.to_categorical(y,num_classes = 8,dtype=int )

In [None]:
#++++++++++++++++++++++++++++ Preprocess the balanced, non-empty texts ++++++++++++++++++++++++++++++++++++
# From here on X and y refer to the balanced frame df_s.
X = df_s['text']
y = df_s['target']

# Run every text through the spaCy cleaning pipeline.
X_t = prep_pipeline.fit_transform(X)
#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

# Model Pipeline - TF-IDF Approach

In [None]:
#++++++++++++++++++++++++++++++++++++++++++ Pipeline definition and params +++++++++++++++++++++++++++++++++

# Maximum number of TF-IDF columns; this is also the network's input width.
# It may approximate the smallest per-category vocabulary size.
matrix_features  = 6000

n_classes = y.nunique()

param_grid = {
    'tfidf__ngram_range': [(1,1)],
    'kc__epochs': [20,30,50],
    'kc__neurons': [10, 20, 30, 100],
    'kc__batch_size':[16, 32,50],
    'kc__dropout': [ 0.3, 0.1, 0]
}


model_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(use_idf = True,max_features=matrix_features)),
    ('sparse_to_dense',DenseTransformer()),
    ('scaler', MaxAbsScaler()),
    # Pass the input width and class count explicitly: create_model's own
    # defaults (features=3000) do not match the 6000-column TF-IDF matrix.
    # NOTE(review): if a training fold has fewer than matrix_features terms
    # the matrix is narrower than this input layer - confirm on real data.
    ('kc' ,KerasClassifier(build_fn=create_model,
                           features=matrix_features,
                           n_classes=n_classes,
                           verbose = 0))
])

folds = 3
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
# Hand over the splitter itself; skf.split(...) is a generator that is used
# up after one fit, so re-running the fit cell would find no folds.
grid = GridSearchCV(estimator=model_pipeline,
                    verbose=1,
                    cv=skf,
                    param_grid=param_grid,
                    scoring='accuracy')


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
from sklearn import set_config
set_config(display='diagram')
grid

## Hyperparameter Tuning and Cross-Validation

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


#++++++++++++++++++++++++++++++ RUN GRID ++++++++++++++++++++++++++++++++++++
t0 = time.time()
# The search is fitted on the texts already cleaned by the spaCy pipeline.
grid_fitted = grid.fit(X_t,y)
results  =  pd.DataFrame(grid_fitted.cv_results_).sort_values('rank_test_score')

t1 = time.time()
delta = t1-t0
print(f'Tuning Time s: {round(delta,3)}')
display( results.head() )


#+++++++++++++++++++++++ BEST PIPE PARAMS ++++++++++++++++++++++++++++++++
# With the default refit=True the search has already retrained the best
# pipeline on all of (X_t, y); fitting it again would only repeat that work.
opt_pipeline  = grid_fitted.best_estimator_
fitted_pipe = opt_pipeline

# Seconds the search spent refitting the best pipeline on the full data.
grid_fitted.refit_time_

In [None]:
# Persist the pipeline that was just fitted. `pipeline` is only assigned
# further down, when the saved files are loaded back, so it cannot be the
# object to save here.
# NOTE(review): save_keras_pipe is not defined in the visible source -
# confirm where it comes from before running this cell.
save_keras_pipe(fitted_pipe,name_model = f'keras_model_{matrix_features}.h5', name_pipe =f'sklearn_pipeline_{matrix_features}.pkl' )

# Model Pipeline - Word Embedding Approach

In [None]:
import tensorflow_hub as hub
# Load the sentence-embedding model from TensorFlow Hub.
# Troubleshooting for "SavedModel file does not exist" load errors:
# https://stackoverflow.com/questions/62464152/universal-sentence-encoder-load-error-error-savedmodel-file-does-not-exist-at
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")


# Input width of the network for this approach.
# NOTE(review): presumably the width of the encoder's output vectors - confirm.
matrix_features  = 512
# Set before the definition below, whose default argument reads it at
# definition time.
n_classes =y.nunique()


def create_model(optimizer="adam",
                 dense_layer_sizes = False,
                 dropout=0.1, init='uniform',
                 features=matrix_features,neurons=20,
                 n_classes = n_classes ):
    """Build and compile a one-hidden-layer softmax classifier.

    Same architecture as the earlier ``create_model``; only the defaults
    for ``features`` and ``n_classes`` differ (taken from the module-level
    values at definition time).

    Args:
        optimizer: Optimizer passed to ``compile``.
        dense_layer_sizes: Accepted for interface compatibility; not used.
        dropout: Dropout rate applied after the hidden layer.
        init: Kernel initializer of the hidden layer.
        features: Width of the input vectors.
        neurons: Number of units in the hidden layer.
        n_classes: Number of output units (one probability per class).

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(features,),kernel_initializer=init))
    model.add(Dropout(dropout))

    model.add(Dense(n_classes, activation='softmax'))
    # The sparse loss takes integer class ids as targets, not one-hot rows.
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    return model


In [None]:
class Embedding(TransformerMixin):
    """Pipeline step that embeds texts with the module-level ``embed`` model."""

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Call ``embed`` on ``X`` and return the result as a NumPy array."""
        sentence_vectors = embed(X)
        return sentence_vectors.numpy()
    
# Embedding pipeline: sentence vectors straight into the Keras classifier.
emb_pipeline = Pipeline([
    ('embed', Embedding()),
    ('kc' ,KerasClassifier(build_fn=create_model, verbose = 0))
])

#++++++++++++++++++++++++++++++ GRID ++++++++++++++++++++++++++++++++++++++
param_grid = {
    'kc__epochs': [20,30,50],
    'kc__neurons': [10, 20, 30, 100],
    'kc__batch_size':[16, 32,50],
    'kc__dropout': [ 0.3, 0.1, 0]
}

folds = 3
skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
# Hand over the splitter itself; skf.split(...) is a generator that is used
# up after one fit, so re-running the fit cell would find no folds.
grid = GridSearchCV(estimator=emb_pipeline,
                    verbose=1,
                    cv=skf,
                    param_grid=param_grid,
                    scoring='accuracy')

In [None]:
import warnings
# Silence FutureWarning messages during the search.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Time the whole grid search.
t0 = time.time()
grid_fitted = grid.fit(X_t,y) # The search is fitted on the texts already cleaned by the spaCy pipeline
# One row per parameter combination, best-ranked first.
results  =  pd.DataFrame(grid_fitted.cv_results_).sort_values('rank_test_score')

t1 = time.time()
delta = t1-t0
print(f'Tuning Time s: {round(delta,3)}')
display( results.head() )

In [None]:
#++++++++++++++++++++++++++++++++++++++++++++++ Fit the best model for embedding ++++++++++++++++++++++++++++++++
# The hyper-parameters below are hard-coded rather than read from grid_fitted.
emb_class = Embedding().fit(X_t)
# NOTE: keras_model_emb holds whatever .fit(...) returns, not the model built
# by create_model; the evaluation cell reaches the network through
# keras_model_emb.model, so keep both in sync if this is changed.
keras_model_emb = create_model(features=512, neurons=30, n_classes=8, dropout=0.3).fit(emb_class.transform(X_t),
                                                                                       y,
                                                                                       batch_size = 30,
                                                                                       epochs = 50 ,
                                                                                       verbose= 0  )

# Testing 

In [None]:
dataset = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
df = pd.DataFrame({'text': dataset.data})
# Replace the integer target of every post with its newsgroup name.
df['label'] = [dataset.target_names[i] for i in dataset.target]

#++++++++++++++++++++++++++++++++++++++++ Macro categories ++++++++++++++++++++++++++++++++++++++++
key_categories = ['politics','sport','religion','computer','sales','automobile','science','medicine']
cat_dict = {
    **dict.fromkeys(['talk.politics.misc','talk.politics.guns','talk.politics.mideast'],'politics'),
    **dict.fromkeys(['rec.sport.hockey','rec.sport.baseball'],'sport'),
    **dict.fromkeys(['soc.religion.christian','talk.religion.misc'],'religion'),
    **dict.fromkeys(['comp.windows.x','comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','comp.graphics','comp.sys.mac.hardware'],'computer'),
    **dict.fromkeys(['misc.forsale'],'sales'),
    **dict.fromkeys(['rec.autos','rec.motorcycles'],'automobile'),
    **dict.fromkeys(['sci.crypt','sci.electronics','sci.space'],'science'),
    **dict.fromkeys(['sci.med'],'medicine'),
}
# Newsgroups missing from cat_dict map to NaN here.
df['label'] = df['label'].map(cat_dict)

# Keep only rows that belong to a macro category, BEFORE encoding, so the
# encoder never sees a NaN label.
df = df.loc[df['label'].isin(key_categories)].copy()

# Encode the macro categories as integer class ids. The encoder is fitted
# again here; LabelEncoder orders classes by sorted label value.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['label'])

#++++++++++++++++++++++++ Shuffle the whole test set (frac = 1 keeps every row) ++++++++++++++++++++++++
df = df.sample(frac=1, random_state=42)
# Independent and dependent variable.
X_test = df['text']
y_test = df['target']
#____________________________________________________________________________________________________________________________

### Load Models and Predict

In [None]:
# Clean the test texts with the same spaCy preprocessing pipeline used for training.
X_test_prep = prep_pipeline.transform(X_test)

In [None]:
# Load the scikit-learn pipeline first.
# NOTE: joblib.load unpickles the file - only load files you created yourself.
chosen_pipe = 'sklearn_pipeline_6000.pkl'
pipeline = joblib.load(chosen_pipe)
# Then load the Keras model and attach it to the pipeline's classifier step.
chosen_model = 'keras_model_6000.h5'
pipeline.named_steps['kc'].model = load_model(chosen_model)

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

y_pred  = pipeline.predict(X_test_prep)

# Class ids present in truth or prediction, in ascending order, and their
# names in that SAME order. df['label'].unique() is in order of appearance
# in the shuffled frame, so using it would attach wrong names to the report
# rows and matrix ticks.
class_ids = np.union1d(y_test, np.ravel(y_pred))
label_names = label_encoder.inverse_transform(class_ids)

cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot the confusion matrix (raw counts, not normalized).
fig = plt.figure()
fig.set_size_inches(14, 12, forward=True)

print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_names))
plot_confusion_matrix(cnf_matrix, classes=np.asarray(label_names), normalize=False,
                      title='Confusion matrix')

In [None]:
# Predicted class = index of the largest output of the embedding network.
y_pred = np.argmax( keras_model_emb.model.predict( emb_class.transform(X_test_prep) )  , axis=-1 )

# Class ids present in truth or prediction, in ascending order, and their
# names in that SAME order. df['label'].unique() is in order of appearance
# in the shuffled frame, so using it would attach wrong names to the report
# rows and matrix ticks.
class_ids = np.union1d(y_test, np.ravel(y_pred))
label_names = label_encoder.inverse_transform(class_ids)

cnf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot the confusion matrix (raw counts, not normalized).
fig = plt.figure()
fig.set_size_inches(14, 12, forward=True)

print(classification_report(y_test, y_pred, labels=class_ids, target_names=label_names))
plot_confusion_matrix(cnf_matrix, classes=np.asarray(label_names), normalize=False,
                      title='Confusion matrix')