In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os
from os.path import isfile, join
import string
import re
from string import punctuation
import sys

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder , StandardScaler , MaxAbsScaler 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer


import tensorflow as tf
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, Input, Dropout
from keras import Sequential
from keras import metrics


# Data Acquisition

In [2]:
#______________________________________________________ DATA INGESTION___________________________________________________________________
# Download the training split with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
# Translate each integer target into its newsgroup name.
label = [dataset.target_names[i] for i in dataset.target]
# Keep only the raw text and its newsgroup name.
df = pd.DataFrame({'text': dataset.data, 'label': label})


# key_categories = ['politics','sport','religion','computer','sales','automobile','science','medicine']
# cat_dict = {
# **dict.fromkeys(['talk.politics.misc','talk.politics.guns','talk.politics.mideast'],'politics'),
# **dict.fromkeys( ['rec.sport.hockey','rec.sport.baseball'],'sport'),
# **dict.fromkeys( ['soc.religion.christian','talk.religion.misc'],'religion'),
# **dict.fromkeys(['comp.windows.x','comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','comp.graphics','comp.sys.mac.hardware'],'computer'),
# **dict.fromkeys( ['misc.forsale'],'sales'),
# **dict.fromkeys( ['rec.autos','rec.motorcycles'],'automobile'),
# **dict.fromkeys( ['sci.crypt','sci.electronics','sci.space'],'science'),
# **dict.fromkeys( ['sci.med'],'medicine') 
# }
# df['label']=df['label'].map(cat_dict)

label_encoder = LabelEncoder()
# Encode the newsgroup names in column 'label' as integer class ids.
df['target'] = label_encoder.fit_transform(df['label'])

# Shuffle the rows. The fixed seed keeps the row order (and everything derived
# from it) identical across kernel restarts.
df = df.sample(frac=1, random_state=42)
# dependent and independent variable
X = df['text']
y = df['target']

#_____________________________________________________________________________________________________________________________________________

In [3]:
# Preview the first rows of the shuffled training frame.
df.head()

Unnamed: 0,text,label,target
6611,Greetings.\n\nI am developing an application t...,comp.windows.x,5
5469,: I'd appreciate any feedback on capture/playb...,comp.windows.x,5
6395,:\n:According to an Australian documentary mad...,talk.politics.guns,16
4408,\nOr perhaps any planning at all. :-) Hiya P...,rec.motorcycles,8
2027,"\n\tUh oh...\n\tUmm, there are a number of cop...",sci.electronics,12


# Preprocessing Pipeline

In [4]:
# Holds the English stop-word set once it has been loaded (empty until first use).
_ENGLISH_STOPWORDS_CACHE = []


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if not _ENGLISH_STOPWORDS_CACHE:
        _ENGLISH_STOPWORDS_CACHE.append(frozenset(stopwords.words('english')))
    return _ENGLISH_STOPWORDS_CACHE[0]


def spacy_preprocessing(text_format):
    """Normalise one raw document into a space-separated string of lemmas.

    The text is parsed with the module-level spaCy pipeline ``nlp``. Tokens
    that are punctuation, whitespace or not purely alphabetic are dropped; the
    remaining lemmas are lower-cased and English stop words are removed.

    Parameters
    ----------
    text_format : str
        Raw document text.

    Returns
    -------
    str
        Surviving lower-cased lemmas joined by single spaces ('' if none).
    """
    # Load the stop words once instead of calling stopwords.words() for every
    # token; membership tests against a frozenset are also constant time.
    stop_words = _english_stopwords()

    lemmas = (
        token.lemma_.lower()
        for token in nlp(text_format)
        if not token.is_punct and not token.is_space and token.is_alpha
    )
    return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)
    
# Element-wise wrapper so the cleaner can be applied to a whole collection of
# documents. otypes=[object] is given explicitly so that numpy
#   * does not make an extra probing call on the first document to guess the dtype,
#   * accepts an empty input, and
#   * returns an object array of Python strings rather than a fixed-width
#     unicode array padded to the longest document.
vec_prop = np.vectorize(spacy_preprocessing, otypes=[object])

# Expose the vectorised cleaner as a scikit-learn transformer step.
pipe_spacy_preprocessing = FunctionTransformer(vec_prop)

class DenseTransformer(TransformerMixin):
    """Stateless pipeline step that turns a sparse matrix into a dense ndarray.

    It sits between the TF-IDF vectoriser (sparse output) and the steps that
    follow it in the pipeline.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; return ``self`` so the step can be chained."""
        return self

    def transform(self, X, y=None, **fit_params):
        """Return ``X`` as a dense ``numpy.ndarray``.

        Sparse inputs are densified with ``toarray()``. ``todense()`` is
        avoided because it returns ``numpy.matrix``, a class NumPy no longer
        recommends and that newer scikit-learn releases reject. Inputs that
        are already dense are passed through ``np.asarray`` instead of
        raising ``AttributeError``.
        """
        if hasattr(X, 'toarray'):
            return X.toarray()
        return np.asarray(X)
    

# Feature-extraction stages, listed in execution order:
# clean text -> TF-IDF (top 2500 terms) -> dense array -> max-abs scaling.
prep_pipeline = Pipeline(steps=[
    ('text_preprocessing', pipe_spacy_preprocessing),
    ('tfidf', TfidfVectorizer(use_idf=True, max_features=2500)),
    ('sparse_to_dense', DenseTransformer()),
    ('scaler', MaxAbsScaler()),
])

# One-hot encode the integer targets into 20 integer-typed columns.
yc = tf.keras.utils.to_categorical(y, num_classes=20, dtype=int)

In [None]:
#X_t = prep_pipeline.fit_transform(X).toarray()

# Model Pipeline

### Hyperparameters fine-tuning

In [5]:
from sklearn import set_config
# Ask scikit-learn to display estimators in its 'diagram' mode.
set_config(display='diagram')

# Bare last expression: shows the preprocessing pipeline as the cell output.
prep_pipeline

In [15]:
# Number of distinct newsgroup names in the current `label` list.
n_classes = len(set(label))


def create_model(optimizer="adam", dense_layer_sizes=False,
                 dropout=0.1, init='uniform',
                 features=2500, neurons=20,
                 n_classes=n_classes):
    """Build and compile a feed-forward softmax classifier.

    Parameters
    ----------
    optimizer : str or optimizer instance
        Passed unchanged to ``model.compile``.
    dense_layer_sizes : iterable of int, or a falsy value
        Widths of optional extra hidden layers. Each one is added as a ReLU
        ``Dense`` layer followed by ``Dropout(dropout)``. With the default
        (``False``) no extra layers are added.
    dropout : float
        Dropout rate applied after every extra hidden layer. It has no effect
        when ``dense_layer_sizes`` is falsy.
    init : str
        Kernel initializer of the first hidden layer.
    features : int
        Width of the input vectors.
    neurons : int
        Width of the first hidden layer.
    n_classes : int
        Width of the softmax output layer. The default is captured from the
        module-level ``n_classes`` when the function is defined.

    Returns
    -------
    A compiled ``Sequential`` model using categorical cross-entropy loss and
    an AUC metric.
    """
    model = Sequential()
    model.add(Dense(neurons, activation='relu', input_shape=(features,),
                    kernel_initializer=init))

    # Previously `dense_layer_sizes` and `dropout` were accepted but ignored;
    # a falsy value keeps the original single-hidden-layer network.
    for layer_size in (dense_layer_sizes or ()):
        model.add(Dense(layer_size, activation='relu'))
        model.add(Dropout(dropout))

    model.add(Dense(n_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=optimizer,
                  metrics=[tf.keras.metrics.AUC()])
    return model

# scikit-learn compatible wrapper around create_model. The batch size and
# epoch count given here are the values used when the estimator is fitted
# directly (outside the grid search).
kears_estimator = KerasClassifier(build_fn=create_model, verbose=1, batch_size=2**10, epochs=15)

# Candidate values explored by the grid search.
neurons = [10, 50, 100, 300, 500, 1000]
epochs = [5, 10, 30, 50]
batch_size = [2, 4, 10, 20, 50, 100]

#dense_layer_sizes = [[] , []]

param_grid = dict(neurons=neurons, epochs=epochs, batch_size=batch_size)

# The cell used to build `StratifiedKFold(n_splits=2, shuffle=True)` here and
# throw the instance away. It never influenced the search, so it was removed;
# `cv=5` below is the cross-validation setting that is actually used.
grid = GridSearchCV(estimator=kears_estimator,
                    verbose=1,
                    cv=5,
                    param_grid=param_grid,
                    scoring='roc_auc')

  kears_estimator = KerasClassifier(build_fn=create_model, verbose=1,batch_size=2**10,epochs = 15)


#### Grid search issue: multilabel vs. multiclass targets

In [None]:
# limit = 100
# X_p = prep_pipeline.fit_transform(X[:limit]).toarray()
# X_p.shape

In [None]:
# from sklearn.utils.multiclass import type_of_target
# type_of_target(y) , type_of_target(yc)

In [None]:
# grid_results = grid.fit(X_p,yc)
# results  =  pd.DataFrame(grid_results.cv_results_).sort_values('rank_test_score')
# results.head()

## Fit the Pipeline 

In [8]:
#model = create_model(neurons=20).fit(X_t,yc,epochs  = 15)

In [11]:
# End-to-end pipeline: the same feature-extraction stages as the preprocessing
# pipeline, followed by the Keras classifier.
model_pipeline = Pipeline(steps=[
    ('text_preprocessing', pipe_spacy_preprocessing),
    ('tfidf', TfidfVectorizer(use_idf=True, max_features=2500)),
    ('sparse_to_dense', DenseTransformer()),
    ('scaler', MaxAbsScaler()),
    ('clf', kears_estimator),
])

In [12]:
# Show the full pipeline (feature extraction steps plus the 'clf' classifier).
model_pipeline

In [17]:
import time

# Record the start time so the cell can report how long fitting takes.
t0 = time.time()
# Fit every pipeline step on the raw training texts and the one-hot targets.
fitted_pipe = model_pipeline.fit(X,yc)
# Elapsed wall-clock seconds; shown as the cell output.
time.time() - t0 





833.0853395462036

# Testing Pipeline

In [None]:
# Download the test split with headers, footers and quoted replies removed.
dataset = fetch_20newsgroups(
    subset='test',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)
# Translate each integer target into its newsgroup name.
label = [dataset.target_names[i] for i in dataset.target]
# Keep only the raw text and its newsgroup name.
df = pd.DataFrame({'text': dataset.data, 'label': label})


# key_categories = ['politics','sport','religion','computer','sales','automobile','science','medicine']
# cat_dict = {
# **dict.fromkeys(['talk.politics.misc','talk.politics.guns','talk.politics.mideast'],'politics'),
# **dict.fromkeys( ['rec.sport.hockey','rec.sport.baseball'],'sport'),
# **dict.fromkeys( ['soc.religion.christian','talk.religion.misc'],'religion'),
# **dict.fromkeys(['comp.windows.x','comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','comp.graphics','comp.sys.mac.hardware'],'computer'),
# **dict.fromkeys( ['misc.forsale'],'sales'),
# **dict.fromkeys( ['rec.autos','rec.motorcycles'],'automobile'),
# **dict.fromkeys( ['sci.crypt','sci.electronics','sci.space'],'science'),
# **dict.fromkeys( ['sci.med'],'medicine') 
# }
# df['label']=df['label'].map(cat_dict)

# Encode the test labels with the encoder that was fitted on the training
# labels, so both splits are guaranteed to share one name -> id mapping
# (a label never seen in training raises instead of being silently renumbered).
df['target'] = label_encoder.transform(df['label'])

# Shuffle the rows with a fixed seed so the order is reproducible.
df = df.sample(frac=1, random_state=42)
# dependent and independent variable
X_test = df['text']
# Use the integer ids, mirroring `y = df['target']` in the training cell;
# the string names remain available in df['label'].
y_test = df['target']
#_____________________________________________________________________________________________________________________________________________

In [None]:
# Predict on the held-out test texts. The original call passed the training
# texts `X`, so the "test" predictions were made on data the model had seen.
y_pred = fitted_pipe.predict(X_test)