In [None]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os
from os.path import isfile, join
import string
import re
from string import punctuation
import sys

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer

In [None]:
# Location of the raw 20-newsgroups dump: one sub-folder per newsgroup.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
root_w ='C:/Users/david/Desktop/'
news_folder = '20_newsgroup/'
my_path  = root_w+news_folder

# Newsgroup folder names. os.listdir returns entries in arbitrary order, so they
# are sorted to keep the seeded split below reproducible across machines.
# Non-directory entries are skipped because listing them would raise.
folders = sorted(f for f in os.listdir(my_path) if os.path.isdir(join(my_path, f)))

# files[k] holds the sorted file names found inside folders[k].
files = [sorted(os.listdir(join(my_path, folder_name))) for folder_name in folders]

# Build every document path together with its target (the folder name) in a
# single pass, so the two lists are aligned by construction.
pathname_list = []
Y = []
for folder_name, file_names in zip(folders, files):
    for file_name in file_names:
        pathname_list.append(join(my_path, folder_name, file_name))
        Y.append(folder_name)

# 80/20 train/test split of the document paths and their targets.
doc_train, doc_test, Y_train, Y_test = train_test_split(pathname_list, Y, random_state=0, test_size=0.2)

In [None]:
# Training subset of 20 newsgroups, with headers, footers and quotes removed.
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)

# Newsgroup name of every document (dataset.target holds indices into target_names).
label = [dataset.target_names[target_id] for target_id in dataset.target]

# One row per document: the raw text and its newsgroup name.
df = pd.DataFrame({'text': dataset.data, 'label': label})


# Coarse topic -> newsgroups that are collapsed into it.
topic_to_groups = {
    'politics': ['talk.politics.misc', 'talk.politics.guns', 'talk.politics.mideast'],
    'sport': ['rec.sport.hockey', 'rec.sport.baseball'],
    'religion': ['soc.religion.christian', 'talk.religion.misc'],
    'computer': ['comp.windows.x', 'comp.sys.ibm.pc.hardware', 'comp.os.ms-windows.misc',
                 'comp.graphics', 'comp.sys.mac.hardware'],
    'sales': ['misc.forsale'],
    'automobile': ['rec.autos', 'rec.motorcycles'],
    'science': ['sci.crypt', 'sci.electronics', 'sci.space'],
    'medicine': ['sci.med'],
}

# Names of the coarse topics, in declaration order.
key_categories = list(topic_to_groups)

# Flat lookup: newsgroup name -> coarse topic.
cat_dict = {
    group: topic
    for topic, groups in topic_to_groups.items()
    for group in groups
}

# Collapse each newsgroup name into its coarse category. A newsgroup that
# cat_dict does not list (e.g. 'alt.atheism') comes back from Series.map as NaN.
df['label']=df['label'].map(cat_dict)

# NOTE(review): rows without a category are dropped so that later steps never
# receive a NaN label; add the newsgroup to cat_dict instead if its documents
# should be kept.
unmapped = df[df['label'].isna()]
print('Records without a category: ',len(unmapped))
df.drop(unmapped.index,inplace=True)



# Whitespace-delimited word count of every raw document.
df['Number_of_words'] = df['text'].map(lambda raw: len(str(raw).split()))

# Rows whose text contains no words at all.
has_no_words = df['Number_of_words'].eq(0)
no_text = df.loc[has_no_words]
print('No text records: ',len(no_text))

# Remove those rows from df in place.
df.drop(index=no_text.index, inplace=True)

def clean_text(text):
    '''Normalise a raw document before tokenisation.

    Steps, in order: lowercase; remove text in square brackets; remove links
    (http(s)://... and www....); remove <...> markup; remove ASCII punctuation;
    replace newlines with spaces; remove every word that contains a digit.

    Parameters
    ----------
    text : str
        Raw document.

    Returns
    -------
    str
        Cleaned text. Whitespace is not collapsed, so removed pieces can leave
        runs of spaces behind.
    '''
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # A newline separates words just like a space does; deleting it outright
    # would glue the last word of a line to the first word of the next line.
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Normalised version of every raw document.
df['cleaned_text'] = df['text'].apply(clean_text)

# Split the cleaned text into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
df['tokens'] = df['cleaned_text'].apply(tokenizer.tokenize)


# Corpus-specific noise words removed in addition to NLTK's English stop words.
_CUSTOM_STOPWORDS = (
    'subject:','from:', 'date:', 'newsgroups:', 'message-id:', 'lines:', 'path:', 'organization:',
    'would', 'writes:', 'references:', 'article', 'sender:', 'nntp-posting-host:', 'people',
    'university', 'think', 'xref:', 'cantaloupe.srv.cs.cmu.edu', 'could', 'distribution:', 'first',
    'anyone','world', 'really', 'since', 'right', 'believe', 'still',
    "max>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'ax>'",
)

# Combined stop-word set; built lazily on the first remove_stopwords call so
# that defining the function does not require the NLTK corpus yet.
_STOPWORD_SET = None


def remove_stopwords(text):
    '''Drop stop words from a list of tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document.

    Returns
    -------
    list of str
        The tokens, in their original order, that are neither in NLTK's English
        stop-word list nor in the custom list above. Matching is exact
        (case-sensitive).
    '''
    global _STOPWORD_SET
    if _STOPWORD_SET is None:
        # Build the set once: repeated calls reuse it and membership tests are
        # constant-time instead of a scan over a freshly built list.
        _STOPWORD_SET = frozenset(stopwords.words('english')).union(_CUSTOM_STOPWORDS)
    return [w for w in text if w not in _STOPWORD_SET]

# Tokens with stop words filtered out.
df['stopwordremove_tokens'] = df['tokens'].apply(remove_stopwords)

# Shared WordNet lemmatizer used by lem_word.
lem = WordNetLemmatizer()

def lem_word(x):
    '''Return a new list holding the WordNet lemma of every token in ``x``.'''
    return list(map(lem.lemmatize, x))

# Lemmatised tokens of every document.
df['lemmatized_text'] = df['stopwordremove_tokens'].apply(lem_word)


def combine_text(list_of_text):
    '''Join the given strings into a single string, separated by one space.'''
    return ' '.join(list_of_text)

# Re-assemble the lemmatised tokens into one string per document.
df['final_text'] = df['lemmatized_text'].apply(combine_text)

# Encode the category names in column 'label' as integer class ids.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['label'])

# Independent variable (preprocessed text) and dependent variable (class id).
X, y = df['final_text'], df['target']

# TF-IDF features over unigrams and bigrams, restricted by min_df / max_df.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2))
tfidf = tfidf_vectorizer.fit_transform(X)

In [None]:
def text_preprocessing(text, min_len=4, max_len=9):
    '''Tokenise, length-filter and stop-word-filter a single document.

    Parameters
    ----------
    text : str
        Raw document.
    min_len, max_len : int
        Inclusive bounds on token length; shorter or longer tokens are dropped.
        The defaults keep tokens of 4 to 9 characters.

    Returns
    -------
    str
        The surviving tokens, lowercased and joined by single spaces.
    '''
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    # Lowercase first: remove_stopwords matches case-sensitively against
    # lowercase entries, so "This" or "Would" would otherwise slip through.
    tokens = word_tokenizer.tokenize(text.lower())
    tokens = [token for token in tokens if min_len <= len(token) <= max_len]
    return combine_text(remove_stopwords(tokens))


def spacy_preprocessing(text_format):
    '''Lemmatise one document with spaCy and drop non-alphabetic and stop-word lemmas.

    Parameters
    ----------
    text_format : str or object exposing ``.item()``
        A plain string is parsed directly; for anything else ``.item()`` is
        called to obtain the string (presumably a one-element pandas object --
        TODO confirm against callers).

    Returns
    -------
    str
        Space-joined lemmas of the tokens that are alphabetic, not punctuation,
        not whitespace and not NLTK English stop words.
    '''
    raw_text = text_format if isinstance(text_format, str) else text_format.item()
    doc = nlp(raw_text)

    # Build the stop-word set once per call instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    lemmas = [token.lemma_ for token in doc
              if not token.is_punct and not token.is_space and token.is_alpha]
    return combine_text([lemma for lemma in lemmas if lemma not in english_stopwords])

# NOTE(review): FunctionTransformer hands its whole input to the wrapped
# function in one call, while spacy_preprocessing handles a single document --
# confirm this wrapper is only used with single-document input.
pipe_spacy_preprocessing = FunctionTransformer(spacy_preprocessing)

In [None]:
# Draw one random raw document and run the NLTK-based preprocessing on it.
a = df['text'].sample(1).item()
text_preprocessing(a)

In [None]:
# Run the spaCy-based preprocessing on the sample document `a` and display the result.
spacy_preprocessing(a)

In [98]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import os
from os.path import isfile, join
import string
import re
from string import punctuation
import sys

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
import spacy
nlp = spacy.load('en_core_web_sm')
from nltk.tokenize import RegexpTokenizer

from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.feature_extraction.text import CountVectorizer ,TfidfVectorizer

from sklearn.linear_model import LogisticRegression

from scikeras.wrappers import KerasClassifier
from keras.layers import Dense, Input, Dropout
from keras import Sequential


def vectorized_spacy_preprocessing(data):
    '''Lemmatise and filter every document in ``data`` with spaCy.

    Parameters
    ----------
    data : iterable of str
        Raw documents (e.g. a list or a pandas Series of strings).

    Returns
    -------
    list of str
        One string per input document, in input order: the space-joined lemmas
        of the tokens that are alphabetic, not punctuation, not whitespace and
        not NLTK English stop words.
    '''
    # Build the stop-word set once for the whole batch instead of once per token.
    english_stopwords = set(stopwords.words('english'))

    def spacy_preprocessing(doc):
        # `doc` is an already-parsed spaCy document.
        lemmas = (token.lemma_ for token in doc
                  if not token.is_punct and not token.is_space and token.is_alpha)
        return ' '.join(lemma for lemma in lemmas if lemma not in english_stopwords)

    # nlp.pipe processes the texts as a stream and yields the parsed documents
    # in input order.
    return [spacy_preprocessing(doc) for doc in nlp.pipe(data)]



#______________________________________________________ DATA INGESTION___________________________________________________________________
# Training subset of 20 newsgroups, with headers, footers and quotes removed.
dataset = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)

# Newsgroup name of every document (dataset.target holds indices into target_names).
label = [dataset.target_names[target_id] for target_id in dataset.target]

# One row per document: the raw text and its newsgroup name.
df = pd.DataFrame({'text': dataset.data, 'label': label})


# key_categories = ['politics','sport','religion','computer','sales','automobile','science','medicine']
# cat_dict = {
# **dict.fromkeys(['talk.politics.misc','talk.politics.guns','talk.politics.mideast'],'politics'),
# **dict.fromkeys( ['rec.sport.hockey','rec.sport.baseball'],'sport'),
# **dict.fromkeys( ['soc.religion.christian','talk.religion.misc'],'religion'),
# **dict.fromkeys(['comp.windows.x','comp.sys.ibm.pc.hardware','comp.os.ms-windows.misc','comp.graphics','comp.sys.mac.hardware'],'computer'),
# **dict.fromkeys( ['misc.forsale'],'sales'),
# **dict.fromkeys( ['rec.autos','rec.motorcycles'],'automobile'),
# **dict.fromkeys( ['sci.crypt','sci.electronics','sci.space'],'science'),
# **dict.fromkeys( ['sci.med'],'medicine') 
# }
# df['label']=df['label'].map(cat_dict)

# Encode the newsgroup names in column 'label' as integer class ids.
label_encoder = LabelEncoder()
df['target'] = label_encoder.fit_transform(df['label'])

# Independent variable (raw text) and dependent variable (class id).
X, y = df['text'], df['target']

#_____________________________________________________________________________________________________________________________________________

### Model Creation

In [95]:
def create_model(optimizer="adam", dropout=0.1, init='uniform', nbr_features=2500, dense_nparams=256, n_classes=None):
    '''Build and compile a feed-forward classifier with one hidden layer.

    Architecture: Dense(dense_nparams, relu) -> Dropout(dropout) ->
    Dense(n_classes, softmax), compiled with categorical cross-entropy and an
    AUC metric.

    Parameters
    ----------
    optimizer : str or optimizer object
        Passed to ``model.compile``.
    dropout : float
        Rate of the Dropout layer.
    init : str
        Kernel initializer of the hidden layer.
    nbr_features : int
        Width of the input layer.
        NOTE(review): must equal the number of feature columns fed to the
        model -- confirm against the output size of the upstream vectorizer.
    dense_nparams : int
        Number of units in the hidden layer.
    n_classes : int, optional
        Number of output units. When omitted, the number of distinct values in
        the module-level ``label`` list is used, looked up at call time.

    Returns
    -------
    The compiled Sequential model.
    '''
    # Imported locally so the function works regardless of which import cell
    # has been executed.
    from keras import metrics as keras_metrics

    if n_classes is None:
        n_classes = len(set(label))

    model = Sequential()
    model.add(Dense(dense_nparams, activation='relu', input_shape=(nbr_features,), kernel_initializer=init))
    model.add(Dropout(dropout))
    model.add(Dense(n_classes, activation='softmax'))
    # compile() takes metric instances (or names); passing the AUC class itself
    # would be treated as a plain metric function and fail when called.
    # NOTE(review): categorical_crossentropy expects one-hot targets -- confirm
    # the caller supplies them.
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=[keras_metrics.AUC()])
    return model

# scikit-learn compatible wrappers around create_model.
# `model=` is the current name of the deprecated `build_fn=` argument. Passing
# `loss` tells the wrapper that the model is trained with categorical
# cross-entropy, so it one-hot encodes the class labels before handing them to Keras.
kears_estimator = KerasClassifier(model=create_model, loss='categorical_crossentropy', verbose=1)

clf = KerasClassifier(model=create_model, loss='categorical_crossentropy', verbose=1)

In [96]:
# Hyper-parameter grid for the 'tfidf' pipeline step.
parameters = [{
    'tfidf__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3)],
    'tfidf__use_idf': [True, False],
}]

# Wrap the batch spaCy preprocessing function so it can act as a pipeline step.
pipe_spacy_preprocessing = FunctionTransformer(vectorized_spacy_preprocessing)

# Steps in execution order: text preprocessing -> TF-IDF features -> classifier.
pipeline_steps = [
    ('text_preprocessing', pipe_spacy_preprocessing),
    ('tfidf', TfidfVectorizer()),
    ('classifier', clf),
]
pipeline = Pipeline(pipeline_steps)

# Grid search scored by micro-averaged F1 over 2 stratified folds.
micro_f1_scorer = make_scorer(f1_score, average='micro')
two_fold_cv = StratifiedKFold(n_splits=2)
grid_search = GridSearchCV(pipeline, parameters, scoring=micro_f1_scorer, cv=two_fold_cv)

In [None]:
import time
t0 = time.time()

# Smoke-test the search on the first 10 documents only.
# `.tolist()` replaces the incomplete `.to` attribute access, which would raise
# AttributeError before the fit even starts.
grid_results  = grid_search.fit(X[:10].tolist(), y[:10])

# Elapsed wall-clock time of the fit, in seconds (displayed as the cell output).
time.time() - t0

In [None]:
# Cross-validation results as a table, ordered by rank_test_score (best first).
cv_table = pd.DataFrame(grid_results.cv_results_)
results = cv_table.sort_values('rank_test_score')
results.head()

In [86]:
# Display the pipeline selected by the grid search.
grid_results.best_estimator_

Pipeline(steps=[('text_preprocessing',
                 FunctionTransformer(func=<function vectorized_spacy_preprocessing at 0x000002AF515DEDC0>)),
                ('tfidf', TfidfVectorizer()),
                ('classifier', LogisticRegression())])

In [81]:
from keras import metrics