# Microsoft Capstone Project
- Clean text (Tokenize, lower, remove stopwords, lemmentize, stem etc)
- Train 24 Topic Models for each topic label
- Do inference on larger sample to get probabilities as features
- CountVectorizer on words for features
- Use Average word length as feature
- length of document
- Scale Features
- Train Neural Network

"Your goal is to predict the topic(s) of publications from the World Bank, where there are 24 possible topics. <br>
You will be given the first six pages of text from each dociument. <br>Each document has at least one topic and can have multiple topics."

In [1]:
import re
import os
import nltk
import string
import gensim
import pickle
# import pyLDAvis
import numpy as np
import pandas as pd
# import pyLDAvis.gensim 
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from baggingPU import BaggingClassifierPU
from gensim.models.ldamodel import LdaModel
from sklearn.preprocessing import Normalizer
from gensim.corpora.dictionary import Dictionary
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

%matplotlib inline

pd.set_option('max_colwidth', 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)



# **Preprocessing and Feature Engineering**

In [373]:
# Training data
TRAINING_DATA = '../wb-publications-data/train_values.csv'
TRAINING_LABEL = '../wb-publications-data/train_labels.csv'

In [374]:
df_values = pd.read_csv(TRAINING_DATA)

In [375]:
df_labels = pd.read_csv(TRAINING_LABEL)

### Concat dataframes

In [376]:
df_all = pd.concat([df_values, df_labels], axis=1)

In [377]:
# Downsample data to text out preprocessing
df = df_all[:10000].copy()

### Preprocess Text Data

In [378]:
def lemmatize_text(text):
     return [lemmatizer.lemmatize(word) for word in text]
        
def preprocess_text(df):
    exclude = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stop_words.update(("develop", "develops", "developing", "busi", "business", "businesses", "program", "programs"))
    
    #Preprocess text column
    df['processed_text'] = df.doc_text.str.replace(r'\d+', '')                                  #Remove number
    df['processed_text'] = df.processed_text.str.lower()                                        #lower
    df['processed_text'] = df.processed_text.str.replace('http\S+|www.\S+', '', case=False)     #Remove website links
    df["processed_text"] = df.processed_text.str.replace('[{}]'.format(string.punctuation), '') #Remove all punctations

    df['processed_text'] = df.apply(lambda row: word_tokenize(row["processed_text"]), axis=1)
    df['processed_text'] = df.processed_text.apply(lambda x:[word.rstrip() for word in x])                       #Remove white spaces
    df['processed_text'] = df.processed_text.apply(lambda x:[word.replace('\n', ' ') for word in x])             #Remove literal blackslashes  
    df['processed_text'] = df.processed_text.apply(lambda x:[word.replace(r"\s\s+",' ') for word in x])          #Remove White spaces 
    df['processed_text'] = df.processed_text.apply(lambda x:[word.strip() for word in x if len(word) > 4])       #Remove spaces out words and acronyms

    #Remove stop words 
    df["topic_processed_text"] = df.processed_text.apply(lambda x: [word for word in x if word not in stop_words])
    df['topic_processed_text'] = df.topic_processed_text.apply(lambda x: [word for word in x if word not in exclude])
    
    #Lemmantize Words
    df['topic_processed_text'] = df.topic_processed_text.apply(lemmatize_text)
    
    #Stemming
    df['topic_processed_text'] = df.topic_processed_text.apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
    
    return df

In [379]:
df = preprocess_text(df)

In [380]:
df[['doc_text', 'processed_text', 'topic_processed_text']].head()

Unnamed: 0,doc_text,processed_text,topic_processed_text
0,"84327 v2\nThe findings, interpretations, and conclusions expressed in this report do not\nnecess...","[findings, interpretations, conclusions, expressed, report, necessarily, reflect, views, positio...","[find, interpret, conclus, express, report, necessarili, reflect, view, posit, execut, director,..."
1,...,"[decpg, daily, economics, financial, market, commentary, allen, dennis, sanket, mohapatra, riord...","[decpg, daili, econom, financi, market, commentari, allen, denni, sanket, mohapatra, riordan, yo..."
2,78156\n\n\n\n\nRisk Taking: A Corporate\nGovernance Perspective\nACKN...,"[taking, corporate, governance, perspective, acknowledgements, genesis, teaching, materials, fin...","[take, corpor, govern, perspect, acknowledg, genesi, teach, materi, final, product, effort, cont..."
3,WPS5836\n\n\nPolicy Research Working Paper ...,"[policy, research, working, paper, above, below, lending, hungary, banai, kirly, mrton, world, e...","[polici, research, work, paper, lend, hungari, banai, kirli, mrton, world, europ, central, regio..."
4,1 WPS39...,"[relative, importance, global, agricultural, subsidies, market, access, anderson, martin, ernest...","[rel, import, global, agricultur, subsidi, market, access, anderson, martin, ernesto, valenzuela..."


In [381]:
#Create names for each model based on label names
topics = []
for col in df_labels.columns:
    if col != 'row_id':
        topics.append(str(col))
topics = set(topics)

## Train Topic Models

In [11]:
models = {}
corpai = {}
dicts = {}
visualizations = {}

def create_bag_of_words(df):
    dictionary = corpora.Dictionary(df.topic_processed_text.values)
    dictionary.filter_extremes(no_below=5, keep_n=1000)
    corpus = [dictionary.doc2bow(text) for text in df.topic_processed_text.values]
    
    return dictionary, corpus


def create_models(corp, dic):
    return gensim.models.ldamodel.LdaModel(corp, num_topics = 3, id2word = dic, passes=20)


def top_terms_per_topic(lda_model):
    top_words_per_topic = []
    for t in range(lda_model.num_topics):
        top_words_per_topic.extend([(t, ) + x for x in lda_model.show_topic(t, topn = 10)])

    return pd.DataFrame(top_words_per_topic, columns=['Topic', 'Word', 'P'])


'''
Iterate over topic columns, create temp df frame and train LDA models 
Store them in dictionaries. 
'''
def train_topic_models(df):
    print('Begin Training Topic Models ')
    for category in topics:
        print('...{}'.format('Training topic model for '+ category))

        #Drop all useless labels
        tmp_df = df[df[category] == 1]
        tmp_df = tmp_df[[category, 'topic_processed_text']]
                
        print('...{}'.format(tmp_df[category].value_counts()))

        # Create Dictionary
        dictionary, corpus = create_bag_of_words(df)
        lda_model = create_models(corpus, dictionary)
        
        #Add models to dictionary
        models[category] = lda_model
        corpai[category] = corpus
        dicts[category] = dictionary
        

        #Save Models and top terms
        top_topics = top_terms_per_topic(lda_model) 
        top_topics.to_csv("./topic_models/top_terms_{}.csv".format(category))
        lda_model.save('./topic_binaries/lda_{}.model'.format(category))
        print('\n')

    print('Topic Reports Saved...')
    print('Topic Binary Saved...')
    return models, corpai, dicts


In [12]:
models, corpai, dicts = train_topic_models(df)

Begin Training Topic Models 
...Training topic model for infrastructure_economics_and_finance
...1    173
Name: infrastructure_economics_and_finance, dtype: int64


...Training topic model for public_sector_development
...1    1401
Name: public_sector_development, dtype: int64


...Training topic model for information_and_communication_technologies
...1    412
Name: information_and_communication_technologies, dtype: int64


...Training topic model for poverty_reduction
...1    1359
Name: poverty_reduction, dtype: int64


...Training topic model for social_development
...1    842
Name: social_development, dtype: int64


...Training topic model for conflict_and_development
...1    405
Name: conflict_and_development, dtype: int64


...Training topic model for governance
...1    881
Name: governance, dtype: int64


...Training topic model for energy_and_environment
...1    1707
Name: energy_and_environment, dtype: int64


...Training topic model for science_and_technology_development
...1 

In [13]:
# %matplotlib  inline 
# lda_visualization = pyLDAvis.gensim.prepare(models['education'], corpai['education'], dicts['education'], sort_topics=False)
# pyLDAvis.display(lda_visualization)

### Use topic models probs as features for all the data

In [382]:
def topic_vector(topic_probs_list):
    '''Extract topic probs from gensim data structure'''
    data = []
    for t in topic_probs_list:
        data.append([x[1] for x in t])
    return data

In [383]:
models.values()

dict_values([<gensim.models.ldamodel.LdaModel object at 0x7f16b9485710>, <gensim.models.ldamodel.LdaModel object at 0x7f16849381d0>, <gensim.models.ldamodel.LdaModel object at 0x7f1684938518>, <gensim.models.ldamodel.LdaModel object at 0x7f1684938400>, <gensim.models.ldamodel.LdaModel object at 0x7f1684938c88>, <gensim.models.ldamodel.LdaModel object at 0x7f1684939400>, <gensim.models.ldamodel.LdaModel object at 0x7f1684939710>, <gensim.models.ldamodel.LdaModel object at 0x7f16849391d0>, <gensim.models.ldamodel.LdaModel object at 0x7f1684939a20>, <gensim.models.ldamodel.LdaModel object at 0x7f168493a470>, <gensim.models.ldamodel.LdaModel object at 0x7f1684936ba8>, <gensim.models.ldamodel.LdaModel object at 0x7f1684936940>, <gensim.models.ldamodel.LdaModel object at 0x7f1684936e10>, <gensim.models.ldamodel.LdaModel object at 0x7f1684938be0>, <gensim.models.ldamodel.LdaModel object at 0x7f16ec4926a0>, <gensim.models.ldamodel.LdaModel object at 0x7f16b9485240>, <gensim.models.ldamodel.Lda

In [384]:
def fix_topics(res):

    if len(res[0]) != 3:
        zero_found = False
        one_found = False
        two_found = False
              
        for x, i in enumerate(res[0][:][:][:]):
            if int(i[0]) == 0:
                zero_found = True
                pass
            elif i[0] == 1:
                one_found = True
                pass
            elif i[0] == 2:
                two_found = True
                pass
            
        if zero_found == False:
            topic_prob = (0, 0.0)
            res[0].insert(0, topic_prob)

        if one_found == False:
            topic_prob = (1, 0.0)
            res[0].insert(1, topic_prob)

        if two_found == False:
            topic_prob = (2, 0.0)
            res[0].insert(2, topic_prob)
        return res
    return res

In [None]:
# for x in df[['topic_processed_text']][:].values:
#     ids_list = [models['water'].id2word.doc2bow(y) for y in x]
#     res = tp[ids_list]
# #     print(list(res))
#     res = fix_topics(list(res))
    
#     topic_data = np.array(topic_vector(res))
#     i_topic_data_df = pd.DataFrame(topic_data, columns=['topic_{}_{}'.format(name, i) for i in range(topic_data.shape[1])])

# #     print(topic_data.shape)


In [364]:
topics_list

[   topic_infrastructure_economics_and_finance_0  \
 0                                      0.378965   
 
    topic_infrastructure_economics_and_finance_1  \
 0                                      0.051961   
 
    topic_infrastructure_economics_and_finance_2  
 0                                      0.569074  ]

In [None]:
# get probs for all topics for all documents 
topics_list = []
for tp, name in zip(models.values(), models.keys()):
    print('...',name)   
    
    '''
    for every model:
        for each row
            get probs and convert to list
            put list into df
            add df to a list of dfs

    concat all the dfs in the lists into a single df (3)
    add new df to existing dataframe 
    
    '''
    for row in df[['topic_processed_text']].values:
        topic_data = np.array(topic_vector(fix_topics(list(tp[[tp.id2word.doc2bow(tokens) for tokens in row]]))))
    
    i_topic_data_df = pd.DataFrame(topic_data, columns=['topic_{}_{}'.format(name, i) for i in range(topic_data.shape[1])])    
    topics_list.append(i_topic_data_df)
    
topic_data_df = pd.concat(topics_list, axis=1)
df = pd.concat([df, topic_data_df], axis=1)
print('Probs added to dataframe')   


... infrastructure_economics_and_finance
... public_sector_development
... information_and_communication_technologies
... poverty_reduction
... social_development
... conflict_and_development
... governance
... energy_and_environment
... science_and_technology_development
... health_and_nutrition_and_population


In [None]:
df.head()

#### **Average word length as feature**

In [None]:
def get_average_word_len(text):
    #tokenize
    try:
        #Calculate the average len of the words
        sum_len = sum([len(x) for x in text])        
        return sum_len / len(text)
    except ZeroDivisionError:
        return 0

In [None]:
df['average_word_len'] = df.processed_text.apply(get_average_word_len)

#### **Word Count**

In [None]:
def get_word_count(text):
    return len(text)

In [None]:
df['word_count'] = df.processed_text.apply(get_word_count)

#### **Document length**

In [None]:
def get_doc_len(text):
    return len(text)

In [None]:
df['doc_len'] = df.doc_text.apply(get_doc_len)

#### **Capatilization Percentage**

In [None]:
def check_capatilization_perc(text): 
    text = text.replace(" ", "")
    try:
        is_upper = 0.0
        is_lower = 0.0
        for x in text:
            if x.isupper():
                is_upper+=1
            elif x.islower():
                is_lower+=1
        return ((is_upper / len(text)) * 100)
    except ZeroDivisionError:
        return 0

In [None]:
df['percentage_text_uppercase'] = df.doc_text.apply(check_capatilization_perc)

In [None]:
df.head(1)

#### **Count Vectorization**

In [None]:
df['process_vec'] = df.processed_text.apply(lambda x: ' '.join(x))
vec = CountVectorizer(stop_words='english', strip_accents='ascii', max_features=1000)
vec.fit(df.process_vec.values)

#### **Scale Numeric Features**

In [None]:
NUMERIC_COLS = ['doc_len', 'average_word_len', 'percentage_text_uppercase', 'word_count'] + list(topic_data_df.columns)
norm = Normalizer()
norm.fit(df[NUMERIC_COLS].fillna(0.0).replace([np.inf, -np.inf], 0.0).values)

In [None]:
#Save vectors
pickle.dump(vec, open(os.path.join('./countvec/', 'model_{}.pkl'.format('doc_text')), 'wb'))
pickle.dump(vec, open(os.path.join('./countvec/', 'model_{}.pkl'.format('norm')), 'wb'))

In [None]:
idx_all = np.arange(df.shape[0])
indices = {'train': idx_all}
datasets = {} 

In [None]:
#merge features into one set
for name, idx in indices.items():
    dftmp = df.iloc[idx].fillna(0.0).replace([np.inf, -np.inf], 0.0)
    
    data = dftmp[list(topics)].values
    
    a_f = norm.transform(dftmp[NUMERIC_COLS].values)
    c_vec = vec.transform(dftmp['process_vec'].fillna('').values).toarray()
    datasets[name] = np.hstack([data, a_f, c_vec])
    datasets[name][np.isinf(datasets[name])] = 0.0
    
dataset_cols = list(topics) + NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]
train_cols = NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]

print("Total features on dataset: ", len(dataset_cols))
print("Total training features: ", len(train_cols))

#ensure all features headings are of type string
dataset_cols = [str(x) for x in dataset_cols] 

In [None]:
# Save Dataset
for name, data in datasets.items():
        print('...', name)
        pd.DataFrame(data, columns=dataset_cols).to_parquet('./{}.parquet'.format(name), compression='gzip')

In [None]:
# del(df)
# del(df_values)
# del(df_labels)
# del(df_all)

# **Modelling**

In [None]:
df2 = pd.read_parquet('./train.parquet')

In [None]:
df2.head()

In [None]:
classifiers = {}

def train_model(df):
    
    print('Train Set Size: ' , (len(df) * .8))
    print('Test Set Size: ' , (len(df) * .2))
    
    for label in list(topics):
        print('------- {} -------'.format('Training RF for ' + label))
        X = df[train_cols]
        y = df[label]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019, stratify=y)
        
        clf = BaggingClassifierPU(RandomForestClassifier(n_estimators=30, max_depth=4, random_state=2019),
                                  n_estimators=50, n_jobs=-1, max_samples=list(y_train).count(1))
        clf.fit(X_train, y_train)
        # Save model to local directory
        with open('./classifiers/' + label + '.pkl', 'wb') as fl:
            pickle.dump(clf, fl)
         
        classifiers[label] = clf
        
        y_pred = clf.predict(X_test)
        print('...F1 Score:', f1_score(y_test, y_pred, average='micro'))
        
train_model(df2)

In [None]:
# df2[train_cols]

In [None]:
# del(df2)

### **Evaluation**

In [None]:
TEST_DATA = '../wb-publications-data/test_values.csv'

In [None]:
df_test = pd.read_csv(TEST_DATA)

In [None]:
df_test.head()

### Preprocess Validation Set


In [None]:
# get probs for all topics for all documents 
topics_list_v = []
for tp, name in zip(models.values(), models.keys()):
    
    print('...',name)
    topic_data = np.array(topic_vector(tp[[tp.id2word.doc2bow(tokens) for tokens in df.topic_processed_text.values]]))

#     print(topic_data.shape)

    i_topic_data_df = pd.DataFrame(topic_data, 
                                   columns=['topic_{}_{}'.format(name, i) for i in range(topic_data.shape[1])])
    topics_list_v.append(i_topic_data_df)

topic_data_df_v = pd.concat(topics_list_v, axis=1)
df_test = pd.concat([df_test.reset_index(), topic_data_df_v], axis=1).drop(columns=['index'])

In [None]:
df_test = preprocess_text(df_test)
print('Done')

In [None]:
df_test['average_word_len'] = df_test.processed_text.apply(get_average_word_len)
print('Done')

In [None]:
df_test['word_count'] = df_test.processed_text.apply(get_word_count)
print('Done')

In [None]:
df_test['doc_len'] = df_test.doc_text.apply(get_doc_len)
print('Done')

In [None]:
df_test['percentage_text_uppercase'] = df_test.doc_text.apply(check_capatilization_perc)
print('Done')

In [None]:
df_test.info()

In [None]:
df_test['process_vec'] = df_test.processed_text.apply(lambda x: ' '.join(x))

In [None]:
df_test.head(2)

In [None]:
idxv_all = np.arange(df_test.shape[0])
indices = {'validation': idxv_all}
datasets = {}

for name, idxv in indices.items():
    #merge features into one set
    dfvtmp = df_test.iloc[idxv].fillna(0.0).replace([np.inf, -np.inf], 0.0)            
    a_f_v = norm.transform(dfvtmp[NUMERIC_COLS].values)
    c_vec_v = vec.transform(dfvtmp['process_vec'].fillna('').values).toarray()
    datasets[name] = np.hstack([a_f_v, c_vec_v])
    datasets[name][np.isinf(datasets[name])] = 0.0
    
dataset_v_cols = NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]
dataset_v_cols = [str(x) for x in dataset_v_cols] 

print(len(dataset_v_cols))
pd.DataFrame(datasets['validation'], columns=dataset_v_cols).to_parquet('./{}.parquet'.format('validation'), compression='gzip')

In [None]:
df_val = pd.read_parquet('./validation.parquet')

In [None]:
df_test.head()

In [None]:
df_val.head()

In [None]:
out_cols = ['row_id',
            'information_and_communication_technologies',
            'governance',
            'urban_development',
            'law_and_development',
            'public_sector_development',
            'agriculture',
            'communities_and_human_settlements',
            'health_and_nutrition_and_population',
            'culture_and_development',
            'social_protections_and_labor',
            'international_economics_and_trade',
            'conflict_and_development',
            'science_and_technology_development',
            'rural_development',
            'poverty_reduction',
            'social_development',
            'education',
            'transport',
            'gender',
            'infrastructure_economics_and_finance',
            'energy_and_environment',
            'finance_and_development',
            'macroeconomics_and_growth', 
            'water'
           ]

output_df = pd.DataFrame(columns=out_cols)

In [None]:
topics

In [None]:
submission_df = pd.read_csv('../wb-publications-data/submission_format.csv')

In [None]:
df_val.info()

In [None]:
submission_df.info()

In [None]:
for topic in submission_df.columns:
    if topic in topics:       
        with open('./classifiers/' + topic + '.pkl', 'rb') as pickle_file:
            forest_model = pickle.load(pickle_file)
        submission_df[topic] = forest_model.predict(df_val[:])
        

In [None]:
submission_df.head(10)

In [None]:
submission_df = submission_df.astype(int)

In [None]:
submission_df.to_csv('submission.csv', index=False)