# Microsoft Capstone Project - Notebook 3
- Clean text (Tokenize, lower, remove stopwords, lemmentize, stem etc)
- CountVectorizer on 10000 words for features
- Use Average word length as feature
- length of document
- Scale Features
- Sentiment and subjectivity of document text
- Do not downsample training data 
- Train models for each topic:
    - Linear SVC
    - Logistic Regression 

"Your goal is to predict the topic(s) of publications from the World Bank, where there are 24 possible topics. <br>
You will be given the first six pages of text from each dociument. <br>Each document has at least one topic and can have multiple topics."

In [1]:
import re
import os
import nltk
import string
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gensim.corpora as corpora
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from nltk.tokenize import word_tokenize
from gensim.models.ldamodel import LdaModel
from sklearn.preprocessing import Normalizer
from gensim.corpora.dictionary import Dictionary
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer,PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

%matplotlib inline

pd.set_option('max_colwidth', 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 100)

# **Preprocessing and Feature Engineering**

In [2]:
# Training data
TRAINING_DATA = '../../wb-publications-data/train_values.csv'
TRAINING_LABEL = '../../wb-publications-data/train_labels.csv'

In [3]:
df_values = pd.read_csv(TRAINING_DATA)

In [4]:
df_labels = pd.read_csv(TRAINING_LABEL)

### Concat dataframes

In [5]:
df_all = pd.concat([df_values, df_labels], axis=1)

In [6]:
# Downsample data to text out preprocessing
df = df_all[:].copy()

### Preprocess Text Data

In [7]:
def lemmatize_text(text):
     return [lemmatizer.lemmatize(word) for word in text]
        
def preprocess_text(df):
    exclude = set(string.punctuation)
    stop_words = set(stopwords.words('english'))
    stop_words.update(("develop", "develops", "developing", "busi", "business", "businesses", "program", "programs"))
    
    #Preprocess text column
    df['processed_text'] = df.doc_text.str.replace(r'\d+', '')                                  #Remove number
    df['processed_text'] = df.processed_text.str.lower()                                        #lower
    df['processed_text'] = df.processed_text.str.replace('http\S+|www.\S+', '', case=False)     #Remove website links
    df["processed_text"] = df.processed_text.str.replace('[{}]'.format(string.punctuation), '') #Remove all punctations

    df['processed_text'] = df.apply(lambda row: word_tokenize(row["processed_text"]), axis=1)
    df['processed_text'] = df.processed_text.apply(lambda x:[word.rstrip() for word in x])                       #Remove white spaces
    df['processed_text'] = df.processed_text.apply(lambda x:[word.replace('\n', ' ') for word in x])             #Remove literal blackslashes  
    df['processed_text'] = df.processed_text.apply(lambda x:[word.replace(r"\s\s+",' ') for word in x])          #Remove White spaces 
    df['processed_text'] = df.processed_text.apply(lambda x:[word.strip() for word in x if len(word) > 4])       #Remove spaces out words and acronyms

    #Remove stop words 
    df["topic_processed_text"] = df.processed_text.apply(lambda x: [word for word in x if word not in stop_words])
    df['topic_processed_text'] = df.topic_processed_text.apply(lambda x: [word for word in x if word not in exclude])
    
    #Lemmantize Words
    df['topic_processed_text'] = df.topic_processed_text.apply(lemmatize_text)
    
    #Stemming
    df['topic_processed_text'] = df.topic_processed_text.apply(lambda x: [stemmer.stem(y) for y in x]) # Stem every word.
    
    return df

In [8]:
df = preprocess_text(df)

In [9]:
df[['doc_text', 'processed_text', 'topic_processed_text']].head()

Unnamed: 0,doc_text,processed_text,topic_processed_text
0,"84327 v2\nThe findings, interpretations, and conclusions expressed in this report do not\nnecess...","[findings, interpretations, conclusions, expressed, report, necessarily, reflect, views, positio...","[find, interpret, conclus, express, report, necessarili, reflect, view, posit, execut, director,..."
1,...,"[decpg, daily, economics, financial, market, commentary, allen, dennis, sanket, mohapatra, riord...","[decpg, daili, econom, financi, market, commentari, allen, denni, sanket, mohapatra, riordan, yo..."
2,78156\n\n\n\n\nRisk Taking: A Corporate\nGovernance Perspective\nACKN...,"[taking, corporate, governance, perspective, acknowledgements, genesis, teaching, materials, fin...","[take, corpor, govern, perspect, acknowledg, genesi, teach, materi, final, product, effort, cont..."
3,WPS5836\n\n\nPolicy Research Working Paper ...,"[policy, research, working, paper, above, below, lending, hungary, banai, kirly, mrton, world, e...","[polici, research, work, paper, lend, hungari, banai, kirli, mrton, world, europ, central, regio..."
4,1 WPS39...,"[relative, importance, global, agricultural, subsidies, market, access, anderson, martin, ernest...","[rel, import, global, agricultur, subsidi, market, access, anderson, martin, ernesto, valenzuela..."


In [10]:
#Create names for each model based on label names
topics = []
for col in df_labels.columns:
    if col != 'row_id':
        topics.append(str(col))
topics = set(topics)

In [11]:
df.head()

Unnamed: 0,row_id,doc_text,row_id.1,information_and_communication_technologies,governance,urban_development,law_and_development,public_sector_development,agriculture,communities_and_human_settlements,health_and_nutrition_and_population,culture_and_development,social_protections_and_labor,international_economics_and_trade,conflict_and_development,science_and_technology_development,rural_development,poverty_reduction,social_development,education,transport,gender,infrastructure_economics_and_finance,energy_and_environment,finance_and_development,macroeconomics_and_growth,water,processed_text,topic_processed_text
0,0,"84327 v2\nThe findings, interpretations, and conclusions expressed in this report do not\nnecess...",0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,"[findings, interpretations, conclusions, expressed, report, necessarily, reflect, views, positio...","[find, interpret, conclus, express, report, necessarili, reflect, view, posit, execut, director,..."
1,1,...,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,"[decpg, daily, economics, financial, market, commentary, allen, dennis, sanket, mohapatra, riord...","[decpg, daili, econom, financi, market, commentari, allen, denni, sanket, mohapatra, riordan, yo..."
2,2,78156\n\n\n\n\nRisk Taking: A Corporate\nGovernance Perspective\nACKN...,2,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[taking, corporate, governance, perspective, acknowledgements, genesis, teaching, materials, fin...","[take, corpor, govern, perspect, acknowledg, genesi, teach, materi, final, product, effort, cont..."
3,3,WPS5836\n\n\nPolicy Research Working Paper ...,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,"[policy, research, working, paper, above, below, lending, hungary, banai, kirly, mrton, world, e...","[polici, research, work, paper, lend, hungari, banai, kirli, mrton, world, europ, central, regio..."
4,4,1 WPS39...,4,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,"[relative, importance, global, agricultural, subsidies, market, access, anderson, martin, ernest...","[rel, import, global, agricultur, subsidi, market, access, anderson, martin, ernesto, valenzuela..."


#### **Average word length as feature**

In [12]:
def get_average_word_len(text):
    #tokenize
    try:
        #Calculate the average len of the words
        sum_len = sum([len(x) for x in text])        
        return sum_len / len(text)
    except ZeroDivisionError:
        return 0

In [13]:
df['average_word_len'] = df.processed_text.apply(get_average_word_len)

#### **Word Count**

In [14]:
def get_word_count(text):
    return len(text)

In [15]:
df['word_count'] = df.processed_text.apply(get_word_count)

#### **Document length**

In [16]:
def get_doc_len(text):
    return len(text)

In [17]:
df['doc_len'] = df.doc_text.apply(get_doc_len)

#### **Capatilization Percentage**

In [18]:
def check_capatilization_perc(text): 
    text = text.replace(" ", "")
    try:
        is_upper = 0.0
        is_lower = 0.0
        for x in text:
            if x.isupper():
                is_upper+=1
            elif x.islower():
                is_lower+=1
        return ((is_upper / len(text)) * 100)
    except ZeroDivisionError:
        return 0

In [19]:
df['percentage_text_uppercase'] = df.doc_text.apply(check_capatilization_perc)

In [20]:
df.head(1)

Unnamed: 0,row_id,doc_text,row_id.1,information_and_communication_technologies,governance,urban_development,law_and_development,public_sector_development,agriculture,communities_and_human_settlements,health_and_nutrition_and_population,culture_and_development,social_protections_and_labor,international_economics_and_trade,conflict_and_development,science_and_technology_development,rural_development,poverty_reduction,social_development,education,transport,gender,infrastructure_economics_and_finance,energy_and_environment,finance_and_development,macroeconomics_and_growth,water,processed_text,topic_processed_text,average_word_len,word_count,doc_len,percentage_text_uppercase
0,0,"84327 v2\nThe findings, interpretations, and conclusions expressed in this report do not\nnecess...",0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,"[findings, interpretations, conclusions, expressed, report, necessarily, reflect, views, positio...","[find, interpret, conclus, express, report, necessarili, reflect, view, posit, execut, director,...",7.622807,342,19820,5.309182


In [21]:
from textblob import TextBlob
def get_polarity_sent(text):
    try:
        text = TextBlob(text)
        score = (text.sentiment.polarity + 1) / 2 #Bring score in a range between 0 and 1 
        return score
    except:
        return 0.5 #Return a neutral response

def get_subjectivity_sent(text):
    try:
        text = TextBlob(text)
        return text.sentiment.subjectivity
    except:
        return 0.5


In [22]:
df['process_vec'] = df.processed_text.apply(lambda x: ' '.join(x)) #Possible change this to lemmentize text

In [23]:
df['sentiment_polarity'] = df.doc_text.apply(get_polarity_sent)
df['sentiment_subjectivity'] = df.doc_text.apply(get_subjectivity_sent)

#### **Count Vectorization**

In [24]:
vec = CountVectorizer(stop_words='english', strip_accents='ascii', max_features=10000)
vec.fit(df.process_vec.values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=10000, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words='english',
                strip_accents='ascii', token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

#### **Scale Numeric Features**

In [25]:
NUMERIC_COLS = ['doc_len', 'average_word_len', 'percentage_text_uppercase', 'word_count', 'sentiment_polarity', 'sentiment_subjectivity']
norm = Normalizer()
norm.fit(df[NUMERIC_COLS].fillna(0.0).replace([np.inf, -np.inf], 0.0).values)

Normalizer(copy=True, norm='l2')

In [26]:
#Save vectors
pickle.dump(vec, open(os.path.join('./countvec/', 'model_{}.pkl'.format('doc_text')), 'wb'))
pickle.dump(vec, open(os.path.join('./countvec/', 'model_{}.pkl'.format('norm')), 'wb'))

In [27]:
idx_all = np.arange(df.shape[0])
indices = {'train': idx_all}
datasets = {} 

In [28]:
#merge features into one set
for name, idx in indices.items():
    dftmp = df.iloc[idx].fillna(0.0).replace([np.inf, -np.inf], 0.0)
    
    data = dftmp[list(topics)].values
    
    a_f = norm.transform(dftmp[NUMERIC_COLS].values)
    c_vec = vec.transform(dftmp['process_vec'].fillna('').values).toarray()
    datasets[name] = np.hstack([data, a_f, c_vec])
    datasets[name][np.isinf(datasets[name])] = 0.0
    
dataset_cols = list(topics) + NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]
train_cols = NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]

print("Total features on dataset: ", len(dataset_cols))
print("Total training features: ", len(train_cols))

#ensure all features headings are of type string
dataset_cols = [str(x) for x in dataset_cols] 

Total features on dataset:  10030
Total training features:  10006


In [29]:
# Save Dataset
for name, data in datasets.items():
        print('...', name)
        pd.DataFrame(data, columns=dataset_cols).to_parquet('./{}.parquet'.format(name), compression='gzip')

... train


# **Modelling**

In [30]:
df2 = pd.read_parquet('./train.parquet')

In [31]:
df2.head()

Unnamed: 0,water,macroeconomics_and_growth,agriculture,law_and_development,infrastructure_economics_and_finance,health_and_nutrition_and_population,international_economics_and_trade,communities_and_human_settlements,public_sector_development,culture_and_development,energy_and_environment,science_and_technology_development,gender,conflict_and_development,governance,education,information_and_communication_technologies,social_protections_and_labor,social_development,urban_development,transport,rural_development,finance_and_development,poverty_reduction,doc_len,...,token_youre,token_youth,token_youths,token_yugoslav,token_yugoslavia,token_yunnan,token_yusuf,token_zaidi,token_zaman,token_zambia,token_zambian,token_zambias,token_zanzibar,token_zation,token_zealand,token_zeroduty,token_zhang,token_zimbabwe,token_zimbabwes,token_zingales,token_zoellick,token_zones,token_zoning,token_zscore,token_zusammenarbeit
0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.999851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.9973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.998429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.998509,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.997472,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### SVC 


In [32]:
def train_svc_model(df):
    
    print('Train Set Size: ' , (len(df) * .8))
    print('Test Set Size: ' , (len(df) * .2))
                  
    
    for label in list(topics):
        print('------- {} -------'.format('Training SVC for ' + label))
        X = df[train_cols]
        y = df[label]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019, stratify=y)

        svc = OneVsRestClassifier(LinearSVC(), n_jobs=-1).fit(X_train, y_train)
        # Save model to local directory
        with open('./svc_classifiers/' + label + '.pkl', 'wb') as fl:
            pickle.dump(svc, fl)
             
        y_pred = svc.predict(X_test)
        print('...F1 Score:', f1_score(y_test, y_pred, average='micro'))
        
train_svc_model(df2)  

Train Set Size:  14949.6
Test Set Size:  3737.4
------- Training SVC for water -------
...F1 Score: 0.920545746388443
------- Training SVC for macroeconomics_and_growth -------
...F1 Score: 0.6958266452648475
------- Training SVC for agriculture -------
...F1 Score: 0.9253611556982343
------- Training SVC for law_and_development -------
...F1 Score: 0.8394863563402889
------- Training SVC for infrastructure_economics_and_finance -------
...F1 Score: 0.9839486356340289
------- Training SVC for health_and_nutrition_and_population -------
...F1 Score: 0.8352059925093633
------- Training SVC for international_economics_and_trade -------
...F1 Score: 0.8849652220438737
------- Training SVC for communities_and_human_settlements -------
...F1 Score: 0.920813269127876
------- Training SVC for public_sector_development -------
...F1 Score: 0.8191546281433922
------- Training SVC for culture_and_development -------
...F1 Score: 0.9745853397538791
------- Training SVC for energy_and_environment -

### Logistic Regressions 

In [33]:
def train_lr_model(df):
    
    print('Train Set Size: ' , (len(df) * .8))
    print('Test Set Size: ' , (len(df) * .2))
                  
    for label in list(topics):
        print('------- {} -------'.format('Training Linear Regression for ' + label))
        X = df[train_cols]
        y = df[label]
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2019, stratify=y)

        lr = OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1).fit(X_train, y_train)
        # Save model to local directory
        with open('./lr_classifier/' + label + '.pkl', 'wb') as fl:
            pickle.dump(lr, fl)
             
        y_pred = lr.predict(X_test)
        print('...F1 Score:', f1_score(y_test, y_pred, average='micro'))
        
train_lr_model(df2)  

Train Set Size:  14949.6
Test Set Size:  3737.4
------- Training Linear Regression for water -------
...F1 Score: 0.9590690208667737
------- Training Linear Regression for macroeconomics_and_growth -------
...F1 Score: 0.7723381487426432
------- Training Linear Regression for agriculture -------
...F1 Score: 0.9531835205992509
------- Training Linear Regression for law_and_development -------
...F1 Score: 0.898876404494382
------- Training Linear Regression for infrastructure_economics_and_finance -------




...F1 Score: 0.9884965222043873
------- Training Linear Regression for health_and_nutrition_and_population -------
...F1 Score: 0.8943285179240236
------- Training Linear Regression for international_economics_and_trade -------
...F1 Score: 0.9200107009095773
------- Training Linear Regression for communities_and_human_settlements -------
...F1 Score: 0.9430176565008026
------- Training Linear Regression for public_sector_development -------
...F1 Score: 0.8908507223113965
------- Training Linear Regression for culture_and_development -------




...F1 Score: 0.9839486356340289
------- Training Linear Regression for energy_and_environment -------
...F1 Score: 0.9004815409309791
------- Training Linear Regression for science_and_technology_development -------
...F1 Score: 0.9614767255216693
------- Training Linear Regression for gender -------
...F1 Score: 0.9652220438737292
------- Training Linear Regression for conflict_and_development -------
...F1 Score: 0.9665596575708936
------- Training Linear Regression for governance -------
...F1 Score: 0.9237560192616372
------- Training Linear Regression for education -------
...F1 Score: 0.9309791332263242
------- Training Linear Regression for information_and_communication_technologies -------
...F1 Score: 0.9612092027822365
------- Training Linear Regression for social_protections_and_labor -------
...F1 Score: 0.904226859283039
------- Training Linear Regression for social_development -------
...F1 Score: 0.9288389513108615
------- Training Linear Regression for urban_development

## **Evaluation**

In [34]:
TEST_DATA = '../../wb-publications-data/test_values.csv'

In [35]:
df_test = pd.read_csv(TEST_DATA)

In [36]:
df_test.head()

Unnamed: 0,row_id,doc_text
0,0,...
1,1,EARLY LEARNING PARTNERSHIP\n\n\n\n\n E L P\n ...
2,2,WPS5739\n\n\nPolicy Research Working Paper...
3,3,WPS7840\n\n\nPolicy Research Working Paper ...
4,4,...


### Preprocess Validation Set


In [37]:
df_test = preprocess_text(df_test)
print('Done')

Done


In [38]:
df_test['average_word_len'] = df_test.processed_text.apply(get_average_word_len)
print('Done')

Done


In [39]:
df_test['word_count'] = df_test.processed_text.apply(get_word_count)
print('Done')

Done


In [40]:
df_test['doc_len'] = df_test.doc_text.apply(get_doc_len)
print('Done')

Done


In [41]:
df_test['percentage_text_uppercase'] = df_test.doc_text.apply(check_capatilization_perc)
print('Done')

Done


In [42]:
df_test['process_vec'] = df_test.processed_text.apply(lambda x: ' '.join(x))

In [43]:
df_test['sentiment_polarity'] = df_test.doc_text.apply(get_polarity_sent)
df_test['sentiment_subjectivity'] = df_test.doc_text.apply(get_subjectivity_sent)


In [44]:
idxv_all = np.arange(df_test.shape[0])
indices = {'validation': idxv_all}
datasets = {}

for name, idxv in indices.items():
    #merge features into one set
    dfvtmp = df_test.iloc[idxv].fillna(0.0).replace([np.inf, -np.inf], 0.0)            
    a_f_v = norm.transform(dfvtmp[NUMERIC_COLS].values)
    c_vec_v = vec.transform(dfvtmp['process_vec'].fillna('').values).toarray()
    datasets[name] = np.hstack([a_f_v, c_vec_v])
    datasets[name][np.isinf(datasets[name])] = 0.0
    
dataset_v_cols = NUMERIC_COLS + ['token_' + t for t in list(vec.get_feature_names())]
dataset_v_cols = [str(x) for x in dataset_v_cols] 

print(len(dataset_v_cols))
pd.DataFrame(datasets['validation'], columns=dataset_v_cols).to_parquet('./{}.parquet'.format('validation'), compression='gzip')

10006


In [45]:
df_val = pd.read_parquet('./validation.parquet')

In [46]:
df_val.head()

Unnamed: 0,doc_len,average_word_len,percentage_text_uppercase,word_count,sentiment_polarity,sentiment_subjectivity,token_aaditya,token_ababa,token_abandoned,token_abatement,token_abbreviations,token_abdul,token_abidjan,token_abilities,token_ability,token_abolition,token_abortion,token_abovementioned,token_abreast,token_abridged,token_abroad,token_abrupt,token_absence,token_absent,token_absenteeism,...,token_youre,token_youth,token_youths,token_yugoslav,token_yugoslavia,token_yunnan,token_yusuf,token_zaidi,token_zaman,token_zambia,token_zambian,token_zambias,token_zanzibar,token_zation,token_zealand,token_zeroduty,token_zhang,token_zimbabwe,token_zimbabwes,token_zingales,token_zoellick,token_zones,token_zoning,token_zscore,token_zusammenarbeit
0,0.998931,0.000373,0.000218,0.046223,2.7e-05,2.2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.998538,0.000398,0.000433,0.054042,2.8e-05,1.6e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.997646,0.000426,0.000118,0.068571,2.7e-05,2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.99749,0.000413,0.000111,0.070811,2.8e-05,2e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.99735,0.001995,0.000965,0.072718,0.000149,8.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [47]:
out_cols = ['row_id',
            'information_and_communication_technologies',
            'governance',
            'urban_development',
            'law_and_development',
            'public_sector_development',
            'agriculture',
            'communities_and_human_settlements',
            'health_and_nutrition_and_population',
            'culture_and_development',
            'social_protections_and_labor',
            'international_economics_and_trade',
            'conflict_and_development',
            'science_and_technology_development',
            'rural_development',
            'poverty_reduction',
            'social_development',
            'education',
            'transport',
            'gender',
            'infrastructure_economics_and_finance',
            'energy_and_environment',
            'finance_and_development',
            'macroeconomics_and_growth', 
            'water'
           ]

In [48]:
submission_df = pd.read_csv('../../wb-publications-data/submission_format.csv')

### SVC Inference

In [49]:
submission__svc_df = pd.read_csv('../../wb-publications-data/submission_format.csv')

In [50]:
for topic in submission__svc_df.columns:
    if topic in topics:       
        with open('./svc_classifiers/' + topic + '.pkl', 'rb') as pickle_file:
            svc = pickle.load(pickle_file)
        submission__svc_df[topic] = svc.predict(df_val[:])
        
submission__svc_df = submission__svc_df.astype(int)

submission__svc_df.to_csv('./submission_svc.csv', index=False)

In [51]:
tmpsvc = pd.read_csv('./submission_svc.csv')

In [52]:
tmpsvc.head()

Unnamed: 0,row_id,information_and_communication_technologies,governance,urban_development,law_and_development,public_sector_development,agriculture,communities_and_human_settlements,health_and_nutrition_and_population,culture_and_development,social_protections_and_labor,international_economics_and_trade,conflict_and_development,science_and_technology_development,rural_development,poverty_reduction,social_development,education,transport,gender,infrastructure_economics_and_finance,energy_and_environment,finance_and_development,macroeconomics_and_growth,water
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0
2,2,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,3,0,1,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0


### Multinominal Niave Bayes Inference

In [53]:
submission__lr_df = pd.read_csv('../../wb-publications-data/submission_format.csv')
for topic in submission__lr_df.columns:
    if topic in topics:       
        with open('./lr_classifier/' + topic + '.pkl', 'rb') as pickle_file:
            lr = pickle.load(pickle_file)
        submission__lr_df[topic] = lr.predict(df_val[:])
        
submission__lr_df = submission__lr_df.astype(int)
submission__lr_df.to_csv('./submission_lr.csv', index=False)

In [54]:
tmpsvc = pd.read_csv('./submission_lr.csv')
tmpsvc.head()

Unnamed: 0,row_id,information_and_communication_technologies,governance,urban_development,law_and_development,public_sector_development,agriculture,communities_and_human_settlements,health_and_nutrition_and_population,culture_and_development,social_protections_and_labor,international_economics_and_trade,conflict_and_development,science_and_technology_development,rural_development,poverty_reduction,social_development,education,transport,gender,infrastructure_economics_and_finance,energy_and_environment,finance_and_development,macroeconomics_and_growth,water
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
2,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,3,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0
