In [1]:
from sklearn.model_selection import train_test_split
import chalenge1000
import model_lib
import collections
from IPython.display import display, HTML
import pandas


Chalenge = chalenge1000.Native()
df = Chalenge.X

In [3]:
def vocab(element):
    if type(element) == str:
        return ' '.join(set(element.split(' ')))
    else:
        return ''

def _agriculture__(element):
    if element in agriculture:
        return True
    else:
        return False
    
col_count = lambda x: ' '.join([a for a,b in collections.Counter(x.split()).most_common(10000)])

lemm = lambda x: model_lib.Lemm(x,file = 'stopwords_agri.txt').X

def df_fmt(df):
    df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
    df['type'] = df['corpus'].map(type)
    df['corpus'] = df['corpus'].map(vocab)
    df['corpus_lemm'] = df['corpus'].map(lemm)
    df['collection_counter'] = df['corpus_lemm'].map(col_count)
    agr = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'agr' in e}
    alim = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'alim' in e}
    agriculture = alim.union(agr)
    #df = df[df['type'] ==str]
    return df

def perf_lda(model_parameters,train, test):
    
    def get_topic(e):
        if e >0:
            return 'agriculture'
        if e<0:
            return 'media'
    
    def true_positve(a,b):
        if a == b:
            return 1
        else:
            return 0
        
    model = model_lib.Models(**model_parameters)

    models = model.run_model_LDA(train.values.tolist())

    predict = pandas.DataFrame(model.reverse_lda(test.values.tolist(), models) , index =test.index)
    #display(predict)

    predict = predict.join(df['categorie'])
    predict[0] = predict[0]*100
    predict[1] = predict[1]*100
    predict['topic'] = predict[0]-predict[1]
    #display(predict)

    predict['predict_label'] = predict['topic'].map(get_topic)
    #display(predict)

    predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
    #display(predict)

    agri = predict[predict['categorie'] == 'agriculture']
    print(model_parameters)
    print('precision {}'.format(agri['classification_kpi'].sum()/len(agri)*100))
    model_parameters.update({'precision':agri['classification_kpi'].sum()/len(agri)*100, 'vocabulaire': models['LDA']['feature_names']})
    errors = agri[agri['classification_kpi'] == 0].sort_values('topic')
    collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)
    return model_parameters


In [4]:
df = df_fmt(df)

text_corpus = df[(df['categorie'].isin(['agriculture','media'])) & (df['type'] ==str)]['collection_counter']
text_corpus_label = df[df['categorie'].isin(['agriculture','media']) & (df['type'] ==str)][['categorie','collection_counter']]


In [5]:
test,train = train_test_split(text_corpus_label, test_size = 0.2)

print(len(text_corpus), len(test), len(train))

display(train.groupby('categorie').count())

display(text_corpus_label.groupby('categorie').count())

train = train['collection_counter']
test = test['collection_counter']
model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000}
_ = perf_lda(model_parameters, train, test)

787 629 158


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,141
media,17


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,695
media,92


Extracting tf features for LDA...
done in 0.030s.

Fitting LDA models with tf features, n_features=20000...
done in 0.551s.

Topics in LDA model:
Topic #0: product production create market local service agricultural food farmer make farm produce social young sustainable fruit rural agriculture group develop
Topic #1: product energy renewable waste production service seed oil peanut paste market recycle produce plastic public create end cover transforms farm

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000}
precision 99.27797833935018


In [6]:
list_model_parameters_nfeatures = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : n_features, 'doc_topic_prior':alpha/10} for alpha in range(1,100,1) for n_features in range(500,5000,100)]

list_run = []
for model_parameters in list_model_parameters_nfeatures:
    list_run.append(perf_lda(model_parameters,train, test))
results = pandas.DataFrame(list_run).sort_values('precision', ascending = False)
results.sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.029s.

Fitting LDA models with tf features, n_features=500...
done in 0.384s.

Topics in LDA model:
Topic #0: service product water energy maintenance school technology sustainable rural make access cover oil create renewable digital equipment power environment sustainability
Topic #1: product production create market local agricultural produce food farm farmer make social young fruit service group agriculture develop sustainable rural

{'n_components': 2, 'n_top_words': 20, 'n_features': 500, 'doc_topic_prior': 0.1}
precision 2.527075812274368
Extracting tf features for LDA...
done in 0.017s.

Fitting LDA models with tf features, n_features=600...
done in 0.353s.

Topics in LDA model:
Topic #0: create community rural local make market young social child information affordable organization knowledge access establish service maintenance technology sustainable strong
Topic #1: product production market agricultural create service local food pro

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
144,2,20,1400,0.4,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
54,2,20,1400,0.2,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
99,2,20,1400,0.3,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
9,2,20,1400,0.1,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
189,2,20,1400,0.5,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
...,...,...,...,...,...,...
46,2,20,600,0.2,0.180505,"[access, accessible, accompany, account, acqui..."
50,2,20,1000,0.2,0.000000,"[abandon, ability, access, accessible, accompa..."
95,2,20,1000,0.3,0.000000,"[abandon, ability, access, accessible, accompa..."
5,2,20,1000,0.1,0.000000,"[abandon, ability, access, accessible, accompa..."


In [12]:
results.sort_values('precision', ascending = False)

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
144,2,20,1400,0.4,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
54,2,20,1400,0.2,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
99,2,20,1400,0.3,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
9,2,20,1400,0.1,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
189,2,20,1400,0.5,99.638989,"[abandon, ability, abroad, absorb, access, acc..."
...,...,...,...,...,...,...
46,2,20,600,0.2,0.180505,"[access, accessible, accompany, account, acqui..."
50,2,20,1000,0.2,0.000000,"[abandon, ability, access, accessible, accompa..."
95,2,20,1000,0.3,0.000000,"[abandon, ability, access, accessible, accompa..."
5,2,20,1000,0.1,0.000000,"[abandon, ability, access, accessible, accompa..."
