In [1]:
from sklearn.model_selection import train_test_split
import chalenge1000
import model_lib
import collections
from IPython.display import display, HTML
import pandas


Chalenge = chalenge1000.Native()
df = Chalenge.X

In [32]:
def vocab(element):
    if type(element) == str:
        return ' '.join(set(element.split(' ')))
    else:
        return ''

def _agriculture__(element):
    if element in agriculture:
        return True
    else:
        return False
    
col_count = lambda x: ' '.join([a for a,b in collections.Counter(x.split()).most_common(10000)])

lemm = lambda x: model_lib.Lemm(x,file = 'stopwords_agri.txt').X

def df_fmt(df):
    df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
    df['type'] = df['corpus'].map(type)
    df['corpus'] = df['corpus'].map(vocab)
    df['corpus_lemm'] = df['corpus'].map(lemm)
    df['collection_counter'] = df['corpus_lemm'].map(col_count)
    agr = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'agr' in e}
    alim = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'alim' in e}
    agriculture = alim.union(agr)
    #df = df[df['type'] ==str]
    return df

def perf_lda(model_parameters,train, test):
    
    def get_topic(e):
        if e>0:
            return 'agriculture'
        if e<0:
            return 'media'
    
    def true_positve(a,b):
        if a == b:
            return 1
        else:
            return 0
        
    model = model_lib.Models(**model_parameters)

    models = model.run_model_LDA(train.values.tolist())

    predict = pandas.DataFrame(model.reverse_lda(test.values.tolist(), models) , index =test.index)
    #display(predict)

    predict = predict.join(df['categorie'])
    predict[0] = predict[0]*100
    predict[1] = predict[1]*100
    predict['topic'] = predict[0]-predict[1]
    #display(predict)

    predict['predict_label'] = predict['topic'].map(get_topic)
    #display(predict)

    predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
    #display(predict)

    agri = predict[predict['categorie'] == 'agriculture']
    media = predict[predict['categorie'] == 'media']
    print(model_parameters)
    print('precision {}'.format(agri['classification_kpi'].sum()/len(agri)*100))
    model_parameters.update({'precision':agri['classification_kpi'].sum()/len(agri)*100, 'vocabulaire': models['LDA']['feature_names']})
    errors = agri[agri['classification_kpi'] == 0].sort_values('topic')
    collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)
    return model_parameters, agri, media


In [33]:
df = df_fmt(df)

text_corpus = df[(df['categorie'].isin(['agriculture','media'])) & (df['type'] ==str)]['collection_counter']
text_corpus_label = df[df['categorie'].isin(['agriculture','media']) & (df['type'] ==str)][['categorie','collection_counter']]


In [53]:
test,train = train_test_split(text_corpus_label, test_size = 0.2)

print(len(text_corpus), len(test), len(train))

display(train.groupby('categorie').count())

display(text_corpus_label.groupby('categorie').count())

train = train['collection_counter']
test = test['collection_counter']
model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 200, 'doc_topic_prior': 0.1}
_, agri, media = perf_lda(model_parameters, train, test)

787 629 158


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,135
media,23


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,695
media,92


Extracting tf features for LDA...
done in 0.032s.

Fitting LDA models with tf features, n_features=200...
done in 0.455s.

Topics in LDA model:
Topic #0: product market production make produce service create food farmer local farm agricultural develop agriculture vegetable young health rural order management
Topic #1: waste recycle farmer product water construction material irrigation collect farm create complete production technology model land fresh affordable fruit food

{'n_components': 2, 'n_top_words': 20, 'n_features': 200, 'doc_topic_prior': 0.1}
precision 98.75


In [54]:
agri

Unnamed: 0_level_0,0,1,categorie,topic,predict_label,classification_kpi
key_main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
http://www.agrobut.ml_ouattara_oumarsanaye,99.618260,0.381740,agriculture,99.236520,agriculture,1
http://www.linkedin.com/in/irénée-kamanzi-4b7b9813_irenee_kamanzi,98.610870,1.389130,agriculture,97.221740,agriculture,1
http://www.lespiedsdansleau49.fr_reveille_patricia,99.106985,0.893015,agriculture,98.213970,agriculture,1
http://swedco.nu_kiela_teji,98.610756,1.389244,agriculture,97.221512,agriculture,1
http://www.kgaisanofreshharvest.co.za_nyokong_lerato,98.386845,1.613155,agriculture,96.773691,agriculture,1
...,...,...,...,...,...,...
https://www.facebook.com/jojo.mensa/_koudjodji_yaomensa,99.618243,0.381757,agriculture,99.236486,agriculture,1
http://www.biofoodtechtz.com_sekenya_japhet,99.603101,0.396899,agriculture,99.206203,agriculture,1
https://www.facebook.com/Harena-Ecologique-115427253161322/?modal=admin_todo_tour_ravaviharinosy_andonantenainabakomampiononaandrea,99.295634,0.704366,agriculture,98.591268,agriculture,1
https://www.facebook.com/leonkingcoffee/_kpelly_koamielompaul,99.657459,0.342541,agriculture,99.314917,agriculture,1


In [55]:
media.groupby('predict_label').count()

Unnamed: 0_level_0,0,1,categorie,topic,classification_kpi
predict_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
agriculture,68,68,68,68,68


In [56]:
list_model_parameters_nfeatures = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : n_features, 'doc_topic_prior':alpha/10} for alpha in range(1,100,1) for n_features in range(500,len(),100)]

list_run = []
for model_parameters in list_model_parameters_nfeatures:
    list_run.append(perf_lda(model_parameters,train, test))
results = pandas.DataFrame(list_run).sort_values('precision', ascending = False)
results.sort_values('precision', ascending = False)

TypeError: len() takes exactly one argument (0 given)

In [31]:
results.sort_values('doc_topic_prior', ascending = True)

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
35,2,20,4000,0.1,99.277978,"[abandon, ability, abroad, absorb, academic, a..."
14,2,20,1900,0.1,99.277978,"[abandon, ability, abroad, absorb, academic, a..."
15,2,20,2000,0.1,99.277978,"[abandon, ability, abroad, absorb, academic, a..."
16,2,20,2100,0.1,99.277978,"[abandon, ability, abroad, absorb, academic, a..."
17,2,20,2200,0.1,99.277978,"[abandon, ability, abroad, absorb, academic, a..."
...,...,...,...,...,...,...
4438,2,20,3300,9.9,66.064982,"[abandon, ability, abroad, absorb, academic, a..."
4435,2,20,3000,9.9,66.064982,"[abandon, ability, abroad, absorb, academic, a..."
4450,2,20,4500,9.9,66.064982,"[abandon, ability, abroad, absorb, academic, a..."
4430,2,20,2500,9.9,66.064982,"[abandon, ability, abroad, absorb, academic, a..."


In [23]:
len(results.sort_values('n_features', ascending = False).head(1)['vocabulaire'].values.tolist()[0])

1594

In [27]:
test.values.tolist()

['insect source production purpose produce gain relieve protein quantity also amount within stockssusento combine usd live wild fly next fish expertise meet could egg compare pressure byproduct market net therefore advantage larva chain highquality current sustainable farming concept soldier per rise dry novel edible currenly agricultural industry constant animal nutrient rear soy greenhouse larvae annual consistent susento knowledge food formulation estimate practice emission deforestation gas fee requirement water totally compounded attention associate whole',
 'farm create market transportation tap farmer wennovation agritech city move directly bank collaboration marketwe achievement tackle much rural bottle access wastage system incubated bulk play recur remain neck believe ussd indirectly agricultural nightmare produce limitless channel accessible paris collaborate food hub community plug focused cropcash stakeholder mouves thats refer httpmouvesorg cropcashcomng platform',
 'nugi

In [26]:
test.keys()

Index(['https://www.susento.com_woods_michaeljosias',
       'http://crop2cash.com.ng_atanda_david',
       'http://nugitech.com_asikpo_anthonia',
       'http://www.snowballplus.com.ng_adediran_damola',
       'http://www.oilfishey.co.za_legoete_lemao',
       'http://www.aiconmedia.co.zw_ndlovu_nigel',
       'http://www.soilbiomuti.com_snyman_petrusjakobus',
       'http://www.footmokit.com_richard_mushusha',
       'https://pontso.wixsite.com/yellowbeasttech/_moletsane_pontsho',
       'https://www.publiseer.com_chidi_nwaogu',
       ...
       'http://www.topanigeria.com_oyeleke_bola',
       'https://nakondemillinglimited.websites.co.in_simaye_abiud',
       'http://www.pixcomm.co.za_buchanan_nic',
       'http://www.nseabasiphoto.com_akpan_nseabasi',
       'https://wa.me/221786018057_niang_abdaramani',
       'https://www.facebook.com/khaya.maloney_maloney_khaya',
       'http://www.ozebio.com_bakayoko_kadi',
       'http://www.JUROSARLU5.com_apezoukin_komlanmokpokpo',
       '