In [20]:
import chalenge1000
import model_lib
import collections

def vocab(element):
    if type(element) == str:
        return ' '.join(set(element.split(' ')))
    else:
        return ''

col_count = lambda x: ' '.join([a for a,b in collections.Counter(x.split()).most_common(10000)])
lemm = lambda x: model_lib.Lemm(x,file = 'stopwords_agri.txt').X


Chalenge = chalenge1000.Native()

df = Chalenge.X
df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
df['type'] = df['corpus'].map(type)
df['corpus'] = df['corpus'].map(vocab)
df['corpus_lemm'] = df['corpus'].map(lemm)
df['collection_counter'] = df['corpus_lemm'].map(col_count)
df = df[df['type'] ==str]

In [21]:
agriculture = df[df['categorie'] == 'agriculture']
others = df[df['categorie'].isin([ 'media'])]

corpus_agri = agriculture['collection_counter'].agg(sum, axis = 0)
#corpus_agri = model_lib.Lemm(corpus_agri,file = 'stopwords_agri.txt').X
corpus_others = others['collection_counter'].agg(sum, axis = 0)
#corpus_others = model_lib.Lemm(corpus_others,file = 'stopwords_agri.txt').X

counter_agri = collections.Counter(corpus_agri.split()).most_common(5000)
counter_others = collections.Counter(corpus_others.split()).most_common(5000)

counter_agri = list({(a,b) for a,b in counter_agri}.union({(k,0) for k,l in counter_others if k not in [a for a,b in counter_agri]}))
counter_others = list({(a,b) for a,b in counter_others}.union({(k,0) for k,l in counter_agri if k not in [a for a,b in counter_others]}))
counter_agri = {(a, b*100/len(agriculture)) for a,b in counter_agri}
counter_others = {(a, b*100/len(others)) for a,b in counter_others}

import operator

id = operator.itemgetter(0)  # To get id field.

idinfo = {id(rec): rec[1:] for rec in counter_agri}  # Dict for fast look-ups.

merged = [info + idinfo[id(info)] for info in counter_others if id(info) in idinfo]

merged.sort(key=lambda x: x[2], reverse = True)

map_vocab = [(a,c,b, c-b) for a,b,c in merged]
map_vocab.sort(key=lambda x: x[-1], reverse = True)

import pandas

keyword_map = pandas.DataFrame(map_vocab, columns = ['keyword','agriculture', 'others', 'difference']).sort_values('difference')

keyword_map.describe(include= 'all')

keyword_map.sort_values('difference', ascending = False)

subset_1 = keyword_map[(keyword_map['difference'] > 15) |  (keyword_map['difference'] < -15) ]#|  (keyword_map['agriculture'] > 15)]

subset_1.sort_values('difference', ascending = False)

import numpy
percentile = lambda x: numpy.percentile(x,[50, 66, 90,95])
print(percentile(keyword_map[keyword_map['agriculture'] > 0]['agriculture'].values.tolist()))
print(percentile(keyword_map[keyword_map['difference'] > 0]['agriculture'].values.tolist()))
print(percentile(keyword_map[keyword_map['difference'] < 0]['agriculture'].values.tolist()))

numpy.mean(keyword_map[keyword_map['agriculture'] > 0]['agriculture'].values.tolist())

[0.43165468 0.86330935 2.87769784 4.89208633]
[0.43165468 0.71942446 2.58992806 4.60431655]
[0.         0.43165468 1.72661871 3.16546763]


1.277294964028777

# Entrainement

In [22]:
def get_topic(e):
    if e >0:
        return 'agriculture'
    if e<0:
        return 'media'
def true_positve(a,b):
    if a == b:
        return 1
    else:
        return 0
    
    
n_components = 2
n_top_words = 20
n_features = 20000
text_corpus = df[df['categorie'].isin(['agriculture','media'])]['collection_counter']
model = model_lib.Models(n_components = n_components, n_top_words =n_top_words, n_features = n_features)
models = model.run_model_NMF(text_corpus.values.tolist())

predict = pandas.DataFrame(model.reverse_nmf(text_corpus.values.tolist(), models) , index =text_corpus.index)
predict = predict.join(df['categorie'])
predict[0] = predict[0]*100
predict[1] = predict[1]*100
predict['topic'] = predict[0]-predict[1]
predict['predict_label'] = predict['topic'].map(get_topic)
predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)

done in 0.000s.
Extracting tf-idf features for NMF...
done in 0.084s.
Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2000 and n_features=20000...
done in 0.045s.
Topic #0: product agricultural market production produce food farmer farm make create agriculture local service organic rural sustainable train vegetable order develop
Topic #1: digital communication advertising agency graphic service event design video web strategy public information platform tv creation magazine documentary management print



In [23]:
agri = predict[predict['categorie'] =='agriculture']
print('precision {}'.format(agri['classification_kpi'].sum()/695*100))

errors = agri[agri['classification_kpi'] == 0].sort_values('topic')

collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)

precision 98.56115107913669


[('communication', 4),
 ('agricultural', 3),
 ('digital', 3),
 ('service', 3),
 ('study', 2),
 ('product', 2),
 ('promote', 2),
 ('marketing', 2),
 ('consult', 2),
 ('link', 2)]

In [24]:
template = errors.join(chalenge1000.Native().text['temp'])['temp'].values.tolist()
with open('misclassified_agri.txt', 'w') as file:
    for line in template:
        file.writelines(line)

# catégorie abscente

In [25]:
label = pandas.read_csv('datastore.csv').set_index('key_main')
label['corpus'] = label[label['_agriculture_']][['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
label['type'] = label['corpus'].map(type)
label['corpus'] = label['corpus'].map(vocab)
label['corpus_lemm'] = label['corpus'].map(lemm)
label['collection_counter'] = label['corpus_lemm'].map(col_count)
label = label[label['type'] ==str]

n_components = 2
n_top_words = 20
n_features = 20000
text_corpus = label['collection_counter']

predict = pandas.DataFrame(model.reverse_nmf(text_corpus.values.tolist(), models) , index =text_corpus.index)
predict = predict.join(df['categorie'])
predict[0] = predict[0]*100
predict[1] = predict[1]*100
predict['topic'] = predict[0]-predict[1]
predict['predict_label'] = predict['topic'].map(get_topic)
predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
predict.sort_values(0, ascending = False)[[0,1,'predict_label']]

Unnamed: 0_level_0,0,1,predict_label
key_main,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
http://www.aquareignindustry.com_mbouleediango_marcidriss,6.532205,0.0,agriculture
https://web.facebook.com/faycal.aguemon_aguemon_shegun,5.754666,0.0,agriculture
https://www.2mi-sa.com/_djermakoye_reki,3.6553,0.0,agriculture
http://www.congofresh.com_enge_albert,3.463754,0.0,agriculture
https://www.youtube.com/results?search_query=gue+douade+marius+biographie_gue_douademarius,2.924626,0.0,agriculture
http://www.leader-manioc-96.websel.net/site%20map.xml_adognon_alidaodilekayissan,2.779411,0.0,agriculture
https://4piment.blogspot.com/_sinare_idrissa,2.149734,0.0,agriculture
http://YamAgroIndustrie.bf_yameogo_stevearmandleonardwendkouni,2.094783,0.0,agriculture
https://www.le-lionceau.com/_samba_siny,2.081692,6.926064,media
https://www.trebalt.com/_bah_anais,2.051501,3.412627,media


# Startup non inscrits en agriculture

In [26]:
df = Chalenge.X
df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
df['type'] = df['corpus'].map(type)
df['corpus'] = df['corpus'].map(vocab)
df['corpus_lemm'] = df['corpus'].map(lemm)
df['collection_counter'] = df['corpus_lemm'].map(col_count)
df = df[df['type'] ==str]

n_components = 2
n_top_words = 20
n_features = 20000
text_corpus = df[df['categorie']!= 'agriculture']['collection_counter']

predict = pandas.DataFrame(model.reverse_nmf(text_corpus.values.tolist(), models) , index =text_corpus.index)
predict = predict.join(df['categorie'])
predict[0] = predict[0]*100
predict[1] = predict[1]*100
predict['topic'] = predict[0]-predict[1]
predict['predict_label'] = predict['topic'].map(get_topic)
predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
#predict.sort_values(0, ascending = False)[[0,1,'predict_label']]

In [27]:
#predict.sort_values(0, ascending = False).describe()

In [28]:
#predict[predict['topic'] > 10].sort_values(0, ascending = False).groupby(['categorie','predict_label']).nunique()#.describe(include ='all')

In [29]:
predict[predict['topic']>10].join(chalenge1000.Native().text['temp'])[['categorie','predict_label']]

Unnamed: 0_level_0,categorie,predict_label
key_main,Unnamed: 1_level_1,Unnamed: 2_level_1
http://www.sukuchai.com_kuchengo_suleiman,autres,agriculture
https://www.facebook.com/Community-Action-for-Agriculture-and-Sustainable-Development-466865864068319/_noellifogha_ndze,education,agriculture
http://www.cleanprogb.com%20en%20contruction_etimbibang_confort,finance,agriculture
https://afrikapu.com/_mohamed_nasreenali,ecommerce,agriculture
https://www.facebook.com/TESSA-POWER-SARL-1540172142980252/?ref=bookmarks_ousmane_mahamanlaouali,energie,agriculture
...,...,...
https://solarkoodo.com/_nana_safiatou,energie,agriculture
http://www.agrolimgroceriesuganda.com/_denis_muhereza,ecommerce,agriculture
http://www.afrimart.com.ng/_ojeniyi_olajide,ecommerce,agriculture
http://www.me.bf/newkieta_zabsonre_abdoulaziz,mobilite,agriculture


In [31]:
a = predict[predict['topic']>10].join(chalenge1000.Native().text['temp'])['temp'].values.tolist()
print(a[0])

Nom du répondant : Kuchengo 
Prénom du répondant : Suleiman 
email : suleykuchengo@yahoo.com 
 
Nom de la structure : SUKU CHAI 
Pays d'origine : Tanzania 
Pays d'activité : Tanzania 
Site internet : http://www.sukuchai.com 
Linkedin : https://www.linkedin.com/in/suleiman-kuchengo-05a370123/?originalSubdomain=tz 
 
Présentation de la structure 
Our main goal is to improve lives in remote, impoverished and neglected rural areas by creating improved access to productive resources as well as meaningful jobs to single mothers, widows, people with disabilities and young girls through tea farming and value addition.
Our approach is based on building an inclusive economy as we are challenging and disrupting the status quo by changing the way marginalized groups and excluded communities were left in the stake of the economy by giving them the necessary skills, tools, and resources to own the economy.
What we do is to empower those marginalized and neglected groups through training and incentiv