In [1]:
import chalenge1000
import model_lib
import collections
from IPython.display import display, HTML
import pandas

Chalenge = chalenge1000.Native()
df = Chalenge.X

In [2]:
def vocab(element):
    if type(element) == str:
        return ' '.join(set(element.split(' ')))
    else:
        return ''

def _agriculture__(element):
    if element in agriculture:
        return True
    else:
        return False
    
col_count = lambda x: ' '.join([a for a,b in collections.Counter(x.split()).most_common(10000)])
lemm = lambda x: model_lib.Lemm(x,file = 'stopwords_agri.txt').X

def df_fmt(df):
    df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
    df['type'] = df['corpus'].map(type)
    df['corpus'] = df['corpus'].map(vocab)
    df['corpus_lemm'] = df['corpus'].map(lemm)
    df['collection_counter'] = df['corpus_lemm'].map(col_count)
    agr = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'agr' in e}
    alim = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'alim' in e}
    agriculture = alim.union(agr)
    #df = df[df['type'] ==str]
    return df

In [4]:
df = df_fmt(df)

text_corpus = df[(df['categorie'].isin(['agriculture','media'])) & (df['type'] ==str)]['collection_counter']
text_corpus_label = df[df['categorie'].isin(['agriculture','media']) & (df['type'] ==str)][['categorie','collection_counter']]

In [5]:

#model = model_lib.Models(n_components = n_components, n_top_words =n_top_words, n_features = n_features)
def run_model_lda(model_parameters,text_corpus):
    def get_topic(e):
        if e >0:
            return 'agriculture'
        if e<0:
            return 'media'
    
    def true_positve(a,b):
        if a == b:
            return 1
        else:
            return 0
    model = model_lib.Models(**model_parameters)

    models = model.run_model_LDA(text_corpus.values.tolist())

    predict = pandas.DataFrame(model.reverse_lda(text_corpus.values.tolist(), models) , index =text_corpus.index)
    #display(predict)

    predict = predict.join(df['categorie'])
    predict[0] = predict[0]*100
    predict[1] = predict[1]*100
    predict['topic'] = predict[0]-predict[1]
    #display(predict)

    predict['predict_label'] = predict['topic'].map(get_topic)
    #display(predict)

    predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
    #display(predict)

    agri = predict[predict['categorie'] == 'agriculture']
    print(model_parameters)
    print('precision {}'.format(agri['classification_kpi'].sum()/len(agri)*100))
    model_parameters.update({'precision':agri['classification_kpi'].sum()/len(agri)*100, 'vocabulaire': models['LDA']['feature_names']})
    errors = agri[agri['classification_kpi'] == 0].sort_values('topic')
    collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)
    return model_parameters

In [6]:
list_model_parameters_prior = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000, 'doc_topic_prior':alpha} for alpha in range(1,100,1)]

list_run = []
for model_parameters in list_model_parameters_prior:
    list_run.append(run_model_lda(model_parameters,text_corpus))
result_priors = pandas.DataFrame(list_run).sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.143s.

Fitting LDA models with tf features, n_features=20000...
done in 1.579s.

Topics in LDA model:
Topic #0: product market production make create agricultural produce food service farmer local farm agriculture young woman sustainable rural develop management order
Topic #1: zou product breed en farmer promoter market fish régional creation feed entrepreneures agricoles recreation conseil production collines ponds tilapia presentation

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000, 'doc_topic_prior': 1}
precision 97.41007194244604
Extracting tf features for LDA...
done in 0.071s.

Fitting LDA models with tf features, n_features=20000...
done in 1.848s.

Topics in LDA model:
Topic #0: product market make production agricultural create produce food service farmer local agriculture farm young sustainable rural develop management woman order
Topic #1: product production market oil fruit create palm produce food agricultural juice 

In [7]:
model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000, 'doc_topic_prior':None}
run_model_lda(model_parameters,text_corpus)

Extracting tf features for LDA...
done in 0.098s.

Fitting LDA models with tf features, n_features=20000...
done in 1.300s.

Topics in LDA model:
Topic #0: product market production make create agricultural produce food farmer service local farm agriculture young woman sustainable rural develop management order
Topic #1: régional ponds recreation entrepreneures agricoles conseil zou collines product tel breed email une pond president promoter council farmer en tilapia

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000, 'doc_topic_prior': None}
precision 98.41726618705036


{'n_components': 2,
 'n_top_words': 20,
 'n_features': 20000,
 'doc_topic_prior': None,
 'precision': 98.41726618705036,
 'vocabulaire': ['ab',
  'abandon',
  'abc',
  'abeokuta',
  'ability',
  'abomeycalavi',
  'abound',
  'abroad',
  'absence',
  'absorb',
  'abuja',
  'abundance',
  'abundant',
  'abuse',
  'ac',
  'academic',
  'academy',
  'accelerate',
  'acceleration',
  'accelerator',
  'accept',
  'access',
  'accessibility',
  'accessible',
  'accessory',
  'accident',
  'accompanies',
  'accompaniment',
  'accompany',
  'accompanying',
  'accordance',
  'account',
  'accountant',
  'accounting',
  'accurate',
  'achievable',
  'achievement',
  'acid',
  'acidic',
  'acidity',
  'acknowledge',
  'acquire',
  'acquisition',
  'acre',
  'acronym',
  'act',
  'active',
  'actively',
  'actor',
  'actual',
  'actually',
  'ad',
  'adapt',
  'adaptable',
  'adaptation',
  'adapted',
  'adaptive',
  'adapts',
  'additional',
  'additionally',
  'additive',
  'additives',
  'addres

In [8]:
list_model_parameters_nfeatures = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : n_features, 'doc_topic_prior':None} for n_features in range(500,5000,100)]

list_run = []
for model_parameters in list_model_parameters_nfeatures:
    list_run.append(run_model_lda(model_parameters,text_corpus))
results_n_features = pandas.DataFrame(list_run).sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.087s.

Fitting LDA models with tf features, n_features=500...
done in 2.504s.

Topics in LDA model:
Topic #0: service agricultural make market create farmer management product agriculture platform young develop community local field access technology rural production farm
Topic #1: product production produce food market make create agricultural woman local farm fruit farmer organic vegetable natural plant agriculture oil sustainable

{'n_components': 2, 'n_top_words': 20, 'n_features': 500, 'doc_topic_prior': None}
precision 39.13669064748201
Extracting tf features for LDA...
done in 0.078s.

Fitting LDA models with tf features, n_features=600...
done in 2.324s.

Topics in LDA model:
Topic #0: service farmer platform market agricultural make create community access management rural technology social develop product agriculture information digital young farm
Topic #1: product production market produce food make create agricultural local woman 

In [9]:
result_priors.sort_values('precision', ascending = False)

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
0,2,20,20000,1,97.410072,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
1,2,20,20000,2,87.050360,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
2,2,20,20000,3,71.223022,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
3,2,20,20000,4,59.280576,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
4,2,20,20000,5,52.949640,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
...,...,...,...,...,...,...
24,2,20,20000,25,39.856115,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
20,2,20,20000,21,39.856115,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
21,2,20,20000,22,39.712230,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
22,2,20,20000,23,39.712230,"[ab, abandon, abc, abeokuta, ability, abomeyca..."


In [10]:
results_n_features.sort_values('precision', ascending = False)

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
25,2,20,3000,,98.848921,"[abandon, abeokuta, ability, abroad, absorb, a..."
16,2,20,2100,,98.705036,"[abandon, ability, abroad, academic, academy, ..."
44,2,20,4900,,98.417266,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
37,2,20,4200,,98.417266,"[ab, abandon, abc, abeokuta, ability, abroad, ..."
43,2,20,4800,,98.417266,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
42,2,20,4700,,98.417266,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
41,2,20,4600,,98.417266,"[ab, abandon, abc, abeokuta, ability, abomeyca..."
9,2,20,1400,,98.417266,"[ability, abroad, access, accessibility, acces..."
21,2,20,2600,,97.985612,"[abandon, ability, abroad, abundance, abundant..."
33,2,20,3800,,97.841727,"[ab, abandon, abeokuta, ability, abroad, absor..."


In [39]:
def perf_lda(model_parameters,train, test):
    def get_topic(e):
        if e >0:
            return 'agriculture'
        if e<0:
            return 'media'
    
    def true_positve(a,b):
        if a == b:
            return 1
        else:
            return 0
    model = model_lib.Models(**model_parameters)

    models = model.run_model_LDA(train.values.tolist())

    predict = pandas.DataFrame(model.reverse_lda(test.values.tolist(), models) , index =test.index)
    #display(predict)

    predict = predict.join(df['categorie'])
    predict[0] = predict[0]*100
    predict[1] = predict[1]*100
    predict['topic'] = predict[0]-predict[1]
    #display(predict)

    predict['predict_label'] = predict['topic'].map(get_topic)
    #display(predict)

    predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
    #display(predict)

    agri = predict[predict['categorie'] == 'agriculture']
    print(model_parameters)
    print('precision {}'.format(agri['classification_kpi'].sum()/len(agri)*100))
    model_parameters.update({'precision':agri['classification_kpi'].sum()/len(agri)*100, 'vocabulaire': models['LDA']['feature_names']})
    errors = agri[agri['classification_kpi'] == 0].sort_values('topic')
    collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)
    return model_parameters

In [55]:
from sklearn.model_selection import train_test_split

test,train = train_test_split(text_corpus_label, test_size = 0.2)

print(len(text_corpus), len(test), len(train))

display(train.groupby('categorie').count())

display(text_corpus_label.groupby('categorie').count())

train = train['collection_counter']
test = test['collection_counter']
model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000}
_ = perf_lda(model_parameters, train, test)

787 629 158


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,143
media,15


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,695
media,92


Extracting tf features for LDA...
done in 0.068s.

Fitting LDA models with tf features, n_features=20000...
done in 1.031s.

Topics in LDA model:
Topic #0: product market production make create food agricultural service produce young farm farmer agriculture social develop local management rural job promote
Topic #1: farmer product food make farm service produce production directly platform order create waste packaging feed access web market advice application

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000}
precision 97.64492753623189


In [57]:
list_model_parameters_prior = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000, 'doc_topic_prior':alpha} for alpha in range(1,100,1)]

list_run = []
for model_parameters in list_model_parameters_prior:
    list_run.append(perf_lda(model_parameters, train, test))
result_priors = pandas.DataFrame(list_run).sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.045s.

Fitting LDA models with tf features, n_features=20000...
done in 0.976s.

Topics in LDA model:
Topic #0: product market production make create food agricultural service produce young farm agriculture social farmer develop local management rural community promote
Topic #1: farmer product food farm make service produce production create order market platform mobile directly access application web agricultural waste agriculture

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000, 'doc_topic_prior': 1}
precision 95.47101449275362
Extracting tf features for LDA...
done in 0.016s.

Fitting LDA models with tf features, n_features=20000...
done in 0.631s.

Topics in LDA model:
Topic #0: product market make production create food agricultural service young produce social agriculture farm develop management local job rural community promote
Topic #1: product farmer food service farm make production produce market create agricultural orde

In [58]:
list_model_parameters_nfeatures = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : n_features, 'doc_topic_prior':None} for n_features in range(500,5000,100)]

list_run = []
for model_parameters in list_model_parameters_nfeatures:
    list_run.append(perf_lda(model_parameters,train, test))
results_n_features = pandas.DataFrame(list_run).sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.136s.

Fitting LDA models with tf features, n_features=500...
done in 0.596s.

Topics in LDA model:
Topic #0: make product food agriculture farm market create association produce require production farmer training fruit field agricultural sustainable chicken powder young
Topic #1: product market make food production create service agricultural farmer produce farm agriculture young local develop rural management social order promote

{'n_components': 2, 'n_top_words': 20, 'n_features': 500, 'doc_topic_prior': None}
precision 0.18115942028985507
Extracting tf features for LDA...
done in 0.015s.

Fitting LDA models with tf features, n_features=600...
done in 0.526s.

Topics in LDA model:
Topic #0: create market service digital design creative establish organisation online innovation computer respect event field exchange access individual maintain make develop
Topic #1: product make food market production create farmer agricultural service produc

In [None]:
list_model_parameters_nfeatures = [{'n_components' : 2, 'n_top_words' : 20, 'n_features' : n_features, 'doc_topic_prior':alpha/10} for alpha in range(1,100,1) for n_features in range(500,5000,100)]

list_run = []
for model_parameters in list_model_parameters_nfeatures:
    list_run.append(perf_lda(model_parameters,train, test))
results = pandas.DataFrame(list_run).sort_values('precision', ascending = False)
results.sort_values('precision', ascending = False)

Extracting tf features for LDA...
done in 0.054s.

Fitting LDA models with tf features, n_features=500...
done in 0.414s.

Topics in LDA model:
Topic #0: farm product make food agriculture market association create chicken powder health service machine poultry restaurant produce require capital farmer production
Topic #1: product market make food production create service agricultural farmer produce farm agriculture young local develop rural management social order promote

{'n_components': 2, 'n_top_words': 20, 'n_features': 500, 'doc_topic_prior': 0.1}
precision 0.18115942028985507
Extracting tf features for LDA...
done in 0.016s.

Fitting LDA models with tf features, n_features=600...
done in 0.302s.

Topics in LDA model:
Topic #0: make develop access exchange platform mobile app share organisation online search network contact connect list professional external continue feature direct
Topic #1: product market make food production create service agricultural farmer produce farm agri

In [65]:
results.sort_values('doc_topic_prior')

Unnamed: 0,n_components,n_top_words,n_features,doc_topic_prior,precision,vocabulaire
12,2,20,1700,0.1,97.644928,"[ability, abroad, absorb, academic, access, ac..."
6,2,20,1100,0.1,0.543478,"[ability, abroad, absorb, access, accessibilit..."
5,2,20,1000,0.1,0.181159,"[ability, abroad, absorb, access, accessibilit..."
34,2,20,3900,0.1,97.644928,"[ability, abroad, absorb, academic, access, ac..."
18,2,20,2300,0.1,97.644928,"[ability, abroad, absorb, academic, access, ac..."
...,...,...,...,...,...,...
4436,2,20,3100,9.9,56.702899,"[ability, abroad, absorb, academic, access, ac..."
4435,2,20,3000,9.9,56.702899,"[ability, abroad, absorb, academic, access, ac..."
4446,2,20,4100,9.9,56.702899,"[ability, abroad, absorb, academic, access, ac..."
4423,2,20,1800,9.9,56.702899,"[ability, abroad, absorb, academic, access, ac..."


In [151]:
challenge =chalenge1000.Native().ALL
header = challenge.keys()#[['pays_struc1', 'pays_struc2']]
def reduce_dim_pays(element):
    return sorted(list(set(element.split('-----'))),key = lambda x: len(x), reverse = True)
challenge['pays'] = challenge['pays_struc1'].map(reduce_dim_pays)
challenge['pays2'] = challenge['pays_struc2'].map(reduce_dim_pays)
country = challenge[['pays','pays2']]

Index(['nom_struc', 'categorie', 'cat_struc', 'cat_autre_struc', 'age_pers',
       'nbr_salarie', 'ca_2017', 'ca_2018', 'ca_2019', 'pays_struc1',
       'date_struc', 'prix_struc', 'linkedin_struc', 'email_pers'],
      dtype='object')

In [38]:
full_corpus = ' '.join(df[df['categorie'] == 'agriculture'][['nom_struc', 'categorie']].join(Chalenge.descriptions_trad)[['prez_struc', 'prez_produit_struc']].fillna('').agg(sum, axis=1).values.tolist())

full_corpus = model_lib.Lemm(X = full_corpus, file = 'stopwords_agri.txt').X

collections.Counter(full_corpus.split()).most_common(5000)

In [32]:
import chalenge1000
import pandas
from rake_nltk import Rake
import model_lib
C = chalenge1000.Native()
h = C.descriptions_trad
k = C.X
k = k[k['categorie'] == 'agriculture']
M = k.join(h)
M['description'] = M[['prez_struc', 'prez_produit_struc']].agg(sum, axis = 1)

In [58]:
import random
i = random.choice(range(0,698))

r= Rake()
t= M['description'].values.tolist()[i]
print(M['nom_struc'].values.tolist()[i])
print(t)
r.extract_keywords_from_text(t)
print(r.get_ranked_phrases_with_scores()[:5])
t_1 = model_lib.Lemm(X = t).H
r= Rake()
r.extract_keywords_from_text(t_1)
print(r.get_ranked_phrases_with_scores()[:5])



AGROPASE-CI
The FERME D'ICI located in Sassandra in the south west of Ivory Coast is a project of the AGROPASE-CI structure. Created in 2018, the farm currently produces broilers and hybrids; it also plans to produce chickens of local breeds called bicycles. We regularly produce maize on an area of two hectares to feed the subjects and reduce our production load, which allows us to offer competitive products.With a view to contributing to the achievement of food security that will allow all people to have, at any time, the physical, social and economic opportunity to obtain sufficient, healthy and nutritious food in the region of Sassandra, Cote d'Ivoire and on our continent;our poultry project called "LA FERME D'ICI" buys by bands, chicks of meat breeds and hybrids which are treated and fed on the farm by a permanent poultry farmer using corn from our plantation combined with concentrated feed purchased with IVOGRAIN (industrialist manufacturer of cattle feed).Thanks to this strategy 

In [34]:
r.extract_keywords_from_text(t)
print(r.get_ranked_phrases_with_scores())
t_1 = model_lib.Lemm(X = t).H
r= Rake()
r.extract_keywords_from_text(t_1)
print(r.get_ranked_phrases_with_scores())

[(44.0, 'small family owned agricultural export company aim'), (34.5, 'provide value added supply chain services'), (11.0, 'small scale farmers'), (10.5, 'make exportation services'), (10.0, 'food agricultural products'), (9.0, 'packaged livestock products'), (5.5, 'agricultural produce'), (4.0, 'snail meat'), (4.0, 'rural areas'), (4.0, 'imperishable cash'), (4.0, 'fresh tomatoes'), (3.5, 'well perserved'), (1.5, 'well'), (1.5, 'produce'), (1.0, 'world'), (1.0, 'selling'), (1.0, 'purchase'), (1.0, 'processed'), (1.0, 'perishable'), (1.0, 'nationally'), (1.0, 'international'), (1.0, 'farm'), (1.0, 'facilitating'), (1.0, 'connecting'), (1.0, 'coffee'), (1.0, 'cocoa'), (1.0, 'cameroon'), (1.0, 'buying'), (1.0, 'basically'), (1.0, 'bananas'), (1.0, 'atabong')]
[(44.66666666666667, 'small family owned agricultural export company aim'), (34.5, 'provide value added supply chain services'), (25.0, 'cocoa coffee bananas fresh tomatoes'), (23.666666666666668, 'imperishable cash food agricultura

In [35]:
def rake_description(e):
    try:
        r= Rake()
        r.extract_keywords_from_text(e)
        return r.get_ranked_phrases_with_scores()
    except:
        return ''

In [37]:
M['rake'] = M['description'].map(rake_description)

l = M['rake'].values.tolist()

flat_list = [item for sublist in l for item in sublist]

flat_list = sorted(flat_list)[::-1]
import collections
collections.Counter([b for a,b in flat_list]).most_common(100)

In [57]:
M.keys()

Index(['nom_struc', 'categorie', 'cat_struc', 'cat_autre_struc', 'age_pers',
       'nbr_salarie', 'ca_2017', 'ca_2018', 'ca_2019', 'pays_struc1',
       'date_struc', 'prix_struc', 'linkedin_struc', 'email_pers',
       'prez_struc', 'prez_produit_struc', 'prez_marche_struc',
       'prez_zone_struc', 'prez_objectif_struc', 'prez_innovante_struc',
       'prez_duplicable_struc', 'prez_durable_struc', 'description', 'rake'],
      dtype='object')

In [59]:
from sklearn.metrics import homogeneity_completeness_v_measure, f1_score
homogeneity_completeness_v_measure(x,y)