In [1]:
from sklearn.model_selection import train_test_split
import chalenge1000
import model_lib
import collections
from IPython.display import display, HTML
import pandas


Chalenge = chalenge1000.Native()
df = Chalenge.X

In [2]:
def vocab(element):
    if type(element) == str:
        return ' '.join(set(element.split(' ')))
    else:
        return ''

def _agriculture__(element):
    if element in agriculture:
        return True
    else:
        return False
    
col_count = lambda x: ' '.join([a for a,b in collections.Counter(x.split()).most_common(10000)])

lemm = lambda x: model_lib.Lemm(x,file = 'stopwords_agri.txt').X

def df_fmt(df):
    df['corpus'] = df.join(Chalenge.descriptions_trad)[['prez_struc','prez_produit_struc']].agg(sum, axis = 1)
    df['type'] = df['corpus'].map(type)
    df['corpus'] = df['corpus'].map(vocab)
    df['corpus_lemm'] = df['corpus'].map(lemm)
    df['collection_counter'] = df['corpus_lemm'].map(col_count)
    agr = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'agr' in e}
    alim = {e for e in df['cat_autre_struc'].dropna().values.tolist() if 'alim' in e}
    agriculture = alim.union(agr)
    #df = df[df['type'] ==str]
    return df

def perf_lda(model_parameters,train, test):
    
    def get_topic(e):
        if e >0:
            return 'agriculture'
        if e<0:
            return 'media'
    
    def true_positve(a,b):
        if a == b:
            return 1
        else:
            return 0
        
    model = model_lib.Models(**model_parameters)

    models = model.run_model_LDA(train.values.tolist())

    predict = pandas.DataFrame(model.reverse_lda(test.values.tolist(), models) , index =test.index)
    #display(predict)

    predict = predict.join(df['categorie'])
    predict[0] = predict[0]*100
    predict[1] = predict[1]*100
    predict['topic'] = predict[0]-predict[1]
    #display(predict)

    predict['predict_label'] = predict['topic'].map(get_topic)
    #display(predict)

    predict['classification_kpi'] = predict.apply(lambda x: true_positve(x['categorie'],x['predict_label']), axis= 1)
    #display(predict)

    agri = predict[predict['categorie'] == 'agriculture']
    print(model_parameters)
    print('precision {}'.format(agri['classification_kpi'].sum()/len(agri)*100))
    model_parameters.update({'precision':agri['classification_kpi'].sum()/len(agri)*100, 'vocabulaire': models['LDA']['feature_names']})
    errors = agri[agri['classification_kpi'] == 0].sort_values('topic')
    collections.Counter(' '.join(errors.join(text_corpus)['collection_counter'].values.tolist()).split()).most_common(10)
    return model_parameters


In [3]:
df = df_fmt(df)

text_corpus = df[(df['categorie'].isin(['agriculture','media'])) & (df['type'] ==str)]['collection_counter']
text_corpus_label = df[df['categorie'].isin(['agriculture','media']) & (df['type'] ==str)][['categorie','collection_counter']]


In [4]:
test,train = train_test_split(text_corpus_label, test_size = 0.2)

print(len(text_corpus), len(test), len(train))

display(train.groupby('categorie').count())

display(text_corpus_label.groupby('categorie').count())

train = train['collection_counter']
test = test['collection_counter']
model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000}
_ = perf_lda(model_parameters, train, test)

787 629 158


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,139
media,19


Unnamed: 0_level_0,collection_counter
categorie,Unnamed: 1_level_1
agriculture,695
media,92


Extracting tf features for LDA...
done in 0.022s.

Fitting LDA models with tf features, n_features=20000...
done in 0.771s.

Topics in LDA model:
Topic #0: market production make product produce create farm agricultural local food farmer rural plant young agriculture order develop service technology harvest
Topic #1: product service make local create sustainable management community agricultural social organization food digital train agriculture farmer produce market environment communication

{'n_components': 2, 'n_top_words': 20, 'n_features': 20000}
precision 71.22302158273382


In [5]:
len(set(train.agg(sum, axis = 0).split()))

4345

In [6]:
test,train = train_test_split(text_corpus_label, test_size = 0.2)
train = train['collection_counter']

model_parameters = {'n_components' : 2, 'n_top_words' : 20, 'n_features' : 20000, 'doc_topic_prior':0.1}

model = model_lib.Models(**model_parameters)

In [7]:
m = model.run_model_LDA(train.values.tolist())
m['LDA']['components_kpi']

Extracting tf features for LDA...
done in 0.035s.

Fitting LDA models with tf features, n_features=20000...
done in 0.500s.

Topics in LDA model:
Topic #0: product production make market food produce create agricultural farm farmer local agriculture develop young sustainable order service plant community technology
Topic #1: service field local market production communication food digital choice advertiser propose advertising computer try create access distribution crop association individual



{'abandon': (1.3648474691663217, 0.7897359943915137),
 'ability': (2.857900814621704, 0.783639821166422),
 'abroad': (2.6482845252977327, 0.7803019860423869),
 'absorb': (2.1970652000993516, 0.7412515880470447),
 'abundant': (2.2812672959323845, 0.8059342178274288),
 'accept': (1.9346840022212106, 1.1452803468755273),
 'access': (11.975123278595943, 2.5380793402138906),
 'accessibility': (2.214884471943666, 0.8835382051168208),
 'accessible': (5.934785976449814, 1.0835024732396825),
 'accompaniment': (2.290401769780858, 0.7937583545349796),
 'accompany': (5.690431130367072, 2.3979044617526144),
 'accordance': (2.1922000412201808, 0.8295138509606264),
 'account': (3.1221652822050303, 2.4681133741462498),
 'accountant': (2.8534431472770274, 0.7613272644597138),
 'accurate': (1.4223163490855066, 0.7665203118179942),
 'achievement': (5.175979375611329, 0.8642668082108347),
 'acid': (1.375794667865054, 0.8313085877079021),
 'acknowledge': (1.392405357194874, 0.7537088751716251),
 'acquire':

In [61]:
m['LDA']['model'].components_.shape

(2, 1491)

In [62]:
m['LDA']['model'].components_[0].argsort()

array([ 273, 1382,  708, ...,  831,  802, 1068])

In [63]:
len(m['LDA']['feature_names'])

1491

In [64]:
m['LDA']['feature_names'][1061]

'private'

In [71]:
m['LDA']['model'].components_[0]

array([1.13438378, 0.80462239, 2.7383608 , ..., 0.90788585, 5.03423667,
       0.85673958])

In [66]:
min(m['LDA']['model'].components_[0])

0.6366892855292235

In [72]:
#[(m['LDA']['feature_names'][i],k[0],k[1]) for i,k in enumerate( [(a,b) for a, b in zip(m['LDA']['model'].components_)])]

In [79]:
components = {m['LDA']['feature_names'][i]:a for i,a in enumerate(zip(*m['LDA']['model'].components_))}

In [80]:
components['agriculture']

(2.1092081709677477, 14.386921219600769)