In [1]:
%matplotlib inline
import pickle
import time
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## 1.Treinamento

### 1.0 Funções de Teste

In [5]:
def data_sep(portfolio, size = 0.3):
    portfolio_train, portfolio_test = train_test_split(portfolio, test_size=size, random_state=69420)
    return portfolio_train, portfolio_test

In [6]:
def check(recommend,test):
    return test.isin(recommend).sum()/test.shape[0]

### 1.1 MiniBatch KMeans

In [7]:
df = pd.read_csv('1.pre-processamento/df_preprocessed.csv')
df_p1 = pd.read_csv('1.pre-processamento/portfolio1_preprocessed.csv')
df_p2 = pd.read_csv('1.pre-processamento/portfolio2_preprocessed.csv')
df_p3 = pd.read_csv('1.pre-processamento/portfolio3_preprocessed.csv')

In [8]:
with open("1.pre-processamento/pca.pickle",'rb') as f:
    pca_ft = pickle.load(f)

In [9]:
start = time.time()
kmeans= MiniBatchKMeans(n_clusters =500, random_state=69420, batch_size = 50000).fit(pca_ft)
end = time.time()
print(f'Tempo de treino: {end - start}')

Tempo de treino: 42.299774169921875


In [10]:
df['kmeans'] = kmeans.labels_

Salvando os resultados do treino:

In [11]:
with open('2.treinamento/df_kmeans.pickle','wb') as f:
    pickle.dump(df, f)

with open('2.treinamento/kmeans.pickle','wb') as f:
    pickle.dump(kmeans, f)

### 1.2 Nearest Neighbors

In [49]:
df = pd.read_csv('1.pre-processamento/df_preprocessed.csv')
df_p1 = pd.read_csv('1.pre-processamento/portfolio1_preprocessed.csv')
df_p2 = pd.read_csv('1.pre-processamento/portfolio2_preprocessed.csv')
df_p3 = pd.read_csv('1.pre-processamento/portfolio3_preprocessed.csv')

In [50]:
df.set_index('id',inplace=True)
df_p1.set_index('id',inplace=True)
df_p2.set_index('id',inplace=True)
df_p3.set_index('id',inplace=True)

In [57]:
start = time.time()
nn = NearestNeighbors(n_neighbors=8,metric='cosine')
nn.fit(df)
end =  time.time()
print(f'Tempo de Treino: {end-start}')

Tempo de Treino: 0.03216290473937988


## 2) Avaliação

###  2.1) MiniBatch KMeans

In [26]:
with open('2.treinamento/df_kmeans.pickle','rb') as f:
    df = pickle.load(f)



df_p1 = pd.read_csv('1.pre-processamento/portfolio1_preprocessed.csv')
df_p2 = pd.read_csv('1.pre-processamento/portfolio2_preprocessed.csv')
df_p3 = pd.read_csv('1.pre-processamento/portfolio3_preprocessed.csv')

In [27]:
def get_clusters(df,id_cluster):
    result = []
    for client in df.iterrows():
        client_id = client[1]['id']
        client_cluster = id_cluster[id_cluster['id'] == client_id]['kmeans'].values[0]
        result.append({'cluster_id':client_cluster, 'id': client_id})
    return result

In [28]:
def kmeans_lead_generator(df,id_cluster):
    clusters = get_clusters(df,id_cluster)
    leads = pd.DataFrame(columns=['id'])
    for cluster in clusters:
        recommendation = id_cluster[cluster['cluster_id'] == id_cluster['kmeans']]['id']
    return recommendation

In [34]:
p1 = pd.DataFrame(df_p1['id'], columns=['id'])
p2 = pd.DataFrame(df_p2['id'], columns=['id'])
p3 = pd.DataFrame(df_p3['id'], columns=['id'])

p1_train, p1_test = data_sep(p1)
p2_train, p2_test = data_sep(p2)
p3_train, p3_test = data_sep(p3)



In [35]:
start = time.time()
kmeans_result1 = kmeans_lead_generator(p1_train,df)
end = time.time()
time1 = end-start
score1 = check(kmeans_result1,p1_test['id'])



start = time.time()
kmeans_result2 = kmeans_lead_generator(p2_train,df)
end = time.time()
time2 = end-start
score2 = check(kmeans_result2,p2_test['id'])


start= time.time()
kmeans_result3 = kmeans_lead_generator(p3_train,df)
end = time.time()
time3 = end-start
score3 = check(kmeans_result3,p3_test['id'])


print(f'Portfolio 1 - Tempo: {time1} - Score: {score1}')
print(f'Portfolio 2 - Tempo: {time2} - Score: {score2}')
print(f'Portfolio 3 - Tempo: {time3} - Score: {score3}')

Portfolio 1 - Tempo: 11.690115928649902 - Score: 0.005988023952095809
Portfolio 2 - Tempo: 11.51410436630249 - Score: 0.0
Portfolio 3 - Tempo: 5.361168622970581 - Score: 0.3875


### 2.2 Nearest Neighbors

In [36]:
df = pd.read_csv('1.pre-processamento/df_preprocessed.csv')
df_p1 = pd.read_csv('1.pre-processamento/portfolio1_preprocessed.csv')
df_p2 = pd.read_csv('1.pre-processamento/portfolio2_preprocessed.csv')
df_p3 = pd.read_csv('1.pre-processamento/portfolio3_preprocessed.csv')

In [37]:
df.set_index('id',inplace=True)
df_p1.set_index('id',inplace=True)
df_p2.set_index('id',inplace=True)
df_p3.set_index('id',inplace=True)

In [61]:
def neighbors_lead_generator(df_p):
    
    index_list = np.array([])
    
    for i in range(df_p.shape[0]):
        k_distances,k_indexes = nn.kneighbors(df_p.iloc[[i]])
        k_indexes = np.delete(k_indexes,[0])
        k_distances = np.delete(k_distances,[0])
        index_list=np.concatenate((index_list,k_indexes),axis=None)

    neighbors = []
    
    for i in range(len(index_list)):
        neighbors.append(df.iloc[int(index_list[i])].name)
    
    lead = pd.DataFrame(neighbors, columns=['id_result'])
    lead.drop_duplicates(keep='first',inplace=True,ignore_index=True)
    
    return lead

In [59]:
p1_train, p1_test = data_sep(df_p1)
p2_train, p2_test = data_sep(df_p2)
p3_train, p3_test = data_sep(df_p3)

In [60]:
start = time.time()
neighbors_result1 = neighbors_lead_generator(p1_train)
end =  time.time()
time1 = end-start
score1 = check(neighbors_result1['id_result'],p1_test.index)

start = time.time()
neighbors_result2 = neighbors_lead_generator(p2_train)
end =  time.time()
time2 = end-start
score2 = check(neighbors_result2['id_result'],p2_test.index)

start = time.time()
neighbors_result3 = neighbors_lead_generator(p3_train)
end =  time.time()
time3 = end-start
score3 = check(neighbors_result3['id_result'],p3_test.index)

print(f'Portfolio 1 - Tempo: {time1} - Score: {score1}')
print(f'Portfolio 2 - Tempo: {time2} - Score: {score2}')
print(f'Portfolio 3 - Tempo: {time3} - Score: {score3}')

(2709, 1)
(2138, 1)
(897, 1)
Portfolio 1 - Tempo: 109.80540156364441 - Score: 0.011976047904191617
Portfolio 2 - Tempo: 112.4558174610138 - Score: 0.45294117647058824
Portfolio 3 - Tempo: 57.78557729721069 - Score: 0.5125


Bem mais eficiente que o Minibatch KMeans!!!!!

'''
Teste do Streamlit
'''

In [2]:
!jupyter nbconvert   --to script 2.Treinamento-copy1.ipynb
!awk '!/ipython/' 2.Treinamento-copy1.py >  temp.py && mv temp.py app.py && rm 2.Treinamento-copy1.py
!streamlit run app.py

This application is used to convert notebook files (*.ipynb) to various other
formats.


Options
-------

Arguments that take values are actually convenience aliases to full
Configurables, whose aliases are listed on the help line. For more information
on full configurables, see '--help-all'.

--debug
    set log level to logging.DEBUG (maximize logging output)
--generate-config
    generate default config file
-y
    Answer yes to any questions instead of prompting.
--execute
    Execute the notebook prior to export.
--allow-errors
    Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
--stdin
    read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
--stdout
    Write notebook output to stdout instead of files.
--inplace
    Run nbconvert in place, overwriting 