In [1]:
import time
ini=time.time()

import numpy as np
import pandas as pd
from sklearn import metrics
from pandas.tseries.offsets import DateOffset
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

imp_fert=pd.read_csv('/home/andre301267/git/Pricing-Fertilizer/DB/temp/imp_fert_ncm_quanti.csv')
exp_soja=pd.read_csv('/home/andre301267/git/Pricing-Fertilizer/DB/temp/exp_soja_ncm_quanti.csv')

comex=pd.concat([imp_fert,exp_soja]).sort_values(by=['year','month'])\
.drop(columns=['US$_freight','US$_insurance','cif_kg','fob_kg','preço'])

imp_fert=exp_soja=0


# Save real begins and ends
comex['date']=pd.to_datetime(comex.year.astype(str)+'-'+comex.month.astype(str)+'-01')
date_min=comex.date.min()
date_max=comex.date.max()

# filtra km deixando passar o maior km de cada produto
filtra_km=comex[['Product','km']].drop_duplicates().sort_values(by=['Product','km'],ascending=False)\
    .drop_duplicates(subset='Product')
comex=comex.merge(filtra_km)

comex=comex.groupby(by=['year','month','Product','UF'],observed=True)[['kg','US$_fob']].sum().reset_index()
comex['date']=pd.to_datetime(comex.year.astype(str)+'-'+comex.month.astype(str)+'-01')

# Calendar full
years=pd.DataFrame({'year':np.arange(date_min.year, date_max.year+1)}).assign(key=0)
months=pd.DataFrame({'month':np.arange(1,13)}).assign(key=0)
calendar=years.merge(months,on='key')
calendar_full=comex[['Product','UF']].drop_duplicates().assign(key=0).merge(calendar,on='key').drop('key',axis=1)

# Comex merge calendar full
comex_full=comex.merge(calendar_full,how='outer')

# Saprse Matrix
comex_full[['month','Product','UF']]=comex_full[['month', 'Product','UF']].astype(str)
comex_sprs=comex_full.groupby(by=['year','month','Product','UF'],observed=False)[['kg','US$_fob']].sum().reset_index()

# Prune
comex_sprs['date']=pd.to_datetime(comex_sprs.year.astype(str)+'-'+comex_sprs.month.astype(str)+'-01')
comex_sprs=comex_sprs[(comex_sprs.date>=date_min)&(comex_sprs.date<=date_max)]


# Indentify occurances
comex_sprs=comex_sprs.assign(oc=0)
comex_sprs.loc[comex_sprs.kg>0,'oc']=1

# Df for tests
date_max_test=comex_sprs.date.max()-DateOffset(months=11)
comex_sprs_tests=comex_sprs[comex_sprs.date<=date_max_test]

In [2]:
comex_sprs_tests=comex_sprs_tests.rename(columns={'US$_fob':'fob'})

In [3]:
cst=comex_sprs_tests

In [4]:
# hipotesys to test
hip_list = ['lonely','conj']

# General parameters
years_rept=20
prdt_foco=['MOP']
uf_foco=['BA']
qtt_foco=['kg','fob']
month_foco=[(comex.date.max()+DateOffset(months=1)).month]
train_max=10 # lenght
train_min=10 # lenght

            

# Parameters for lonely processing
prdt_lonely=prdt_foco
uf_lonely=uf_foco

month_lonely=month_foco
degree=1

# Parameters for joint processing
prdt_conj=prdt_foco+['MAP','Soy Group']
uf_conj=uf_foco+['GO','MT','TO']
m_ext=3
m_ini=month_foco[0]-m_ext
month_conj=np.arange(m_ini,month_foco[0]+1)
month_conj[month_conj<=0]+=12
max_depth=7
max_features=None

# Dict for tests
dict={'lonely':[prdt_lonely, uf_lonely, month_lonely],
      'conj'  :[prdt_conj  , uf_conj,   month_conj]}


# MODELS

# Classfication
from sklearn.ensemble import RandomForestClassifier as rfc
model_rfc=rfc(max_depth=max_depth,max_features=max_features)

# Regression

# RFR
from sklearn.ensemble import RandomForestRegressor as rfr
model_rfr=rfr(max_depth=max_depth,max_features=max_features)

# Polynomial Linear
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
polynomial_features = PolynomialFeatures(degree=degree)
linear_regression = LinearRegression()
model_plr = make_pipeline(polynomial_features, linear_regression)

In [5]:
# TESTS
results_c=pd.DataFrame()
results_r=pd.DataFrame()

for f in hip_list:

    h=dict[f]
    
    # Filtra conforme o teste a ser feito
    comex_sprs_tests_hip=comex_sprs_tests[
    (comex_sprs_tests.Product.isin(h[0]))&
    (comex_sprs_tests.UF.isin(h[1]))&
    (comex_sprs_tests.month.astype(int).isin(h[2]))].copy()

    
    comex_t2=comex_sprs_tests_hip.copy()
    

    # Repetitions of the test troughout the years
    for r in range(1,years_rept+1):
        
        # Train & test
        train=comex_t2[comex_t2.date<comex_t2.date.max()]
        
        test=comex_t2[comex_t2.date==comex_t2.date.max()]
        test=test[(test.UF.isin(uf_foco))&(test.Product.isin(prdt_foco))]
        
        # Verify test conditions to classify
        if (len(train.year.unique())>=train_min):
                
            # Unify to format
            tt=pd.concat([train.assign(tipo='train'),test.assign(tipo='test')])
            tt.date=tt.date.astype(int) # Format date
            tt_dumm=pd.get_dummies(tt, prefix_sep='~') # Format categories
            
            # Train & test
            train_c=tt_dumm[tt_dumm['tipo~'+'train']==True].drop(columns=['tipo~'+'train','tipo~'+'test'])
            test_c=tt_dumm[tt_dumm['tipo~'+'test']==True].drop(columns=['tipo~'+'train','tipo~'+'test'])
            # X & y for classify
            
            X_train=train_c.drop(columns=['oc']+qtt_foco)
            y_train=train_c.oc
            X_test=test_c.drop(columns=['oc']+qtt_foco)
            
            # Classify
            
            model_c=model_rfc
            model_c.fit(X_train.drop(columns='year'),y_train)
            test_c['oc_pred']=model_c.predict(X_test.drop(columns='year'))
            
            # X & y to regress
            train_r=train_c[train_c.oc==1]
            test_r=test_c[(test_c.oc==1)&(test_c.oc_pred==1)]
            
            # Verify test conditions to regeress
            if (len(train_r.year.unique())>=train_min) & (len(test_r)>0):
                
                X_train=train_r.drop(columns=['oc']+qtt_foco)
                y_train=train_r[qtt_foco]
                X_test=test_r.drop(columns=['oc','oc_pred']+qtt_foco)

                # Verify type of regress to be done
                if f == 'conj':
                    model_r=model_rfr
                else:
                    model_r=model_plr
                    
                # Regress
                model_r.fit(X_train.drop(columns='year'),y_train)
                #test_r=X_test
                w=model_r.predict(X_test.drop(columns='year'))[0]
                for x in range(len(qtt_foco)):
                    test_r[str(qtt_foco[x])+'_pred']=w[x]
                    
                # Identify the test done in the results                
                test_r=test_r.assign(hip=f)
                
    
                # Save results of the running test
                results_r=pd.concat([results_r,test_r]).fillna(False).reset_index(drop=True)
            
            # Identify the test done in the results
            test_c=test_c.assign(hip=f)
            # Save results of the running test
            results_c=pd.concat([results_c,test_c]).fillna(False).reset_index(drop=True)
                
        # Drop the date that was just test
        comex_t2=comex_t2[(comex_t2.date<=comex_t2.date.max()-DateOffset(years=1))]

In [6]:
results_c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 18 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   year               35 non-null     int64  
 1   kg                 35 non-null     float64
 2   fob                35 non-null     float64
 3   date               35 non-null     int64  
 4   oc                 35 non-null     int64  
 5   month~9            35 non-null     bool   
 6   Product~MOP        35 non-null     bool   
 7   UF~BA              35 non-null     bool   
 8   oc_pred            35 non-null     int64  
 9   hip                35 non-null     object 
 10  month~6            35 non-null     bool   
 11  month~7            35 non-null     bool   
 12  month~8            35 non-null     bool   
 13  Product~MAP        35 non-null     bool   
 14  Product~Soy Group  35 non-null     bool   
 15  UF~GO              35 non-null     bool   
 16  UF~MT              35 non-nu

In [7]:
def Back_from_dummies(comex_dumm):
    o=comex_dumm[comex_dumm.columns[~comex_dumm.columns.str.contains('~')]]    
    prdt=pd.from_dummies(comex_dumm[comex_dumm.columns[comex_dumm.columns.str.contains('Product')]],sep='~')    
    month=pd.from_dummies(comex_dumm[comex_dumm.columns[comex_dumm.columns.str.contains('month')]],sep='~')    
    uf=pd.from_dummies(comex_dumm[comex_dumm.columns[comex_dumm.columns.str.contains('UF')]],sep='~')    
    comex_dumm=pd.concat([o,prdt,month,uf],axis=1)
    return comex_dumm

def date_format(comex):
    comex.date=comex.date.astype('datetime64[ns]')
    return comex

results_c=Back_from_dummies(results_c)
results_r=Back_from_dummies(results_r)

results_c=date_format(results_c)
results_r=date_format(results_r)

In [8]:
for h in hip_list:
    t=results_c[results_c.hip==h]    
    print(h+'_hipotesys__scores:')
    print('accuracy',round(metrics.accuracy_score(t.oc,t.oc_pred),1))
    print('precision',round(metrics.precision_score(t.oc,t.oc_pred),1))
    print('recall',round(metrics.recall_score(t.oc,t.oc_pred),1))
    print()

lonely_hipotesys__scores:
accuracy 0.9
precision 0.9
recall 0.9

conj_hipotesys__scores:
accuracy 0.9
precision 0.9
recall 1.0



In [9]:
w=results_r[results_r.hip=='lonely'][['date','hip']].merge(results_r[results_r.hip=='conj'][['date','hip']], on='date',how='outer')

In [10]:
date_sem_regres_lonely=w[w.hip_x.isna()].date

In [16]:
results_c.sort_values(by=['year','hip'])

Unnamed: 0,year,kg,fob,date,oc,oc_pred,hip,Product,month,UF
34,2006,6309460.0,1089283.0,2006-09-01,1,1,conj,MOP,9,BA
33,2007,20483600.0,5406284.0,2007-09-01,1,1,conj,MOP,9,BA
16,2007,20483600.0,5406284.0,2007-09-01,1,1,lonely,MOP,9,BA
32,2008,23530900.0,19998482.0,2008-09-01,1,1,conj,MOP,9,BA
15,2008,23530900.0,19998482.0,2008-09-01,1,1,lonely,MOP,9,BA
31,2009,25277680.0,13850580.0,2009-09-01,1,1,conj,MOP,9,BA
14,2009,25277680.0,13850580.0,2009-09-01,1,1,lonely,MOP,9,BA
30,2010,38739440.0,13100733.0,2010-09-01,1,1,conj,MOP,9,BA
13,2010,38739440.0,13100733.0,2010-09-01,1,1,lonely,MOP,9,BA
29,2011,52377210.0,27780791.0,2011-09-01,1,1,conj,MOP,9,BA


In [15]:
results_r.sort_values('year')

Unnamed: 0,year,kg,fob,date,oc,oc_pred,kg_pred,fob_pred,hip,Product,month,UF
27,2006,6309460.0,1089283.0,2006-09-01,1,1,40612530.0,7426923.0,conj,MOP,9,BA
26,2007,20483600.0,5406284.0,2007-09-01,1,1,21648330.0,4566742.0,conj,MOP,9,BA
25,2008,23530900.0,19998482.0,2008-09-01,1,1,36159270.0,24496770.0,conj,MOP,9,BA
24,2009,25277680.0,13850580.0,2009-09-01,1,1,29926100.0,20507580.0,conj,MOP,9,BA
23,2010,38739440.0,13100733.0,2010-09-01,1,1,15887580.0,6495945.0,conj,MOP,9,BA
10,2011,52377210.0,27780791.0,2011-09-01,1,1,27065500.0,14401980.0,lonely,MOP,9,BA
22,2011,52377210.0,27780791.0,2011-09-01,1,1,44829370.0,22114680.0,conj,MOP,9,BA
21,2013,23854130.0,9121270.0,2013-09-01,1,1,39260080.0,17341570.0,conj,MOP,9,BA
20,2014,30910480.0,10295691.0,2014-09-01,1,1,48683210.0,17295700.0,conj,MOP,9,BA
9,2014,30910480.0,10295691.0,2014-09-01,1,1,31944150.0,18675610.0,lonely,MOP,9,BA


In [11]:
date_sem_regres_lonely

0   2006-09-01
1   2007-09-01
2   2008-09-01
3   2009-09-01
4   2010-09-01
6   2013-09-01
Name: date, dtype: datetime64[ns]

In [12]:
# Metrics
rse=metrics.root_mean_squared_error

for y in qtt_foco:
    col_drop=[x for x in qtt_foco if x !=y][0]
        
    for hip in hip_list:
        l=[]
        t=results_r[
        (results_r.hip==hip)&
        (results_r.Product.isin(prdt_foco))&
        (results_r.UF.isin(uf_foco))
        ].drop(columns=col_drop)
        for i in range(1000):
            tt=t.sample(len(t),replace=True)
            l=l+[rse(
                tt[y],
                tt[y+'_pred']
            )]
        print('hipotesys ',hip,y,'- rse: ',round(sum(l)/len(l)/10**6),'Kt, com',len(t), 'amostras.')
        print()

hipotesys  lonely kg - rse:  31 Kt, com 11 amostras.

hipotesys  conj kg - rse:  23 Kt, com 17 amostras.

hipotesys  lonely fob - rse:  11 Kt, com 11 amostras.

hipotesys  conj fob - rse:  7 Kt, com 17 amostras.

