In [29]:
import pandas as pd

In [30]:
df_train_feng = pd.read_csv('/usr/src/tp2-1/210215_tp2_train_feng.csv')

In [31]:
df_train_feng.shape

(9739, 2825)

In [32]:
X = df_train_feng.drop(columns = 'Stage')
y = df_train_feng['Stage']

In [33]:
X.shape

(9739, 2824)

In [34]:
from sklearn.model_selection import TimeSeriesSplit, ParameterGrid
import numpy as np

In [35]:
from sklearn.ensemble import RandomForestRegressor

splits = TimeSeriesSplit(n_splits=3, max_train_size=365*2)
rfr = RandomForestRegressor()

# Create a dictionary of hyperparameters to search
rfr_grid = {"n_estimators": [10, 20, 30, 40, 50, 100], 
        'max_depth': [3, 5, 8, 10, 12], 
        'max_features': [4, 8, 16, 32, 59], 
        'random_state': [123]}

rfr_paramGrid = ParameterGrid(rfr_grid)

def TimeSplit_ModBuild(model, paramGrid, splits, X, y):
    from sklearn.model_selection import TimeSeriesSplit
    from sklearn.metrics import mean_squared_error

    #Loop over each time split and for each
    for train_index, val_index in splits.split(X):
        _X_train_ = X.iloc[train_index]
        _y_train_ = y.iloc[train_index]
        _X_val_ = X.iloc[val_index]
        _y_val_ = y.iloc[val_index]

        train_scores = []
        val_scores = []
        #models = []
        
        # Loop through the parameter grid, set the hyperparameters, and save the scores
        for g in paramGrid:
            model.set_params(**g)
            model.fit(_X_train_, _y_train_)
            p_train = model.predict(_X_train_)
            p_val = model.predict(_X_val_)
            score_train = np.mean(mean_squared_error(_y_train_, p_train))
            score_val = np.mean(mean_squared_error(_y_val_, p_val))
            train_scores.append(score_train)
            val_scores.append(score_val)
            #models.append(model)
            best_idx = np.argmin(val_scores)
            
        print("Best-Fold HyperParams:: ", paramGrid[best_idx])
        print("Best-Fold Train RMSE: ", train_scores[best_idx])
        print("Best-Fold Val RMSE: ",val_scores[best_idx])
        print("\n")
        
    #Return most recent model
    return train_scores, val_scores, best_idx


CV_rfr_tup = TimeSplit_ModBuild(rfr, rfr_paramGrid, splits, X, y)

Best-Fold HyperParams::  {'random_state': 123, 'n_estimators': 100, 'max_features': 59, 'max_depth': 12}
Best-Fold Train RMSE:  0.07134169933976826
Best-Fold Val RMSE:  0.17896594285158016


Best-Fold HyperParams::  {'random_state': 123, 'n_estimators': 10, 'max_features': 59, 'max_depth': 12}
Best-Fold Train RMSE:  0.08211561213119523
Best-Fold Val RMSE:  0.18412674895761802


Best-Fold HyperParams::  {'random_state': 123, 'n_estimators': 20, 'max_features': 59, 'max_depth': 10}
Best-Fold Train RMSE:  0.11715150532640987
Best-Fold Val RMSE:  0.16055445807660892




In [36]:
df_frio_frio_test = pd.read_csv('/usr/src/tp2-1/210215_tp2_test_feng.csv')

In [37]:
df_frio_frio_test

Unnamed: 0.1,Unnamed: 0,Opportunity_ID,level_0,index,Region_APAC,Region_Americas,Region_EMEA,Region_Japan,Region_Middle East,Territory_Albania,...,Total_Amount,Total_Taxable_Amount,"Pricing, Delivery_Terms_Quote_Appr","Pricing, Delivery_Terms_Approved",Bureaucratic_Code_0_Approval,Bureaucratic_Code_0_Approved,Submitted_for_Approval,Quote_Type,Delivery_Year,TRF
0,0,10689,2261.333333,2261.333333,0.0,0.0,1.0,0.0,0.0,0.0,...,122473.00,367419.0,1.0,1.0,1.0,1.0,0.0,0.0,2019.0,0.0
1,1,10690,2277.000000,2277.000000,0.0,0.0,1.0,0.0,0.0,0.0,...,151556.70,757783.5,1.0,1.0,1.0,1.0,0.0,0.0,2019.0,0.2
2,2,10691,2274.000000,2274.000000,0.0,1.0,0.0,0.0,0.0,0.0,...,21037.50,21037.5,1.0,1.0,0.0,0.0,0.0,0.0,2019.0,0.0
3,3,10692,2269.500000,2269.500000,0.0,1.0,0.0,0.0,0.0,0.0,...,361517.75,2169106.5,1.0,1.0,1.0,0.0,0.0,0.0,2019.0,1.0
4,4,10693,2268.000000,2268.000000,0.0,1.0,0.0,0.0,0.0,0.0,...,5752.50,5752.5,0.0,0.0,0.0,0.0,0.0,0.0,2019.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1562,1562,12364,2462.500000,2462.500000,0.0,0.0,1.0,0.0,0.0,0.0,...,73875.00,147750.0,1.0,1.0,0.0,0.0,0.0,0.0,2019.0,0.0
1563,1563,12365,2464.000000,2464.000000,0.0,1.0,0.0,0.0,0.0,0.0,...,45054.90,45054.9,1.0,1.0,0.0,0.0,0.0,0.0,2019.0,0.0
1564,1564,12366,2465.000000,2465.000000,0.0,1.0,0.0,0.0,0.0,0.0,...,100122.00,100122.0,1.0,1.0,1.0,1.0,0.0,0.0,2019.0,0.0
1565,1565,12367,2458.000000,2458.000000,0.0,1.0,0.0,0.0,0.0,0.0,...,143220.00,143220.0,1.0,1.0,0.0,0.0,0.0,0.0,2019.0,0.0


In [38]:
best_rfr_idx = CV_rfr_tup[2]

In [39]:
best_rfr_grid = rfr_paramGrid[best_rfr_idx]

best_rfr = RandomForestRegressor()
best_rfr.set_params(**best_rfr_grid)


RandomForestRegressor(max_depth=10, max_features=59, n_estimators=20,
                      random_state=123)

In [40]:
best_rfr.fit(X, y)

RandomForestRegressor(max_depth=10, max_features=59, n_estimators=20,
                      random_state=123)

In [41]:
predicts = best_rfr.predict(df_frio_frio_test)


In [42]:
predicts

array([0.65775826, 0.66586637, 0.59014186, ..., 0.50074244, 0.53576184,
       0.3026796 ])

In [51]:
output = pd.DataFrame({'Opportunity_ID':df_frio_frio_test.Opportunity_ID, 'Target': predicts})


In [77]:
final=output.groupby('Opportunity_ID').mean()
final

Unnamed: 0_level_0,Target
Opportunity_ID,Unnamed: 1_level_1
10689,0.603025
10690,0.547861
10691,0.561362
10692,0.464678
10693,0.619216
...,...
12364,0.648920
12365,0.492856
12366,0.509296
12367,0.536153


In [52]:
output.to_csv('/usr/src/tp2-1/210216_random_forest_TimeSeriesSplit.csv', index=False)

In [None]:
'''
    de aqui en adelante eran pruebas para ver si puedo determinar que features tiene mayor
    peso para mejorar el feng
'''

In [14]:
import matplotlib.pyplot as plt

In [15]:
rfr.feature_importances_

array([0.02848471, 0.        , 0.        , ..., 0.00563403, 0.02752726,
       0.03078427])

In [16]:
score

[[1, 110, 60, 5, 50, -0.07108437660405165],
 [2, 110, 60, 5, 50, -0.13560651190189854],
 [3, 110, 60, 5, 50, -0.04803250299787076],
 [4, 110, 60, 5, 50, -0.07579638483298745],
 [5, 110, 60, 5, 50, 0.0006023351269941468],
 [6, 110, 60, 5, 50, -0.04883778692138163],
 [7, 110, 60, 5, 50, 0.06611411272943679],
 [8, 110, 60, 5, 50, 0.0351684624852131],
 [9, 110, 60, 5, 50, 0.013380949354706573],
 [10, 110, 60, 5, 50, -0.062007784449307435]]

In [17]:
plt.bar(X.columns, rfr.feature_importances_)
plt.xlabel('features')
plt.ylabel('importance ')
plt.show()

KeyboardInterrupt: 