In [123]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import log_loss

import xgboost as xgb
import pandas as pd
import numpy as np
from matplotlib import pyplot
from xgboost import plot_importance

In [124]:
# Read the raw train and test tables from the working directory.
train = pd.read_csv('train.csv', encoding='utf-8')
test = pd.read_csv('test.csv', encoding='utf-8')

# Keep only the training rows whose Stage is one of the two closed outcomes.
train = train.loc[train['Stage'].isin(['Closed Won', 'Closed Lost'])]

In [125]:
def set_dates(df, columns=('Account_Created_Date',
                           'Opportunity_Created_Date',
                           'Planned_Delivery_Start_Date',
                           'Planned_Delivery_End_Date')):
    """Convert date columns of ``df`` to pandas datetimes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. Every name in ``columns`` must already be a column.
    columns : iterable of str, optional
        Columns to convert. Defaults to the four date columns this notebook
        works with, so ``set_dates(df)`` behaves as before.

    Returns
    -------
    None
        ``df`` is mutated; values that cannot be parsed become ``NaT``
        (``errors='coerce'``).
    """
    for column in columns:
        # Bracket assignment always targets a column, unlike attribute
        # assignment, which only does so when the column already exists.
        df[column] = pd.to_datetime(df[column], errors='coerce')
    
set_dates(train)

In [126]:
def get_quarters(series):
    """Map month numbers to quarter labels.

    Parameters
    ----------
    series : iterable of numbers
        Month values (e.g. ``df.some_date.dt.month``). Missing entries
        (``NaN`` / ``None`` / ``pd.NA``) are allowed.

    Returns
    -------
    list
        One entry per input element, in order: ``'Q1'`` for months <= 3,
        ``'Q2'`` for <= 6, ``'Q3'`` for <= 9, ``'Q4'`` otherwise, and
        ``None`` when the month is missing.
    """
    def quarter_label(month):
        # A NaN month fails every `<=` comparison and would otherwise fall
        # through to 'Q4', mislabelling rows that have no date.
        if pd.isna(month):
            return None
        if month <= 3:
            return 'Q1'
        if month <= 6:
            return 'Q2'
        if month <= 9:
            return 'Q3'
        return 'Q4'

    return [quarter_label(month) for month in series]

In [166]:
def set_df(df):
    """Add derived feature columns to ``df`` in place.

    Columns written:

    * ``Account_LifeSpan`` - whole days (as float) between each row's
      ``Account_Created_Date`` and the latest ``Account_Created_Date``
      found in ``df``; ``NaN`` where the date is missing.
    * ``Creation_Quarter`` - quarter label of ``Opportunity_Created_Date``,
      as produced by ``get_quarters``.
    * ``Delivered_Hot_Season`` - boolean flag, True when either
      (a) Region is EMEA, Americas or Japan, or Region is APAC with a
      Territory other than Australia, and ``Delivery_Quarter`` is Q2 or Q3;
      or (b) Territory is Australia and ``Delivery_Quarter`` is Q1 or Q4.
      Rows in any other Region (and not in Australia) are never flagged.
      NOTE(review): presumably this encodes local summer by hemisphere -
      confirm the intended treatment of the remaining regions.

    Expects ``Account_Created_Date`` and ``Opportunity_Created_Date`` to be
    datetime columns already. Returns None.
    """
    # Named so that the builtin `max` is not shadowed inside the function.
    latest_account_date = df.Account_Created_Date.max()

    # `.dt.days` gives whole days on every pandas version; casting a
    # timedelta column with astype('timedelta64[D]') is rejected by
    # pandas >= 2.0.  The float cast keeps the dtype the old cast produced.
    df['Account_LifeSpan'] = (
        (latest_account_date - df.Account_Created_Date).dt.days.astype('float64')
    )

    df['Creation_Quarter'] = get_quarters(df.Opportunity_Created_Date.dt.month)

    in_australia = df.Territory == 'Australia'
    mid_year_regions = (
        df.Region.isin(['EMEA', 'Americas', 'Japan'])
        | ((df.Region == 'APAC') & (df.Territory != 'Australia'))
    )
    delivered_q2_q3 = df.Delivery_Quarter.isin(['Q2', 'Q3'])
    delivered_q1_q4 = df.Delivery_Quarter.isin(['Q1', 'Q4'])

    df['Delivered_Hot_Season'] = (
        (mid_year_regions & delivered_q2_q3)
        | (in_australia & delivered_q1_q4)
    )

set_df(train)

In [167]:
# One row per Opportunity_ID, restricted to the columns under inspection.
aux = (
    train[['Region', 'Territory', 'Creation_Quarter', 'Delivered_Hot_Season', 'Stage', 'Opportunity_ID']]
    .drop_duplicates(subset=['Opportunity_ID'])
)

# Stage counts within each Delivered_Hot_Season group.
aux.groupby('Delivered_Hot_Season').agg({'Stage': 'value_counts'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Stage
Delivered_Hot_Season,Stage,Unnamed: 2_level_1
False,Closed Won,2609
False,Closed Lost,2339
True,Closed Won,2463
True,Closed Lost,2380


In [129]:
# Side-by-side view of the derived Creation_Quarter label and the
# Opportunity_Created_Date it was computed from (all rows of train).
train.loc[:, ['Creation_Quarter', 'Opportunity_Created_Date']]

Unnamed: 0,Creation_Quarter,Opportunity_Created_Date
0,Q4,2015-12-07
1,Q4,2015-12-07
2,Q4,2015-12-08
3,Q4,2015-12-08
4,Q4,2015-12-08
...,...,...
16942,Q4,2015-12-04
16943,Q4,2015-12-04
16944,Q4,2015-12-04
16945,Q4,2015-12-05


In [130]:
    #df.loc[:, ['Account_LifeSpan', 'Account_Created_Date']]
    #tst = train.nsmallest(100, 'Account_LifeSpan')
    #tst.drop_duplicates(subset=['Opportunity_ID']).Stage.value_counts()

In [131]:
# Number of opportunities per Region, counting each Opportunity_ID once.
train.drop_duplicates(subset=['Opportunity_ID']).Region.value_counts()

EMEA           3237
Americas       2452
APAC           2079
Japan          1885
Middle East     138
Name: Region, dtype: int64

In [132]:
# Number of opportunities per Territory among rows whose Region is 'APAC',
# counting each Opportunity_ID once.
train[train.Region=='APAC'].drop_duplicates(subset=['Opportunity_ID']).Territory.value_counts()

India              678
Australia          577
Singapore          293
Thailand           147
Indonesia          138
Philippines         81
China (PRC)         52
Vietnam             41
Taiwan              28
New Zealand         23
South Korea          9
Cambodia             4
Malaysia             4
South East Asia      2
Burma                1
Solomon Islands      1
Name: Territory, dtype: int64

In [133]:
def hasService(x):
    """Return True when at least one element of ``x`` equals 0, else False.

    ``x`` is any iterable of values comparable with ``0``; an empty
    iterable yields False.
    """
    return any(value == 0 for value in x)

In [134]:
# Mark every row of an opportunity True when any row of that opportunity
# has TRF == 0 (see hasService), False otherwise.
# NOTE(review): the column name implies TRF == 0 identifies a service line - confirm.
train['HasService'] = train.groupby('Opportunity_ID')['TRF'].transform(hasService)
# Counts are per row of train, not per opportunity.
train.HasService.value_counts()

True     12559
False     4324
Name: HasService, dtype: int64

In [135]:
# Stage counts within each HasService group; train is not de-duplicated
# here, so these are row counts rather than opportunity counts.
train.groupby('HasService').agg({'Stage':'value_counts'})

Unnamed: 0_level_0,Unnamed: 1_level_0,Stage
HasService,Stage,Unnamed: 2_level_1
False,Closed Lost,3309
False,Closed Won,1015
True,Closed Won,8518
True,Closed Lost,4041


In [160]:
# Per-opportunity sums of TRF and Total_Amount, written back onto every row
# of the opportunity.
train['Total_TRF'] = train.groupby('Opportunity_ID')['TRF'].transform('sum')
train['Total_Amount_Sum'] = train.groupby('Opportunity_ID')['Total_Amount'].transform('sum')
# One row per opportunity (this rebinds `aux`, replacing any earlier value).
aux = train.drop_duplicates(subset=['Opportunity_ID'])
# Stage split among opportunities whose summed TRF is positive.
aux[aux.Total_TRF>0].Stage.value_counts()

Closed Lost    2868
Closed Won      874
Name: Stage, dtype: int64

In [165]:
# Amount per unit of TRF for the opportunity.  Any opportunity with
# Total_TRF == 0 gets 0: dividing by zero yields +/-inf for a non-zero
# amount but NaN for a zero amount, and replacing only the infinities would
# leave those 0/0 rows as NaN.
train['Price_Per_TRF'] = (
    (train.Total_Amount_Sum / train.Total_TRF)
    .mask(train.Total_TRF == 0, 0)
    .replace([np.inf, -np.inf], 0)
)
# True for rows of opportunities whose summed TRF is exactly zero.
train['TRF_Free'] = train.Total_TRF == 0
# Rows of opportunities flagged HasService, then their Stage split with
# each Opportunity_ID counted once.
tr = train[(train.HasService)]
tr.drop_duplicates(subset=['Opportunity_ID']).Stage.value_counts()

Closed Won     4357
Closed Lost    2100
Name: Stage, dtype: int64

In [170]:
# Mean of Total_Amount_Sum for each (Opportunity_ID, Opportunity_Owner) pair.
train.groupby(['Opportunity_ID', 'Opportunity_Owner']).agg({'Total_Amount_Sum':'mean'})#['Total_Amount_Sum_USD'].transform

Unnamed: 0_level_0,Unnamed: 1_level_0,Total_Amount_Sum
Opportunity_ID,Opportunity_Owner,Unnamed: 2_level_1
0,Person_Name_18,5272800.00
1,Person_Name_20,48230.00
2,Person_Name_8,83865.60
3,Person_Name_8,7421881.50
4,Person_Name_8,13357192.50
...,...,...
12799,Person_Name_13,401700.00
12800,Person_Name_13,21332500.00
12801,Person_Name_13,299715.00
12802,Person_Name_3,2346796.88


In [1]:
# District labels, in the order used for the letter <-> number mapping.
distritos = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K']

# Letter -> 1-based position in `distritos`, and the inverse lookup.
letraANum = {letra: numero for numero, letra in enumerate(distritos, start=1)}
numALetra = {numero: letra for letra, numero in letraANum.items()}

# For each district, the other districts that cubiertos() treats as covered
# when that district is selected.
# NOTE(review): presumably the bordering districts - confirm.
lim = {
    'A': ['B', 'D', 'F'],
    'B': ['A', 'C', 'D'],
    'C': ['B', 'D', 'E'],
    'D': ['A', 'B', 'C', 'F', 'G', 'E'],
    'E': ['C', 'D', 'G', 'H', 'I'],
    'F': ['A', 'D', 'G', 'K'],
    'G': ['F', 'D', 'E', 'H'],
    'H': ['G', 'E', 'I', 'J'],
    'I': ['E', 'H', 'J'],
    'J': ['H', 'I'],
    'K': ['F'],
}

# Population figure per district (units are not stated in this notebook).
poblacion = {
    'A': 15,
    'B': 20,
    'C': 38,
    'D': 12,
    'E': 22,
    'F': 31,
    'G': 300,
    'H': 62,
    'I': 15,
    'J': 6,
    'K': 30,
}

# Total population over all districts.
poblacion_total = 0
for distrito in distritos:
    poblacion_total = poblacion_total + poblacion[distrito]

# Baseline that funcional() subtracts from: 60 per unit of total population.
costo_max = 60 * poblacion_total

In [11]:
def es_solucion(construidos):
    """Tell whether ``construidos`` is an acceptable selection of districts.

    A selection is accepted when both hold:

    * every district 'A'..'K' appears in ``cubiertos(construidos)``;
    * no district is listed more than once in ``construidos``.

    Parameters
    ----------
    construidos : list of str
        Selected district labels (keys of ``lim``).

    Returns
    -------
    bool
    """
    required = ('A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K')

    # Compute the covered districts once and use a set for membership,
    # instead of rebuilding the list for every required district.
    covered = set(cubiertos(construidos))
    for dist in required:
        if dist not in covered:
            return False

    # Reject selections that repeat a district.
    seen = set()
    for dist in construidos:
        if dist in seen:
            return False
        seen.add(dist)

    return True

def cubiertos(construidos):
    """List the districts covered by the selection ``construidos``.

    For each selected district, in order, the result contains that
    district's ``lim`` entries followed by the district itself.  Duplicates
    are kept, so a district may appear several times.
    """
    covered = []
    for built in construidos:
        covered.extend(lim[built])
        covered.append(built)
    return covered

def funcional(construidos):
    """Objective value of a selection of districts.

    Returns ``costo_max`` minus 45 times the summed ``poblacion`` of the
    districts in ``construidos`` (a district listed twice is counted twice).
    """
    built_population = sum(poblacion[dist] for dist in construidos)
    return costo_max - built_population * 45

In [14]:
#start = ['D', 'E', 'F', 'G']
# Four-slot candidate selection; the loop below mutates it in place until
# es_solucion() accepts it or the search gives up.
start = ['K', 'K', 'K', 'K']

# Advance counters, one per slot: i for start[3], j for start[2],
# k for start[1], l for start[0].  i, j and k are reset when they carry.
i=0
j=0
k=0
l=0

# Odometer-style enumeration: start[3] moves every iteration and the other
# slots move on carry.  The loop stops at the first candidate that
# es_solucion() accepts; funcional() is never consulted here, so the result
# is the first acceptable candidate in visiting order, not a cost optimum.
while not es_solucion(start):

    # start[3] has been advanced 11 times (a full A..K cycle): carry into start[2].
    if i > 10:
        i=0
        j+=1
        
        # Next letter index; (11 + 1) % 12 == 0 is bumped to 1, so K wraps to A.
        idx_j = (letraANum[start[2]] + 1)%12
        if(idx_j==0):
            idx_j+= 1

        start[2] = numALetra[idx_j]
        
    # start[2] has been advanced 11 times: carry into start[1].
    if j > 10:
        j=0
        k+=1
        
        idx_k = (letraANum[start[1]] + 1)%12
        if(idx_k==0):
            idx_k+= 1

        start[1] = numALetra[idx_k]
        
    # start[1] has been advanced 11 times: carry into start[0].
    if k > 10:
        k=0
        l+=1
        
        idx_l = (letraANum[start[0]] + 1)%12
        
        if(idx_l==0):
            idx_l+= 1
            
        start[0] = numALetra[idx_l]
        
    # Always advance the last slot by one letter, including on iterations
    # where a carry just happened.
    idx_i = (letraANum[start[3]] + 1)%12
    if(idx_i==0):
        idx_i+= 1
        
    start[3] = numALetra[idx_i]
    
    i+=1
    
    # start[0] has been advanced 11 times: stop searching.  `start` is left
    # at the current candidate, which has not been checked by es_solucion().
    if l > 10:
        print('fin')
        break

In [15]:
# Candidate the search loop ended on, followed by the districts it covers
# (duplicates included, as returned by cubiertos()).
print(start)
cubiertos(start)

['K', 'A', 'B', 'H']


['F', 'K', 'B', 'D', 'F', 'A', 'A', 'C', 'D', 'B', 'G', 'E', 'I', 'J', 'H']

In [16]:
# Objective value of that candidate: costo_max minus 45 times the summed
# population of its districts.
funcional(start)

27345