In [3]:
import pandas as pd
import numpy as np
import matplotlib
import os

In [221]:
df_train = pd.read_csv("../data/train.csv", "|")
df_train_sorted = df_train.sort_values(by=['well id', 'depth, m'])
df_train_sorted.to_csv('../data/train_sorted.csv',index=False, sep='|')
df_train_sorted.to_json('../data/train_sorted.json')

In [359]:
df_train_clean = df_train.drop_duplicates(subset=['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS'], keep=False)
df_train_clean.to_csv('../data/train_clean.csv',index=False, sep='|')
df_train_clean.to_json('../data/train_clean.json')
df_train_clean_sorted = df_train_clean.sort_values(by=['well id', 'depth, m'])
df_train_clean_sorted.to_csv('../data/train_clean_sorted.csv',index=False, sep='|')
df_train_clean_sorted.to_json('../data/train_clean_sorted.json')

In [21]:
df_test = pd.read_csv("/Users/zamaletdinovaaa/mipt_cet_hack/Silicondzor/data/test.csv", "|")
df_test.to_json('../data/test.json')
df_test_sorted = df_test.sort_values(by=['well id', 'depth, m'])
df_test_sorted.to_json('../data/test_sorted.json')

In [237]:
_PNB = 4150
_PHI = 0.7
_RHO = 860
_S = 100
_H_DELTA = 0.1
_EXPENCES_BK = 245
_EXPENCES_GZ = 205
_EXPENCES_DGK = 130
_EXPENCES_NKT = 205
_EXPENCES_ALPS = 115
_EXPENCES_DICT = {'bk':_EXPENCES_BK, 
                 'GZ1':_EXPENCES_GZ,
                  'GZ2':_EXPENCES_GZ,
                  'GZ3':_EXPENCES_GZ,
                  'GZ4':_EXPENCES_GZ,
                  'GZ5':_EXPENCES_GZ,
                  'GZ6':_EXPENCES_GZ,
                  'GZ7':_EXPENCES_GZ,
                  'DGK':_EXPENCES_DGK,
                  'NKTD':_EXPENCES_NKT,
                  'NKTM':_EXPENCES_NKT,
                  'NKTR':_EXPENCES_NKT,
                  'ALPS':_EXPENCES_ALPS
                 }
_EXPENCES_COLUMNS_ALL = list(_EXPENCES_DICT.keys())

In [241]:
def analysis_columns(df):
    return list(pd.Index(_EXPENCES_COLUMNS_ALL).intersection(df.columns))
    
def add_expences(df):
    df_temp = df[analysis_columns(df)]
    df_temp = df_temp.notnull().astype('int')
    
    df['expences'] = (df_temp * [_EXPENCES_DICT.get(key) for key in analysis_columns(df)]).sum(axis=1)
    return df

In [312]:
def form_result_df(df):
    df_grouped = df.groupby(['well id'])
    columns = ['well id', 'total_cnt', 'oil_cnt', 'value', 'expenses', 'square', 'mass', 'price']

    df_dashbord_show=None
    df_dashbord_show = pd.DataFrame(columns=columns)
    
    for well_id, df_well in df_grouped:
        total_cnt = df_well.shape[0]
        oil_cnt = df_well[df_well['goal']==1]['well id'].count()
        h = _H_DELTA * oil_cnt
        expences = df_well['expences'].sum()
        square = oil_cnt*_S
        mass = h* square * _RHO
        value = _PNB*_PHI*h*_S*_RHO - expences
        price = mass * _PNB
        row_dict = [{'price':price,'mass':mass,'square':square,'well id':well_id, 'total_cnt':total_cnt, 'oil_cnt':oil_cnt, 'value':value, 'expenses':expences}]
        df_dashbord_show_inc = pd.DataFrame(row_dict)
        df_dashbord_show = df_dashbord_show.append(df_dashbord_show_inc)
    return df_dashbord_show

In [284]:
df_train_expences = add_expences(df_train)   
df_result_example = df_train_expences.iloc[:50]
df_result_example.to_json('../data/result_example_better.json',orient='split')

In [307]:
df_result_total_example=df_result_total_example.reset_index(drop=True)
df_result_total_example = form_result_df(df_result_example)

In [313]:
df_test_pred_full = pd.merge(df_test, df_test_pred, on='id')
df_test_pred_full_expences = add_expences(df_test_pred_full)
df_test_pred_full_expences.to_csv('../data/df_test_pred_full_expences.csv',index=False, sep='|')
df_test_pred_full_total = form_result_df(df_test_pred_full)
df_test_pred_full_total.to_csv('../data/df_test_pred_full_total.csv',index=False, sep='|')

In [356]:
df_grouped = df_test_pred_full.groupby(['well id'])

for well_id, df_well in df_grouped:
    category = range(500, 3000, 500)
    oil = [0]*(len(category)+1)
    oil[0] = well_id
    for i, v in enumerate(category):
        oil[i+1] += df_well[(df_well['depth, m']>=v) & (df_well['depth, m']<=v+500)]['well id'].sum()
    print(oil)
        
        
        
          
          
#df_test[df_test.duplicated(['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS',], keep=False)]

[47, 0, 0, 0, 19035, 44885]
[95, 0, 0, 0, 0, 113905]
[126, 90216, 715680, 713916, 584136, 117810]
[164, 133332, 941032, 943492, 645176, 0]
[176, 0, 0, 0, 206624, 0]


In [314]:
df_test_pred = pd.read_csv("../data/predicted_test.csv", ",")

In [302]:
(df_test_pred.shape, df_test.shape)

((37604, 3), (37604, 15))

In [322]:
print(f"mass {df_test_pred_full_total['mass'].sum()}")

mass 154291619600.0


In [None]:
for row in df_train

In [324]:
df_test_pred_full
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
dd = [[0, 67], [1, 88], [2, 77],
          [3, 93], [4, 85], [5, 91],
          [6, 71], [7, 78], [8, 93],
          [9, 80], [10, 82],[0, 75],
          [5, 80], [3, 90], [1, 72],
          [5, 75], [6, 68], [7, 98],
          [3, 82], [9, 94], [2, 79],
          [2, 95], [2, 86], [3, 67],
          [4, 60], [2, 80], [6, 92],
          [2, 81], [8, 79], [9, 83],
          [3, 75], [1, 80], [3, 71],
          [3, 89], [4, 92], [5, 85],
          [6, 92], [7, 78], [6, 95],
          [3, 81], [0, 64], [4, 85],
          [2, 83], [3, 96], [4, 77],
          [5, 89], [4, 89], [7, 84],
          [4, 92], [9, 98]]

In [358]:
df_test_pred_full_total

Unnamed: 0,expenses,mass,oil_cnt,price,square,total_cnt,value,well id
0,3175600,897229400.0,323,3723502010000.0,32300,1360,8066333400.0,47
0,2799550,248540000.0,170,1031441000000.0,17000,1199,4244310450.0,95
0,41169250,65369021400.0,2757,271281438810000.0,275700,17633,68836961750.0,126
0,36182310,87404758400.0,3188,362729747360000.0,318800,16238,79609621690.0,164
0,2741290,372070400.0,208,1544092160000.0,20800,1174,5193722710.0,176


In [262]:
df_test_pred_full_total['oil_cnt'].sum(), df_test_pred['goal'].sum()

(6646, 6646)

In [258]:
g1 = df_test_pred_full_expences[['well id', 'depth, m', 'goal', 'lith']] 

g1.to_csv('../data/df_test_pred_full_expences_1g.csv',index=False, sep='|')


In [311]:
df_test_pred_full_total['price'] - df_test_pred_full_total['expenses']

0    1.15247e+10
0     6.0645e+09
0    9.83562e+10
0    1.13744e+11
0    7.42078e+09
dtype: object

In [298]:
df_test_pred_full_total['value'] - df_test_pred_full_total['price']

0    2.525725e+11
0    6.795096e+10
0    1.892078e+13
0    2.531140e+13
0    1.028872e+11
dtype: float64

In [335]:
list(range(1000, 3500, 500))

[1000, 1500, 2000, 2500, 3000]