In [3]:
import pandas as pd
import numpy as np
import matplotlib
import os

In [22]:
df_train = pd.read_csv("../data/train.csv", "|")
df_train_sorted = df_train.sort_values(by=['well id', 'depth, m'])
df_train_sorted.to_csv('../data/train_sorted.csv',index=False, sep='|')
df_train_sorted.to_json('../data/train_sorted.json')

In [20]:
df_train_clean = df_train.drop_duplicates(subset=['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS'], keep=False)
df_train_clean.to_csv('../data/train_clean.csv',index=False, sep='|')
df_train_clean.to_json('../data/train_clean.json')
df_train_clean_sorted = df_train_clean.sort_values(by=['well id', 'depth, m'])
df_train_clean_sorted.to_csv('../data/train_clean_sorted.csv',index=False, sep='|')
df_train_clean_sorted.to_json('../data/train_clean_sorted.json')

In [21]:
df_test = pd.read_csv("/Users/zamaletdinovaaa/mipt_cet_hack/Silicondzor/data/test.csv", "|")
df_test.to_json('../data/test.json')
df_test_sorted = df_test.sort_values(by=['well id', 'depth, m'])
df_test_sorted.to_json('../data/test_sorted.json')

In [24]:
_PNB = 4150
_PHI = 0.7
_RHO = 860
_S = 100
_H_DELTA = 0.1
_EXPENCES_BK = 2450
_EXPENCES_GZ = 2050
_EXPENCES_DGK = 1300
_EXPENCES_NKT = 2050
_EXPENCES_ALPS = 1150
_EXPENCES_DICT = {'bk':_EXPENCES_BK, 
                 'GZ1':_EXPENCES_GZ,
                  'GZ2':_EXPENCES_GZ,
                  'GZ3':_EXPENCES_GZ,
                  'GZ4':_EXPENCES_GZ,
                  'GZ5':_EXPENCES_GZ,
                  'GZ6':_EXPENCES_GZ,
                  'GZ7':_EXPENCES_GZ,
                  'DKG':_EXPENCES_DGK,
                  'NKTD':_EXPENCES_NKT,
                  'NKTM':_EXPENCES_NKT,
                  'NKTR':_EXPENCES_NKT,
                  'ALPS':_EXPENCES_ALPS
                 }
_EXPENCES_COLUMNS_ALL = list(_EXPENCES_DICT.keys())

In [25]:
def analysis_columns(df):
    return list(pd.Index(_EXPENCES_COLUMNS_ALL).intersection(df.columns))
    
def add_expences(df):
    df['expences'] = (df[analysis_columns(df)].fillna(0) * [_EXPENCES_DICT.get(key) for key in analysis_columns(df)]).sum(axis=1)
    return df

In [26]:
def form_result_df(df):
    df_grouped = df.groupby(['well id'])
    columns = ['well id', 'total_cnt', 'oil_cnt', 'value', 'expenses', 'square', 'mass', 'price']

    df_dashbord_show = pd.DataFrame(columns=columns)
    
    for well_id, df_well in df_grouped:
        total_cnt = df_well.shape[0]
        oil_cnt = df_well[df_well['goal']==1]['well id'].count()
        h = _H_DELTA * oil_cnt
        expences = df_well['expences'].sum()
        square = oil_cnt*_S
        mass = _H_DELTA * square * _RHO
        value = _PNB*_PHI*h*_S*_RHO - expences
        price = mass * _PNB
        row_dict = [{'price':price,'mass':mass,'square':square,'well id':well_id, 'total_cnt':total_cnt, 'oil_cnt':oil_cnt, 'value':value, 'expenses':expences}]
        df_dashbord_show_inc = pd.DataFrame(row_dict)
        df_dashbord_show = df_dashbord_show.append(df_dashbord_show_inc)
    return df_dashbord_show

In [27]:
df_train_expences = add_expences(df_train)   
df_result_example = df_train_expences.iloc[:50]
df_result_example.to_json('../data/result_example.json')

In [28]:
df_test_expences = add_expences(df_train)   

In [44]:
df_result_example.head()

Unnamed: 0,well id,"depth, m",bk,GZ1,GZ2,GZ3,GZ4,GZ5,GZ7,DGK,NKTD,NKTM,NKTR,ALPS,lith,goal,expences
0,12.0,3042.463,0.455779,0.774,0.774,0.332846,0.476545,0.774,0.244898,0.123651,0.774,0.799167,0.174639,0.727789,NK,0.0,12458.009407
1,33.0,2412.8305,,0.157153,0.755472,0.453216,0.715147,,0.523432,0.388729,0.519233,0.799167,0.563573,0.111216,ARGILLIT,0.0,9325.002552
2,33.0,2522.5575,,0.172235,0.941742,0.525711,0.6686,,0.588924,0.515214,0.527139,0.799167,0.551152,0.172359,ARGILLIT,0.0,9986.288876
3,49.0,2396.3625,0.381248,0.246838,0.116913,0.732654,0.151299,0.165219,0.656955,0.445917,0.575175,0.799167,0.429433,0.253563,ARGILLIT,0.0,9166.645598
4,49.0,2399.0475,0.356867,0.226241,0.155473,0.6133,0.173949,0.297194,0.773218,0.392743,0.575725,0.799167,0.447169,0.959633,ARGILLIT,0.0,10303.846759


In [48]:
df_result_total_example.head()

Unnamed: 0,cost,expenses,mass,oil_cnt,square,total_cnt,value,well id
0,0.0,37225.132443,0.0,0,0,3,-37225.13,12.0
0,35690000.0,11010.280635,8600.0,1,100,1,24971990.0,16.0
0,0.0,29042.365634,0.0,0,0,3,-29042.37,33.0
0,35690000.0,129213.810595,8600.0,1,100,13,24853790.0,49.0
0,0.0,12460.11324,0.0,0,0,1,-12460.11,77.0


In [49]:
df_result_total_example=df_result_total_example.reset_index(drop=True)

In [47]:
df_result_total_example = form_result_df(df_result_example)

In [51]:
df_result_total_example.to_json('../data/result_total_example.json')

In [11]:
df_grouped = df_test.groupby(['well id'])

for well_id, df_well in df_grouped:
    print(f'well_id {well_id} nunique - {df_well["depth, m"].nunique()} shape {df_well.shape}')
          
          
df_test[df_test.duplicated(['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS',], keep=False)]

well_id 47 nunique - 1297 shape (1360, 15)
well_id 95 nunique - 1162 shape (1199, 15)
well_id 126 nunique - 17382 shape (17633, 15)
well_id 164 nunique - 15826 shape (16238, 15)
well_id 176 nunique - 1136 shape (1174, 15)


Unnamed: 0,id,well id,"depth, m",bk,GZ1,GZ2,GZ3,GZ4,GZ5,GZ7,DGK,NKTD,NKTM,NKTR,ALPS
30,31,47,2468.231,0.449133,0.376967,0.127989,0.531776,0.253316,0.911877,0.417324,0.769427,0.381311,0.558993,0.422383,0.774000
31,32,47,2468.231,0.449133,0.376967,0.127989,0.531776,0.253316,0.911877,0.417324,0.769427,0.381311,0.558993,0.422383,0.774000
39,40,47,2468.947,0.458829,0.316576,0.117564,0.435454,0.354642,0.893757,0.369689,0.515214,0.381458,0.543162,0.411672,0.774000
40,41,47,2468.947,0.458829,0.316576,0.117564,0.435454,0.354642,0.893757,0.369689,0.515214,0.381458,0.543162,0.411672,0.774000
60,61,47,2470.737,0.367247,0.276679,0.119591,0.552777,0.114990,0.223684,0.645786,0.366495,0.386175,0.563918,0.425497,0.368554
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37520,37521,176,2165.005,0.399551,0.262832,0.127844,0.782174,0.141857,0.368494,0.852176,0.494653,0.347186,0.498485,0.424384,0.162172
37528,37529,176,2165.721,0.423699,0.247487,0.131655,0.675283,0.125893,0.371813,0.841864,0.412178,0.363219,0.513265,0.496400,0.132560
37529,37530,176,2165.721,0.423699,0.247487,0.131655,0.675283,0.125893,0.371813,0.841864,0.412178,0.363219,0.513265,0.496400,0.132560
37565,37566,176,2168.943,0.397237,0.279274,0.154195,0.733595,0.117446,0.357766,0.937225,0.344939,0.393485,0.527332,0.379199,0.195392


In [204]:
df2['well id'].isna().count()

21881

In [190]:
df1=df

In [200]:
df2=df1[df1.duplicated(['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS'], keep=False)]




In [None]:
df.drop_duplicates(subset=['well id','depth, m','bk','GZ1','GZ2','GZ3','GZ4','GZ5','GZ7','DGK','NKTD','NKTM','NKTR','ALPS'], keep=False)