In [97]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil import rrule
import ahmedsabri
from ahmedsabri import *
%matplotlib inline

In [98]:
# Date window of the analysis; the monthly workbooks to load are derived from it.
start_date = datetime(year=2022, month=9, day=1)
end_date = datetime(year=2022, month=12, day=31)

In [99]:
def prepare(data, header_rows=3):
    """Turn a raw, row-oriented sheet into a frame with one column per variable.

    The sheet is transposed, the first ``header_rows`` rows of the transposed
    frame are merged into a single label per column (missing pieces count as
    empty strings, pieces are joined with one space, the result is stripped),
    and those header rows are then removed from the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw sheet. It is copied first, so the caller's frame is not modified.
    header_rows : int, default 3
        Number of leading rows (after transposing) that make up the labels.

    Returns
    -------
    pandas.DataFrame
        The remaining rows with a fresh 0..n-1 index and every ``'Nil'`` cell
        replaced by ``0``.
    """
    df = data.copy().transpose()
    # Build the labels from the header rows before those rows are cut away.
    header = df.iloc[:header_rows].fillna('').astype(str)
    df.columns = header.apply(' '.join).str.strip()
    df = df.iloc[header_rows:].reset_index(drop=True)
    # 'Nil' is the only placeholder translated here; other text cells are kept as they are.
    return df.replace('Nil', 0)

In [100]:
def to_float(df):
    """Cast every column that can be read as ``float`` to ``float``, in place.

    Columns whose content cannot be converted are left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for column in df.columns:
        try:
            df[column] = df[column].astype(float)
        except (ValueError, TypeError):
            # Only conversion failures are tolerated; anything else
            # (e.g. KeyboardInterrupt) must still propagate.
            continue
    return df

In [101]:
def to_numbers(df):
    """Coerce every column of ``df`` to a numeric dtype, in place.

    Cells that cannot be parsed as numbers become NaN. The frame that was
    passed in is modified and returned.
    """
    for name in df.columns:
        coerced = pd.to_numeric(df[name], errors='coerce')
        df[name] = coerced
    return df

In [102]:
def plotting(df):
    """Draw one line chart per column of ``df``.

    Each column gets its own 7x3 figure with the column label on the y axis
    and 'days' on the x axis. A column that cannot be plotted is skipped: a
    short notice is printed and its half-built figure is closed, so the
    remaining columns are still drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are plotted against its index.
    """
    for i in df.columns:
        fig = None
        try:
            fig, ax = plt.subplots(figsize=(7,3))
            ax.plot(df[i])
            ax.set_ylabel(i)
            ax.set_xlabel('days')
            plt.xticks(rotation=45)
            plt.show()
        except Exception as exc:
            # Best effort: report the column instead of hiding the failure,
            # and release the figure that was opened for it.
            print(f'Skipping plot for column {i!r}: {exc}')
            if fig is not None:
                plt.close(fig)

In [103]:
def date_plot(df,date):
    """Draw one scatter plot per column of ``df`` against ``date['date']``.

    Each column gets its own 10x5 figure. A column that cannot be plotted is
    skipped: a short notice is printed and its half-built figure is closed,
    so the remaining columns are still drawn.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns supply the y values.
    date : mapping or pandas.DataFrame
        Object whose ``'date'`` entry supplies the x values.
    """
    for column in df.columns:
        fig = None
        try:
            fig, ax = plt.subplots(figsize=(10,5))
            sns.scatterplot(x=date['date'], y=df[column], ax=ax)
            plt.xticks(rotation=45)
            plt.show()
        except Exception as exc:
            # Best effort: report the column instead of hiding the failure,
            # and release the figure that was opened for it.
            print(f'Skipping plot for column {column!r}: {exc}')
            if fig is not None:
                plt.close(fig)

In [104]:
def outlier_columns(df,a=4):
    """Return the columns of ``df`` holding at least one absolute z-score above ``a``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to screen.
    a : number, default 4
        Threshold applied to the absolute z-scores.

    Returns
    -------
    pandas.DataFrame
        ``df`` restricted to the flagged columns.
    """
    # z-scores are computed only for the columns that ``describe`` reports on.
    # NOTE(review): ``exclude=[datetime]`` passes the Python ``datetime`` class; presumably the
    # intent is to leave date columns out - confirm pandas interprets it that way.
    z_scores = stats.zscore(df[df.describe(exclude=[datetime]).columns])
    # NaN scores are replaced by 0.
    # NOTE(review): ``fillna`` assumes ``stats.zscore`` returned a pandas object - confirm for the installed SciPy.
    z_scores.fillna(0,inplace=True)
    abs_z_scores = np.abs(z_scores)
    # NOTE(review): the value of this expression is discarded; the same mask is rebuilt on the next line.
    (abs_z_scores>a).any(axis=0)
    outliers_columns=abs_z_scores.columns[(abs_z_scores>a).any(axis=0)]
    return df[outliers_columns]

In [105]:
def lab_assurance(df):
    """Fill the non-string cells of the first column from the cell above, in place.

    Walking down the rows from the second one, every cell of the first column
    that does not hold a string receives the value of the cell directly above
    it, so a label followed by blanks is repeated down to the next label.

    The loop covers the actual number of rows, so frames of any length
    (including empty ones) are handled without raising ``IndexError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fix; it is modified in place.

    Returns
    -------
    None
    """
    for i in range(1, len(df)):
        if not isinstance(df.iloc[i, 0], str):
            df.iloc[i, 0] = df.iloc[i - 1, 0]

In [106]:
# Build one dictionary per sheet: keys are 'MM_YY' strings, values are the raw sheets as DataFrames.
spreadsheet_root = r'\\192.168.21.16\Area 02A sharing\Spread Sheets'
df_NHT_macro_dict={}
df_Plat_macro_dict={}
df_lab_dict={}
for dt in rrule.rrule(rrule.MONTHLY, dtstart=start_date, until=end_date):
    month = '{:02d}'.format(dt.month)
    year = str(dt.year)[-2:]
    month_key = month + '_' + year
    # Workbooks sit in one folder per calendar year. An absolute path is built so the loop
    # neither depends on nor changes the working directory. The file name contains two
    # spaces before 'CCR'.
    workbook_path = os.path.join(spreadsheet_root, str(dt.year), f'{month} {year}  CCR NHT.xls')
    # Open the workbook once and read the three sheets from the same handle.
    with pd.ExcelFile(workbook_path) as workbook:
        df_NHT_macro_dict[month_key] = pd.read_excel(workbook, header=None, sheet_name='u-2,3', usecols='a:ah', nrows=157)
        df_Plat_macro_dict[month_key] = pd.read_excel(workbook, header=None, sheet_name="u-04", usecols="a:ah", nrows=306)
        df_lab_dict[month_key] = pd.read_excel(workbook, header=None, sheet_name="lab data input", usecols="a:ah", nrows=411)

## Preparing the NHT macro data

In [107]:
# Tidy every monthly NHT sheet: build the column labels, convert the 'NHT' column
# to datetimes and keep only the first of any columns that share a label.
for key in list(df_NHT_macro_dict):
    nht_frame = prepare(df_NHT_macro_dict[key])
    df_NHT_macro_dict[key] = nht_frame
    nht_frame['NHT'] = pd.to_datetime(nht_frame['NHT'], errors='raise')
    repeated_labels = nht_frame.columns.duplicated()
    if repeated_labels.any():
        df_NHT_macro_dict[key] = nht_frame.loc[:, ~repeated_labels].copy()

In [108]:
# Stack the monthly NHT frames on top of each other with a fresh running index.
# Reference: https://www.geeksforgeeks.org/merge-two-dataframes-with-same-column-names/
nht_monthly_frames = list(df_NHT_macro_dict.values())
df_NHT_macro = pd.concat(nht_monthly_frames, ignore_index=True)

In [109]:
# Display the combined NHT macro frame (all loaded months stacked) for a visual check.
df_NHT_macro

Unnamed: 0,NHT,DCS READINGS,Tags Description Units,02 - Naphtha Hydrotreating Unit,01FIC156 Straight run naphtha from unit1 m3/hr,01TI034 Straight run naphtha from unit1 °C,08FI055 Hydrocracked naphtha from unit8 m3/hr,08TI330 Hydrocracked naphtha from unit8 °C,11FIC060 Coker naphtha from unit11 m3/hr,11TI158 Coker naphtha from unit11 °C,...,02-PIC-021 FUG pressure to 02H01 bar(g),02-PI-065 02H01 Draft mmH2O,03-PI-045 03H01 Draft mmH2O,03-FI-102 Tk 08 sweet naphtha to splitter m3/hr,"03-HV-005 02-H-01 , 03- H-01 dumper opening %",03-TI-015 Charge heater bridge wall temp.,03-TXI-017 B 03-H-01 Skin,51-LI-018 TK08 level mm,51-LI-019 TK09 level mm,Unnamed: 21
0,2022-09-01,,,,80.015497,122.8592,43.21824,92.964183,18.283027,120.59863,...,0.922643,-15.170298,-11.982064,-0.9612,42.883056,700.858022,298.111894,8536.28894,12275.369466,
1,2022-09-02,,,,80.017882,122.801512,42.192187,92.969271,19.037102,119.679827,...,0.918095,-15.519479,-12.704383,-1.043187,44.540417,698.37057,294.968427,8543.090739,13185.194092,
2,2022-09-03,,,,105.599249,122.405444,41.502576,93.095237,17.989399,118.593135,...,0.81376,-13.644421,-10.781325,-1.066405,40.741667,702.775085,297.747627,8554.184814,14006.862834,
3,2022-09-04,,,,101.824333,121.225688,41.781809,93.065323,18.378125,119.742911,...,1.052393,-13.29199,-9.939367,2.131804,40,702.837056,294.78944,8526.189982,14875.502116,
4,2022-09-05,,,,96.105673,118.9104,41.428627,93.074888,18.354088,119.176163,...,1.089726,-13.170462,-9.046728,0.139124,40.772917,700.422312,294.111308,8406.395549,15873.230591,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,2022-12-27,,,,84.990911,116.645178,42.786861,88.908222,18.905978,116.798603,...,1.433618,-13.575067,-4.98023,-1.816175,46,677.566511,288.746988,10145.5059,11741.85734,
120,2022-12-28,,,,84.984877,116.722267,43.238135,89.259055,18.643396,116.737552,...,1.445534,-13.050281,-4.632741,-1.816873,46,686.204325,292.923996,10289.907715,11825.676473,
121,2022-12-29,,,,85.006165,117.2578,43.763889,89.539886,18.815272,117.678063,...,1.424779,-12.888574,-4.301465,-1.816412,45.559028,686.863711,292.47968,10423.116699,12000.566284,
122,2022-12-30,,,,84.989234,118.135467,42.80018,90.819901,17.673872,117.158798,...,1.447959,-12.024121,-3.785205,-1.816647,43.543056,689.611023,292.354881,10560.969767,12199.796834,


## Preparing the Platforming macro data

In [110]:
# Tidy every monthly platforming sheet: build the column labels, convert the
# 'DCS READINGS' column to datetimes and keep only the first of any columns that share a label.
for key in list(df_Plat_macro_dict):
    plat_frame = prepare(df_Plat_macro_dict[key])
    df_Plat_macro_dict[key] = plat_frame
    plat_frame['DCS READINGS'] = pd.to_datetime(plat_frame['DCS READINGS'], errors='raise')
    repeated_labels = plat_frame.columns.duplicated()
    if repeated_labels.any():
        df_Plat_macro_dict[key] = plat_frame.loc[:, ~repeated_labels].copy()

In [111]:
# Stack the monthly platforming frames on top of each other with a fresh running index.
# Reference: https://www.geeksforgeeks.org/merge-two-dataframes-with-same-column-names/
plat_monthly_frames = list(df_Plat_macro_dict.values())
df_Plat_macro = pd.concat(plat_monthly_frames, ignore_index=True)

In [112]:
# Copy the combined platforming frame to the system clipboard.
# NOTE(review): presumably a manual inspection aid that needs a clipboard-capable session - confirm it is still wanted.
df_Plat_macro.to_clipboard()

In [113]:
# Display the combined platforming macro frame (all loaded months stacked) for a visual check.
df_Plat_macro

Unnamed: 0,CCR PLATFORMING,DCS READINGS,Tags Description,"TI009 Rx#1 (04R01) outlet, temp","TI010 Rx#2 (04R02) outlet, temp","TI011 Rx#3 (04R03) outlet, temp","TI013 Rx#4 (04R04) outlet, temp",PI042 Reactor #4 outlet Pressure,"FIC001A Cat Collector Purge Gas, flow","TIC014 Cat Collector PG SP, temp",...,04-PDV-532 Lock Hopper Make up %,04-XV-555 Lock Hopper Make up %,04-PDV-525a Nitrogen to NSD %,Reactor effluemt out of packinox barg 04PI605,liquid HC feed to packinox barg 04PI606,recycle gas to packinox barg 04PI607,combined feed out of packinox barg 04PI608,combined feed steam pressure drop barg 04PI607-04PI608,effluent steam pressure drop barg 04PI042-04PI605,packinox skin temperature c 04TXI607
0,,2022-09-01,,403.664866,449.420067,473.896933,488.0254,3.169869,419.963947,150.385384,...,21.023167,14.743022,14.652693,2.648153,6.111168,4.475222,4.265268,0.209954,0.521734,398.6904
1,,2022-09-02,,404.520535,449.833734,474.075267,487.920666,3.170869,419.973413,150.386446,...,22.293501,14.603963,14.601692,2.648685,6.117735,4.478337,4.267969,0.210363,0.52219,397.66013
2,,2022-09-03,,404.568666,450.100802,474.794263,488.575798,3.172182,420.002828,150.391617,...,22.176838,14.741292,14.780708,2.649331,6.12003,4.483826,4.2727,0.211123,0.522828,398.273532
3,,2022-09-04,,402.903132,449.057732,474.317596,488.512199,3.170403,420.019358,150.395699,...,21.230224,14.679903,14.817503,2.648632,6.117793,4.478844,4.268173,0.210659,0.521756,399.011996
4,,2022-09-05,,405.2304,450.017337,474.362665,488.582401,3.175984,420.01076,150.39596,...,19.136942,14.614978,14.54226,2.65053,6.141641,4.490428,4.278352,0.212063,0.525463,399.14186
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
119,,2022-12-27,,410.0944,453.287799,476.716,491.2488,3.164244,420.00242,150.015695,...,22.710063,14.720603,42.294006,2.62108,6.396556,4.475604,4.262687,0.21292,0.543154,398.808067
120,,2022-12-28,,411.9084,454.645599,477.804469,491.478734,3.20836,419.94538,149.972365,...,10.452331,7.386633,22.011246,2.671613,6.432276,4.513549,4.300298,0.213243,0.536736,398.164263
121,,2022-12-29,,413.657,456.184399,479.806332,493.233467,3.163193,420.028954,150.019828,...,21.081445,13.282043,48.425753,2.625199,6.388609,4.470834,4.257198,0.21364,0.53798,398.97167
122,,2022-12-30,,414.129667,454.235068,478.836066,493.415003,3.171819,419.971258,149.988941,...,23.836027,14.692944,44.595477,2.626847,6.410259,4.488812,4.274822,0.213983,0.544989,399.6224


In [114]:
#pd.DataFrame(df_Plat_macro_dict['06_22'].columns.tolist())[1:].to_clipboard(header=False, index=False)

In [115]:
#for column in df_Plat_macro_dict['04_22'].columns:
    #if column not in df_Plat_macro_dict['06_22'].columns:
        #print(column)

In [116]:
# Disabled exploratory check, kept inside a bare string so it is not executed: it would print
# the column labels of the '04_22' and '06_22' platforming sheets side by side.
# NOTE(review): those two keys lie outside the date window configured at the top of the notebook.
'''
cols_4=[]
cols_6=[]
for column in df_Plat_macro_dict['04_22'].columns:
    cols_4.append(column)
for column in df_Plat_macro_dict['06_22'].columns:
    cols_6.append(column)

for k in zip(cols_4,cols_6):
    print(k)
    print('------------')
'''


"\ncols_4=[]\ncols_6=[]\nfor column in df_Plat_macro_dict['04_22'].columns:\n    cols_4.append(column)\nfor column in df_Plat_macro_dict['06_22'].columns:\n    cols_6.append(column)\n\nfor k in zip(cols_4,cols_6):\n    print(k)\n    print('------------')\n"

## Preparing the lab data

In [117]:
# Fill the blank cells in the first column of every raw lab sheet, in place.
for key in df_lab_dict.keys():
    try:
        lab_assurance(df_lab_dict[key])
    except IndexError:
        # If lab_assurance runs past the end of a sheet it raises IndexError; the rows
        # filled up to that point are kept, so simply move on to the next sheet.
        # Any other error is a real problem and is allowed to surface.
        continue

In [118]:
# Tidy every monthly lab sheet: build the column labels and convert the
# 'Date Design' column to datetimes. Columns that share a label are not removed at this step.
for key in list(df_lab_dict):
    lab_frame = prepare(df_lab_dict[key])
    df_lab_dict[key] = lab_frame
    lab_frame['Date Design'] = pd.to_datetime(lab_frame['Date Design'], errors='raise')

In [121]:
# Stack the monthly lab frames on top of each other with a fresh running index.
lab_monthly_frames = list(df_lab_dict.values())
df_lab = pd.concat(lab_monthly_frames, ignore_index=True)

In [122]:
# Summary of the combined lab frame: number of rows, number of columns and dtypes.
df_lab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124 entries, 0 to 123
Columns: 411 entries, Date Design to DCU Naptha Density 15C
dtypes: datetime64[ns](1), object(410)
memory usage: 398.3+ KB


# MERGING

In [123]:
# Outer-join the lab results with the NHT macro readings on their date columns,
# so dates present in only one of the two sources are kept.
# NOTE(review): pandas matches null join keys with each other - verify that neither
# date column contains NaT rows, otherwise extra rows are produced.
df_1 = df_lab.merge(df_NHT_macro, how='outer', left_on='Date Design', right_on='NHT')

In [124]:
# (rows, columns) of the lab + NHT merge result.
df_1.shape

(128, 564)

In [125]:
# Shape after dropping exact duplicate rows; a smaller row count than df_1.shape
# would mean the merge produced repeated rows.
df_1.drop_duplicates().shape

(128, 564)

In [128]:
# Column labels of the lab + NHT merge result.
df_1.columns

Index(['Date Design', 'NHT Feed 02 SC 01 Density 15C',
       'NHT Feed 02 SC 01 Sulfur', 'NHT Feed 02 SC 01 Sulfur',
       'NHT Feed 02 SC 01 Color', 'NHT Feed 02 SC 01 Dien value',
       'NHT Feed 02 SC 01 IBP', 'NHT Feed 02 SC 01 0.05',
       'NHT Feed 02 SC 01 0.1', 'NHT Feed 02 SC 01 0.3',
       ...
       '02-PIC-021 FUG pressure to 02H01 bar(g)',
       '02-PI-065 02H01 Draft mmH2O', '03-PI-045 03H01 Draft mmH2O',
       '03-FI-102 Tk 08 sweet naphtha to splitter m3/hr',
       '03-HV-005 02-H-01 , 03- H-01 dumper opening %',
       '03-TI-015 Charge heater bridge wall temp.',
       '03-TXI-017 B 03-H-01 Skin', '51-LI-018 TK08 level mm',
       '51-LI-019 TK09 level mm', ''],
      dtype='object', length=564)

In [130]:
# Outer-join the lab + NHT frame with the platforming macro readings on their date columns.
# NOTE(review): rows of df_1 that came only from the NHT side have no 'Date Design' value,
# so presumably they cannot line up with platforming rows here - confirm the row growth is expected.
df_2 = df_1.merge(df_Plat_macro, how='outer', left_on='Date Design', right_on='DCS READINGS')

In [131]:
# (rows, columns) of the full lab + NHT + platforming merge result.
df_2.shape

(136, 869)

In [132]:
# Shape after dropping exact duplicate rows; a smaller row count than df_2.shape
# would mean the merge produced repeated rows.
df_2.drop_duplicates().shape

(136, 869)

In [133]:
# Switch the working directory to the output folder: the CSV exports in the following
# cells use relative file names and are therefore written here.
os.chdir(r'\\192.168.21.16\Area 02A sharing\Spread Sheets\spread sheets database by python')

In [134]:
# Save the merged data without the index; the file name carries the ISO start and end dates of the window.
first_day = start_date.date().isoformat()
last_day = end_date.date().isoformat()
df_2.to_csv(f'NHT_CCR_data_complete {first_day}_to_{last_day}.csv', index=False)

## Converting all data to numbers

In [135]:
# dtype summary of the fully merged frame, to see how many columns are still stored as object.
df_2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136 entries, 0 to 135
Columns: 869 entries, Date Design to packinox skin temperature  c 04TXI607
dtypes: datetime64[ns](4), object(865)
memory usage: 924.4+ KB


In [138]:
# List the object-dtype columns, i.e. the ones still to be converted (dates and numbers are excluded).
# Selecting by label returns every column that shares a duplicated label, so this listing
# can be longer than the number of columns in df_2.
df_2[df_2.dtypes[df_2.dtypes=='object'].index].columns

Index(['NHT Feed 02 SC 01 Density 15C', 'NHT Feed 02 SC 01 Sulfur',
       'NHT Feed 02 SC 01 Sulfur', 'NHT Feed 02 SC 01 Sulfur',
       'NHT Feed 02 SC 01 Sulfur', 'NHT Feed 02 SC 01 Color',
       'NHT Feed 02 SC 01 Dien value', 'NHT Feed 02 SC 01 IBP',
       'NHT Feed 02 SC 01 0.05', 'NHT Feed 02 SC 01 0.1',
       ...
       '04-PDV-532 Lock Hopper Make up %', '04-XV-555 Lock Hopper Make up %',
       '04-PDV-525a Nitrogen to NSD %',
       'Reactor effluemt out of packinox barg 04PI605',
       'liquid HC feed to packinox  barg 04PI606',
       'recycle gas to packinox  barg 04PI607',
       'combined feed out of packinox  barg 04PI608',
       'combined feed steam pressure drop  barg 04PI607-04PI608',
       'effluent steam pressure drop  barg 04PI042-04PI605',
       'packinox skin temperature  c 04TXI607'],
      dtype='object', length=939)

In [139]:
# Work on a copy so that df_2 stays unchanged by the clean-up below.
df_3=df_2.copy()

In [144]:
# (rows, columns) before columns with repeated labels are dropped.
df_3.shape

(136, 869)

In [145]:
# Keep only the first column for every label that occurs more than once.
# .copy() makes the result an independent frame, so the column assignments that follow
# never write into a slice of the previous frame.
df_3 = df_3.loc[:, ~df_3.columns.duplicated()].copy()

In [146]:
# (rows, columns) after columns with repeated labels were dropped.
df_3.shape

(136, 836)

In [147]:
# Coerce every column still stored as object to numbers; cells that cannot be parsed become NaN.
column_dtypes = df_3.dtypes
object_columns = column_dtypes[column_dtypes == 'object'].index
for name in object_columns:
    df_3[name] = pd.to_numeric(df_3[name], errors='coerce')

In [148]:
# Save the numeric version of the data to the current working directory.
# NOTE(review): unlike the 'NHT_CCR_data_complete' export, the index is written as the
# first column here - confirm that this is wanted.
df_3.to_csv('orange_data.csv')