In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from feature_engine.imputation import MeanMedianImputer

from catboost import CatBoostRegressor, Pool, CatBoostClassifier
from sklearn import metrics
from sklearn.metrics import r2_score, f1_score

from feature_engine.outliers import OutlierTrimmer

In [3]:
df = pd.read_csv(r'C:\Users\1000257489\Documents\2021\20211022 Paris-C GC2 HDD ACC Down-trend Analysis\wasabi_V67\Func_wasabiV6_G7.zip', 
                 parse_dates=['DET_Date'])
df.shape

(6301666, 192)

In [4]:
df.head()

Unnamed: 0,enddate,testpgmver,hddsn,mtype,pfcode,asmline,dpcode,disksrc,diskopt,hgadwpn,...,IW_MD,IW_ID,KA_OD,KA_MD,KA_ID,IW_Peak_OD,IW_Peak_MD,IW_Peak_ID,pACC,iACC
0,20210903100741,PCMP061S,3RJESATA,PCMJ,0,LT,,SZB,1,0F35832,...,42.0,41.0,59.226768,51.000828,51.000828,106.0,93.0,92.0,101.186498,103.222652
1,20210903100741,PCMP061S,3RJESATA,PCMJ,0,LT,,SZB,1,0F35832,...,42.0,40.0,55.936392,51.000828,49.35564,102.0,93.0,89.0,102.13645,103.452345
2,20210903100741,PCMP061S,3RJESATA,PCMJ,0,LT,,SZB,1,0F35832,...,43.0,40.0,55.936392,52.646016,49.35564,102.0,96.0,89.0,105.281986,106.969991
3,20210903100741,PCMP061S,3RJESATA,PCMJ,0,LT,,SZB,1,0F35832,...,43.0,38.0,52.646016,52.646016,46.065264,96.0,96.0,84.0,106.319007,107.638775
4,20210903100741,PCMP061S,3RJESATA,PCMJ,0,LT,,SZB,1,0F35832,...,44.0,38.0,51.000828,52.646016,46.065264,93.0,97.0,84.0,103.252618,103.584029


In [5]:
df = df.select_dtypes(exclude='O')
df.shape

(6301666, 171)

In [6]:
# df.groupby('DET_Date')['DET_Date'].count().to_csv('list.csv')

In [7]:
df = df.groupby('DET_Date').filter(lambda x: len(x) >= 1000)
df.shape

MemoryError: Unable to allocate 7.65 GiB for an array with shape (163, 6301666) and data type float64

### Features Keeping

In [14]:
pd.Series(df.columns.tolist()).to_csv('cols.csv')

In [17]:
'func_date' in df.columns

False

In [18]:
# drop_vars = ['enddate','wafernum','hddcycle','asmdatetime','HD','u_fail_hd','DET_FW']
# df.drop(drop_vars, axis=1, inplace=True)

In [19]:
missing_var = [var for var in df.columns.tolist() if df[var].isnull().mean() > 0.5]
len(missing_var)

3

In [20]:
constant_var = [var for var in df.columns.tolist() if df[var].std() == 0]
len(std_var)

0

In [22]:
remove_var = missing_var + ['ADCKBpi_ID','ADCKBpi_MD','ADCKBpi_OD','ADCKTpi_ID','ADCKTpi_MD','ADCKTpi_OD','iACC', 'pACC', 
                            'DET_Date','sqzSER2R_ID','sqzSER2R_MD','sqzSER2R_OD','BPISER_MD', 'BPISER_OD', 'BPISER_ID',
                            'Final_SER']
feat = df.columns.tolist()
for var in remove_var:
    feat.remove(var)

len(feat)

145

### Imput data calculation

In [23]:
'DET_Date' in feat

False

In [25]:
input_df = pd.DataFrame()
for i in feat:
    input_df[i + '_P5'] = df.groupby('DET_Date')[i].quantile(.05)
    input_df[i + '_P95'] = df.groupby('DET_Date')[i].quantile(.95)
    input_df[i + '_median'] = df.groupby('DET_Date')[i].median()
    input_df[i + '_std'] = df.groupby('DET_Date')[i].std()

input_df.to_csv('input_G8.csv')
input_df.shape

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  


(120, 580)

In [26]:
acc_df = pd.DataFrame()
acc_df['Freq'] = df.groupby('DET_Date')['pACC'].count()
acc_df['pACC_mean'] = df.groupby('DET_Date')['pACC'].mean()
acc_df['pACC_median'] = df.groupby('DET_Date')['pACC'].median()

acc_df.to_csv('acc_G8.csv')

In [27]:
df_1 = pd.concat([acc_df, input_df], axis=1)

df_1.shape

(120, 583)

### added for acc by wafer model

In [None]:
# wafer data get
wf_df = pd.read_csv('D:\Data warehouse\PCM\Wafer\pcm_fact_data_r.zip')
wf_df.shape

In [None]:
remove_vars = list(wf_df.columns[wf_df.columns.str.contains('time|pro|FW|Error', na=False)]) + ['head_type_last','PRODUCT','head_type','LETA','High_KTCRA','Etch_G','MB','Group','Group_F','plot']

wf_df.drop(remove_vars, axis=1, inplace=True)

In [None]:
df = pd.merge(df_1, wf_df, left_on='wafernum', right_on='WAFERNUM', how='left')
df.shape

In [None]:
missing_var = [var for var in df.columns.tolist() if df[var].isnull().mean() > 0.5]
len(missing_var)

In [None]:
df.drop(missing_var, axis=1, inplace=True)
df.shape

In [None]:
constant_var = [var for var in df.columns.tolist() if df[var].std() == 0]
len(constant_var)

In [None]:
df.drop(constant_var, axis=1, inplace=True)
df.shape

In [None]:
df.to_csv('pcm_geno_by_wafer_1101.csv', index=False)