# Import packages

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats # check correlation
from sklearn.neighbors import LocalOutlierFactor # find local outlier

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, accuracy_score

from collections import Counter

import xgboost as xgb

from plotly import express as px

import pickle

# Get data

In [None]:
# Input files: 25 wafers for training, 1 wafer held out for validation.
train_file = "25Wafer_10LayerVariation_SNR30db_train.csv"
test_file = "1Wafer_10LayerVariation_SNR30db_validate.csv"

# Both files are read the same way: comma separated, the first four lines
# skipped, and no header row (columns get integer labels).
_csv_options = dict(sep=',', skiprows=range(4), header=None)
train_data = pd.read_csv(train_file, **_csv_options)
test_data = pd.read_csv(test_file, **_csv_options)

# Data Mining

### missing value

In [None]:
# Total number of missing cells in each dataset (summed over all columns).
missing_in_train = train_data.isnull().sum().sum()
missing_in_test = test_data.isnull().sum().sum()
print(missing_in_train, missing_in_test)

In [None]:
# Drop, in place, every training row that contains at least one missing value.
# The remaining rows keep their original index labels.
train_data.dropna(axis=0, how='any', inplace=True)
train_data

In [None]:
# Column 601 holds the value to predict; every other column is a feature.
target = train_data[601]
df_train = train_data.drop(labels=[601], axis=1)

In [None]:
# Per-column summary statistics (count, mean, std, min, quartiles, max),
# including the target column 601.
describes = train_data.describe(percentiles=[0.25, 0.5, 0.75])
describes

In [None]:
# Average every feature column over all samples sharing the same target value;
# the result is indexed by the distinct values of column 601.
rows_by_target = train_data.groupby([601])
mean_data_grouped = rows_by_target.mean()
mean_data_grouped

In [None]:
# Distinct target values, converted to plain Python ints.
indexes = list(map(int, mean_data_grouped.index.values))

There are 131 classes from 200 to 1500

In [None]:
#describes.loc['mean'][:-1].plot.hist()
# Box plot of the per-column means; the last entry (the target column) is
# left out so only the feature columns are shown.
mean_data = describes.loc['mean']
mean_data.iloc[:-1].plot.box()

### Question: Is there a trend within each row of data?

In [None]:
def row_data(row, df):
  """Return the row at position `row` of `df` as a one-column DataFrame.

  The result is indexed by the column labels of `df`, so each original
  column becomes one row of the returned frame.
  """
  selected = df.iloc[row]
  return pd.DataFrame(selected, index=df.columns)

def scatter(data, color):
  """Draw `data` as a seaborn scatter plot in a single colour.

  Returns whatever `sns.scatterplot` returns, so calls can be chained or
  the result inspected by the caller.
  """
  single_colour_palette = [color]
  return sns.scatterplot(data=data, palette=single_colour_palette)

Check the trends of the column-wise mean, max and min, and of one sample row.

In [None]:
# Column-wise mean / max / min of the feature columns only. The last column of
# `describes` is the target (601), so it is sliced off both the values and the
# index. Using the full `describes.columns` as the index would add a spurious
# NaN row for the target column.
feature_columns = describes.columns[:-1]
describes_mean = pd.DataFrame(data=describes.loc['mean'][:-1], index=feature_columns)
describes_max = pd.DataFrame(data=describes.loc['max'][:-1], index=feature_columns)
describes_min = pd.DataFrame(data=describes.loc['min'][:-1], index=feature_columns)

In [None]:
# Overlay the mean, max and min trends with one individual sample (row 107),
# each in its own colour.
for frame, colour in (
    (describes_mean, 'red'),
    (describes_max, 'green'),
    (describes_min, 'blue'),
    (row_data(107, df_train), 'yellow'),
):
    scatter(frame, colour)

Check how the trend differs between depths.

In [None]:
# Overlay the average feature profile of four target values (200, 600, 700 and
# 1500), each in its own colour.
for depth, colour in ((200, 'yellow'), (600, 'green'), (700, 'blue'), (1500, 'red')):
    scatter(pd.DataFrame(mean_data_grouped.loc[depth, :]), colour)

## Conclusion

Yes, there is a trend within each row, and samples with similar depths have similar graphs.

Next step: find out outliers in each row not following the trend.

## Other algorithms tried for creating new features and reducing dimensions

1) Polynomial Regression(degree ranges from 2 to 60) for each row, use the coefficient data as training data. The prediction result is not better.

2) Polynomial Regression(degree ranges from 2 to 60) for the mean value of grouped data by depth (mean_data_grouped), use the coefficient data as training data.

## Outliers

### Ways tried for filling outliers:

1) Find the outliers in each row using LocalOutlierFactor, then replace each outlier with the previous non-outlier value in the row (forward fill) and, for any still missing, with the next non-outlier value (backward fill). This works because adjacent columns hold similar values within a row, as shown in the graphs.

In [None]:
# An outlier is a value far away from its local neighbourhood within a row;
# it is replaced with a missing value (NaN).

lof = LocalOutlierFactor()
def replace_outlier_None(df):
  """Return a copy of `df` in which per-row outliers are replaced by NaN.

  Each row is treated as a one-dimensional sample set and passed to the
  module-level `lof` detector. Values labelled 1 by `fit_predict` are kept;
  all others become NaN. `df` itself is not modified.

  Rows are addressed by position, so the function also works when the index
  of `df` is not 0..n-1 (for example after `dropna`).

  Args:
    df: DataFrame of numeric values, one sample per row.

  Returns:
    A float DataFrame with the same index and columns as `df`.
  """
  values = df.to_numpy(dtype=float, copy=True)
  for position in range(values.shape[0]):
    # fit_predict expects a 2-D array of samples: one value per sample here.
    labels = lof.fit_predict(values[position].reshape(-1, 1))
    values[position, labels != 1] = np.nan
  return pd.DataFrame(values, index=df.index, columns=df.columns)

# The following code takes a long time to run and its result (replace_outlier)
# is not needed by the rest of the notebook, so it is disabled by keeping it
# inside a string literal.
# It masks per-row outliers, then fills each gap along the row: forward fill
# first, backward fill for anything still missing.
# NOTE(review): if re-enabled as written, the first line rebinds the name
# replace_outlier_None from the function to its result, so the function can
# no longer be called afterwards.
'''
replace_outlier_None = replace_outlier_None(df_train)
replace_outlier = replace_outlier_None.fillna(method='ffill', axis=1)
replace_outlier = replace_outlier.fillna(method='bfill', axis=1)
'''

2) Set outlier = median of the column.

3) Comparing the graphs of the samples with the greatest and the smallest depth, the main difference lies between columns 50 and 600, and most outliers lie before column 100 or after column 550, so remove the first 100 features or the last 50 features. 

# Build models

## Questions and Solutions

Question: 
Classifier models or Regression Models?

Thinking:
1. The target is numeric numbers correlated with features.
2. The number of samples 90,000+ is small for classification with 130+ categories.

Answer:
Regression model is better.


Question: 
The depth predicted by a regression model is a continuous number; can it be converted to one of the discrete integer depths, such as 200, 500 or 1500?

Solution:
Compare the predicted depth with each integer depth that occurs in the training data, and select the one with the smallest absolute difference as the predicted result.

## Linear Regression

### mean absolute error

In [None]:
# Baseline model: ordinary linear regression on a random 80/20 split of the
# training data, scored with mean absolute error on both parts.
lr = LinearRegression()
X_train, X_val, y_train, y_val = train_test_split(
    df_train.values, target.values, test_size=0.2
)
lr.fit(X_train, y_train)

train_mae = mean_absolute_error(y_train, lr.predict(X_train))
validation_mae = mean_absolute_error(y_val, lr.predict(X_val))
print("train error: ", train_mae)
print("validation error: ", validation_mae)

Compare regression result using original dataset with result using outlier replaced datasets

In [None]:
# This step needs the replace_outlier DataFrame; the code that builds it is
# disabled, so this comparison is disabled too (kept inside a string literal).
# It repeats the linear-regression fit and MAE report on the outlier-replaced
# features, reusing the already created `lr` model.
'''
X_train, X_val, y_train, y_val = train_test_split(replace_outlier.values, target, test_size=0.2)
lr.fit(X_train, y_train)

print("train error: ", mean_absolute_error(y_train, lr.predict(X_train)))
print("validation error: ", mean_absolute_error(y_val, lr.predict(X_val)))
'''

Using the outlier-replaced dataset does not improve the model result, so we'll use the original dataset.

## XGB Cross Validation and Hyperparameter Tuning

0) n_estimator: [300, 350, 400, 450, 500, 550] - 300 and 400 are the best

1) max_depth: range(4,12) ---> the best is 9

2) min_child_weight: [3, 5, 7, 9] ---> the best is 7

3) gamma: [0.07, 0.08, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6] --> the best is 0.07

4) 'subsample': [0.6, 0.7, 0.8, 0.9], 'colsample_bytree': [0.6, 0.7, 0.8, 0.9] --> the best is subsample=0.8, colsample_bytree=0.9

5) 'reg_alpha': [0.02, 0.04, 0.05, 0.1, 1, 2, 3], 'reg_lambda': [0.01, 0.03, 0.04, 0.05, 0.06, 0.1, 1] --> the best is reg_alpha=0.04, reg_lambda=0.04

6) 'learning_rate': [0.01, 0.05, 0.07, 0.09, 0.1, 0.15, 0.17, 0.2, 0.23] --> the best is 0.09

In [None]:
# Grid search over (number of boosting rounds, max_depth) with 3-fold
# cross-validation, scored by mean absolute error.
X_train, X_val, y_train, y_val = train_test_split(df_train.values, target.values, test_size=0.2)

# Base booster parameters. The number of boosting rounds is NOT a booster
# parameter in the native xgboost API; it is passed to xgb.cv through
# `num_boost_round` below.
# NOTE(review): 'lambda'/'reg_lambda' and 'eta'/'learning_rate' look like
# aliases of each other with different values - confirm which one is intended.
# NOTE(review): 'silent' is kept as in the original; newer xgboost releases
# use 'verbosity' instead - confirm against the installed version.
params = {
    'max_depth': 9,
    'min_child_weight': 7,
    'gamma': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.04,
    'reg_lambda': 0.04,
    'learning_rate': 0.09,
    'booster': 'gbtree',
    'objective': 'reg:gamma',
    'lambda': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4
}

dtrain = xgb.DMatrix(X_train, y_train)

# Every combination of round budget and tree depth to evaluate.
gridsearch_params = [
    (n_estimator, max_depth)
    for n_estimator in range(300, 600, 20)
    for max_depth in range(5, 20, 2)
]

min_mae = float("Inf")
best_params = None
for n_estimator, max_depth in gridsearch_params:
    print("CV with n_estimator={}, max_depth={}".format(
                             n_estimator, max_depth))
    # Update our parameters
    params['max_depth'] = max_depth

    # Run CV: n_estimator is the upper bound on boosting rounds; early
    # stopping ends a run once test MAE has not improved for 10 rounds.
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=n_estimator,
        seed=42,
        nfold=3,
        metrics={'mae'},
        early_stopping_rounds=10
    )

    # Update best MAE
    mae_per_round = cv_results['test-mae-mean']
    mean_mae = mae_per_round.min()
    # argmin is a 0-based position, so the number of rounds is one more.
    boost_rounds = mae_per_round.argmin() + 1
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = [n_estimator, max_depth]
print("Best params: {}, MAE: {}".format(best_params, min_mae))

## XGB Regression

Use the best hyperparameters to build the XGB model

In [None]:
# Train the final booster on a fresh random 80/20 split.
X_train, X_val, y_train, y_val = train_test_split(df_train.values, target.values, test_size=0.2)
dtrain = xgb.DMatrix(X_train, y_train)

# Booster parameters chosen from the tuning runs. The number of trees is not
# a booster parameter in the native API; it is set by `num_boost_rounds`.
# NOTE(review): 'lambda'/'reg_lambda' and 'eta'/'learning_rate' look like
# aliases of each other with different values - confirm which one is intended.
# NOTE(review): 'silent' is kept as in the original; newer xgboost releases
# use 'verbosity' instead - confirm against the installed version.
params = {
    'max_depth': 9,
    'min_child_weight': 7,
    'gamma': 0.07,
    'subsample': 0.8,
    'colsample_bytree': 0.9,
    'reg_alpha': 0.04,
    'reg_lambda': 0.04,
    'learning_rate': 0.09,
    'booster': 'gbtree',
    'objective': 'reg:gamma',
    'lambda': 3,
    'silent': 1,
    'eta': 0.1,
    'seed': 1000,
    'nthread': 4
}

num_boost_rounds = 300
# Pass the dict itself: in Python 3 `params.items()` is a view object, not
# the list of pairs the old Python 2 idiom relied on.
model = xgb.train(params, dtrain, num_boost_rounds)

### mean absolute error

In [None]:
# Score the trained booster on both parts of the split. The DMatrix objects
# are rebuilt without labels because they are only used for prediction.
dtrain = xgb.DMatrix(X_train)
dval = xgb.DMatrix(X_val)

train_predictions = model.predict(dtrain)
validation_predictions = model.predict(dval)
print("train error: ", mean_absolute_error(y_train, train_predictions))
print("validation error: ", mean_absolute_error(y_val, validation_predictions))

### Functions changing continuous numeric predicted depth into integer depth

In [None]:
def get_integer_depth(train_data, model, depths=None):
  """Predict with `model` and snap each prediction to the nearest known depth.

  Args:
    train_data: Input accepted by `model.predict`.
    model: Object with a `predict` method returning one continuous value
      per sample.
    depths: Optional iterable of candidate depths. Defaults to the index of
      the module-level `mean_data_grouped`, i.e. the distinct target values
      seen in the training data.

  Returns:
    A list with one integer depth per prediction: the candidate with the
    smallest absolute difference to the predicted value. On an exact tie the
    candidate that comes first in `depths` is chosen.
  """
  if depths is None:
    depths = mean_data_grouped.index.values
  candidates = np.asarray([int(d) for d in depths])
  predictions = np.asarray(model.predict(train_data), dtype=float).ravel()
  if predictions.size == 0:
    return []
  # Distance from every prediction to every candidate, computed in one step
  # instead of building and sorting a DataFrame per prediction.
  distances = np.abs(predictions[:, None] - candidates[None, :])
  nearest = distances.argmin(axis=1)
  return [int(d) for d in candidates[nearest]]

In [None]:
def prediction_result(test_data, target, predict_groups):
  """Compare predicted depths with true depths and summarise the errors.

  Args:
    test_data: Unused; kept so existing callers keep working.
    target: True depths, one per sample (list, array or Series).
    predict_groups: Predicted depths, in the same order as `target`.

  Returns:
    A tuple of three DataFrames:
      * per-sample frame with 'predict_depth', 'true_depth', 'difference'
        (predicted minus true);
      * counts per signed difference ('predict_true_difference',
        'number_of_samples');
      * counts per absolute difference ('absolute_difference',
        'number_of_samples').
  """
  # Use plain arrays so the two inputs are paired by position. Assigning a
  # Series directly would align on its index and produce NaN whenever that
  # index is not 0..n-1.
  df = pd.DataFrame({
      'predict_depth': np.asarray(predict_groups),
      'true_depth': np.asarray(target),
  })
  df['difference'] = df['predict_depth'] - df['true_depth']

  differences_counts = df['difference'].value_counts()
  difference_result = pd.DataFrame({
      'predict_true_difference': differences_counts.index,
      'number_of_samples': differences_counts.values,
  })

  abs_differences_counts = df['difference'].abs().value_counts()
  df_absolute_difference = pd.DataFrame({
      'absolute_difference': abs_differences_counts.index,
      'number_of_samples': abs_differences_counts.values,
  })
  return df, difference_result, df_absolute_difference

In [None]:
# Build the prediction input from the validation features (everything except
# target column 601). It gets its own name so the training DataFrame
# `train_data` is not overwritten by a DMatrix of test data.
dtest = xgb.DMatrix(test_data.drop(columns=[601]).values)
predict_groups = get_integer_depth(dtest, model)

In [None]:
# Summarise the prediction errors on the validation wafer, then express each
# error count as a percentage of all validation samples plus a running total
# (in the row order returned by prediction_result).
prediction_df, difference_result, df_absolute_difference = prediction_result(
    test_data, test_data[601], predict_groups
)
share_of_samples = difference_result['number_of_samples'] * 100/len(test_data)
difference_result['percentage'] = share_of_samples
difference_result['cum_percentage'] = difference_result['percentage'].cumsum(skipna=False)
difference_result

In [None]:
# Histograms of the true and then the predicted depths, drawn on the same axes.
for depth_column in ('true_depth', 'predict_depth'):
    plt.hist(prediction_df[depth_column])

In [None]:
# Save the per-sample predictions, true depths and their differences.
exported_columns = ['predict_depth', 'true_depth', 'difference']
prediction_df.loc[:, exported_columns].to_csv('prediction.csv')

In [None]:
# Save the error-distribution table.
performance_file = 'prediction_performance.csv'
difference_result.to_csv(performance_file)

In [None]:
# The error-distribution table belongs in the performance file. Writing it to
# 'prediction.csv' would overwrite the per-sample predictions saved earlier
# under that name.
difference_result.to_csv('prediction_performance.csv')

In [None]:
# Persist the trained booster. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('model', 'wb') as model_file:
    pickle.dump(model, model_file)

## Conclusion

34% of the predicted depths equal the actual depth, 79% are within 10 of the actual depth, and 94% are within 20 of the actual depth.

# Visualizations of predicted and true depth

## Comparison of predicted depth and true depth

In [None]:
# scatter plot
prediction_df['absolute_difference<20'] = abs(prediction_df['difference']) < 20
fig = px.scatter(prediction_df, x='true_depth', y='predict_depth', marginal_x='histogram', marginal_y='histogram', 
                 color='absolute_difference<20', trendline='ols')
# hist grams
fig.update_traces(histnorm='probability', selector={'type':'histogram'})
# y = x line
y = test_data.iloc[:, :-1].values
fig.add_shape( type="line", line=dict(dash='dash'), x0=y.min(), y0=y.min(), x1=y.max(), y1=y.max())

## Percentage of samples with different distances from predicted depth to true depth

In [None]:
# Order the table by absolute error, then add each error's share of the
# validation samples and running totals of both counts and shares.
df_absolute_difference = df_absolute_difference.sort_values(by=['absolute_difference'])
n_test_samples = len(test_data)
df_absolute_difference['percentage'] = df_absolute_difference['number_of_samples'] * 100/n_test_samples
for running_column, source_column in (
    ('total_samples', 'number_of_samples'),
    ('total_percentage', 'percentage'),
):
    df_absolute_difference[running_column] = df_absolute_difference[source_column].cumsum(skipna=False)

In [None]:
# Bar chart of the cumulative sample count for the seven smallest absolute
# errors; bar colour and hover text show the cumulative percentage.
smallest_errors = df_absolute_difference.iloc[:7, :]
fig = px.bar(
    smallest_errors,
    x='absolute_difference',
    y='total_samples',
    hover_data=['total_percentage'],
    color='total_percentage',
    labels={'total_samples':'total samples'},
    height=400,
)
fig.show()