# Feature Engineering

---

1. Import packages
2. Load data
3. Feature engineering

---

## 1. Import packages

In [None]:
import pandas as pd
import numpy as np

pd.set_option('display.precision', 10)


---
## 2. Load data

In [None]:
df = pd.read_csv('./clean_data_after_eda.csv')
df["date_activ"] = pd.to_datetime(df["date_activ"], format='%Y-%m-%d')
df["date_end"] = pd.to_datetime(df["date_end"], format='%Y-%m-%d')
df["date_modif_prod"] = pd.to_datetime(df["date_modif_prod"], format='%Y-%m-%d')
df["date_renewal"] = pd.to_datetime(df["date_renewal"], format='%Y-%m-%d')

In [None]:
df.head(3)

In [None]:
df.info()

---

## 3. Feature engineering

### Difference between off-peak prices in December and preceding January

Below is the code created by your colleague to calculate the feature described above. Use this code to re-create this feature and then think about ways to build on this feature to create features with a higher predictive power.

In [None]:
price_df = pd.read_csv('price_data.csv')
price_df["price_date"] = pd.to_datetime(price_df["price_date"], format='%Y-%m-%d')
price_df.head()

In [None]:
price_df.info()

In [None]:
price_df.id.nunique()

In [None]:
# Group off-peak prices by companies and month
monthly_price_by_id = price_df.groupby(['id', 'price_date']).agg({'price_off_peak_var': 'mean', 'price_off_peak_fix': 'mean'}).reset_index()

# Get january and december prices
jan_prices = monthly_price_by_id.groupby('id').first().reset_index()
dec_prices = monthly_price_by_id.groupby('id').last().reset_index()

# Calculate the difference
diff = pd.merge(dec_prices.rename(columns={'price_off_peak_var': 'dec_1', 'price_off_peak_fix': 'dec_2'}), jan_prices.drop(columns='price_date'), on='id')
diff['offpeak_diff_dec_january_energy'] = diff['dec_1'] - diff['price_off_peak_var']
diff['offpeak_diff_dec_january_power'] = diff['dec_2'] - diff['price_off_peak_fix']
diff = diff[['id', 'offpeak_diff_dec_january_energy','offpeak_diff_dec_january_power']]
diff.head()

In [None]:
diff.info()

In [None]:
data = pd.merge(df, diff, how="left", on="id")
data = data.drop(columns=['Unnamed: 0'])
data.head()

In [None]:
data.info()

# ______________________________________________________________________
# ======================================================================
# ______________________________________________________________________

# Churn Model

### Preparing data

In [None]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
data['channel_sales'] = le.fit_transform(data['channel_sales'])
data['has_gas'] = le.fit_transform(data['has_gas'])
data['origin_up'] = le.fit_transform(data['origin_up'])


In [None]:
data['join_days'] = data['date_end']  - data['date_activ']
data['join_days'] = data['join_days'].astype(str).str.split(pat=' ').str[0].astype(int) 

data['diff_modfi_act_days'] = data['date_modif_prod']  - data['date_activ']
data['diff_modfi_act_days'] = data['diff_modfi_act_days'].astype(str).str.split(pat=' ').str[0].astype(int) 

data['diff_renw_act_days'] = data['date_renewal']  - data['date_activ']
data['diff_renw_act_days'] = data['diff_renw_act_days'].astype(str).str.split(pat=' ').str[0].astype(int) 

data['diff_end_renw_year'] = data['date_end']  - data['date_renewal']
data['diff_end_renw_year'] = data['diff_end_renw_year'].astype(str).str.split(pat=' ').str[0].astype(int) / 365

data['diff_end_modif_days'] = data['date_end']  - data['date_modif_prod']
data['diff_end_modif_days'] = data['diff_end_modif_days'].astype(str).str.split(pat=' ').str[0].astype(int)

In [None]:
from sklearn.model_selection import train_test_split

train , test = train_test_split(data.drop(columns=['date_end','date_activ','date_modif_prod','date_renewal'])
                                 ,random_state=104, test_size=0.2, shuffle=True)
train.head()

In [None]:
x = train.drop(columns=['id','churn'])
y = train.churn

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=104, test_size=0.15, shuffle=True)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score as acc
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

## Modeling

#### 1. Start with a Dummy Model (np.rand) - Baseline Model

In [None]:
m = y_test.shape[0]
rand_y_test = np.random.randint(2,size= m)
rand_y_test

In [None]:
random_accuarcy = acc(y_test,rand_y_test)
random_accuarcy

In [None]:
random_f1 = f1(y_test,rand_y_test)
random_f1

#### 2. Simple Model (linear)
- Linear model
- calculate score
- calculate feature imporance
- Simple model with top 10/20 features

In [None]:
model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred), f1(y_test, y_pred))

In [None]:
feature_imp = pd.DataFrame({"features":list(x_train.columns), "coef":list(model.coef_[0])}) 

In [None]:
feature_imp.plot(x="features", y="coef", kind="bar")

In [None]:
feature_imp.sort_values("coef")

In [None]:
top_15_col = feature_imp[np.abs(feature_imp["coef"])>0.0000155863]["features"]

In [None]:
x = train[list(top_15_col)]
y = train.churn

x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=104, test_size=0.15, shuffle=True)

model = LogisticRegression()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred), f1(y_test, y_pred))

In [None]:
x.info()

#### 3. Simple Model with Balanced Dataset

First : Upsampling

In [None]:
from imblearn.over_sampling import SMOTE


print('Before UpSampling, the shape of train_X: {}'.format(x_train.shape))
print('Before UpSampling, the shape of train_y: {} \n'.format(y_train.shape))

print("Before UpSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before UpSampling, counts of label '0': {} \n".format(sum(y_train==0)))

sm = SMOTE(sampling_strategy = 1 ,k_neighbors = 5, random_state=1)   #Synthetic Minority Over Sampling Technique
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.ravel())


print("After UpSampling, counts of label '1': {}".format(sum(y_train_res==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_train_res==0)))



print('After UpSampling, the shape of train_X: {}'.format(x_train_res.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_train_res.shape))

In [None]:
from sklearn import metrics


model = LogisticRegression()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

Second : Downsampling

In [65]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
x_rus, y_rus = rus.fit_resample(x_train, y_train)


print("After UpSampling, counts of label '1': {}".format(sum(y_rus==1)))
print("After UpSampling, counts of label '0': {} \n".format(sum(y_rus==0)))



print('After UpSampling, the shape of train_X: {}'.format(x_rus.shape))
print('After UpSampling, the shape of train_y: {} \n'.format(y_rus.shape))

After UpSampling, counts of label '1': 965
After UpSampling, counts of label '0': 965 

After UpSampling, the shape of train_X: (1930, 15)
After UpSampling, the shape of train_y: (1930,) 



In [66]:
from sklearn import metrics


model = LogisticRegression()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5681688533941814 0.21554404145077719
[[892 694]
 [ 63 104]]
              precision    recall  f1-score   support

           0       0.93      0.56      0.70      1586
           1       0.13      0.62      0.22       167

    accuracy                           0.57      1753
   macro avg       0.53      0.59      0.46      1753
weighted avg       0.86      0.57      0.66      1753



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 4. Complex and Explainable Model (Tree Based)


First : Decision Tree

In [67]:
# DecisionTreeClassifier With original data
model = DecisionTreeClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.829435253850542 0.09667673716012085
[[1438  148]
 [ 151   16]]
              precision    recall  f1-score   support

           0       0.90      0.91      0.91      1586
           1       0.10      0.10      0.10       167

    accuracy                           0.83      1753
   macro avg       0.50      0.50      0.50      1753
weighted avg       0.83      0.83      0.83      1753



In [68]:
# DecisionTreeClassifier With Upsampling data
model = DecisionTreeClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.7974900171135196 0.1362530413625304
[[1370  216]
 [ 139   28]]
              precision    recall  f1-score   support

           0       0.91      0.86      0.89      1586
           1       0.11      0.17      0.14       167

    accuracy                           0.80      1753
   macro avg       0.51      0.52      0.51      1753
weighted avg       0.83      0.80      0.81      1753



In [69]:
# DecisionTreeClassifier With Downsampling data
model = DecisionTreeClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5145464917284654 0.17618586640851888
[[811 775]
 [ 76  91]]
              precision    recall  f1-score   support

           0       0.91      0.51      0.66      1586
           1       0.11      0.54      0.18       167

    accuracy                           0.51      1753
   macro avg       0.51      0.53      0.42      1753
weighted avg       0.84      0.51      0.61      1753



Second : Random Forest

In [70]:
# RandomForestClassifier With Original data
model = RandomForestClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.9070165430690246 0.0790960451977401
[[1583    3]
 [ 160    7]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1586
           1       0.70      0.04      0.08       167

    accuracy                           0.91      1753
   macro avg       0.80      0.52      0.52      1753
weighted avg       0.89      0.91      0.87      1753



In [71]:
# RandomForestClassifier With Upsampling data
model = RandomForestClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.87906446092413 0.09401709401709403
[[1530   56]
 [ 156   11]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1586
           1       0.16      0.07      0.09       167

    accuracy                           0.88      1753
   macro avg       0.54      0.52      0.51      1753
weighted avg       0.84      0.88      0.86      1753



In [72]:
# RandomForestClassifier With Downsampling data
model = RandomForestClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5670279520821448 0.18996798292422626
[[905 681]
 [ 78  89]]
              precision    recall  f1-score   support

           0       0.92      0.57      0.70      1586
           1       0.12      0.53      0.19       167

    accuracy                           0.57      1753
   macro avg       0.52      0.55      0.45      1753
weighted avg       0.84      0.57      0.66      1753



#### 5. Deeper Model (XGBoost,catboost)

First : XGBoost

In [73]:
# XGBClassifier With original data
model = XGBClassifier()
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8996006845407872 0.043478260869565216
[[1573   13]
 [ 163    4]]
              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1586
           1       0.24      0.02      0.04       167

    accuracy                           0.90      1753
   macro avg       0.57      0.51      0.50      1753
weighted avg       0.84      0.90      0.86      1753



In [74]:
# XGBClassifier With Upsampling data
model = XGBClassifier()
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8796349115801483 0.11715481171548119
[[1528   58]
 [ 153   14]]
              precision    recall  f1-score   support

           0       0.91      0.96      0.94      1586
           1       0.19      0.08      0.12       167

    accuracy                           0.88      1753
   macro avg       0.55      0.52      0.53      1753
weighted avg       0.84      0.88      0.86      1753



In [75]:
# XGBClassifier With Downsampling data
model = XGBClassifier()
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.5430690245293782 0.1917255297679112
[[857 729]
 [ 72  95]]
              precision    recall  f1-score   support

           0       0.92      0.54      0.68      1586
           1       0.12      0.57      0.19       167

    accuracy                           0.54      1753
   macro avg       0.52      0.55      0.44      1753
weighted avg       0.85      0.54      0.63      1753



Second : CatBoost

In [78]:
# CatBoostClassifier With original data
model = CatBoostClassifier(verbose=0)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.9053051911009697 0.03488372093023255
[[1584    2]
 [ 164    3]]
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      1586
           1       0.60      0.02      0.03       167

    accuracy                           0.91      1753
   macro avg       0.75      0.51      0.49      1753
weighted avg       0.88      0.91      0.86      1753



In [80]:
# CatBoostClassifier With Upsampling data
model = CatBoostClassifier(verbose=0)
model.fit(x_train_res,y_train_res)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.8876212207644039 0.10859728506787329
[[1544   42]
 [ 155   12]]
              precision    recall  f1-score   support

           0       0.91      0.97      0.94      1586
           1       0.22      0.07      0.11       167

    accuracy                           0.89      1753
   macro avg       0.57      0.52      0.52      1753
weighted avg       0.84      0.89      0.86      1753



In [81]:
# CatBoostClassifier With Downsampling data
model = CatBoostClassifier(verbose=0)
model.fit(x_rus,y_rus)
y_pred = model.predict(x_test)
print(acc(y_test,y_pred),
       f1(y_test, y_pred))

print(metrics.confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test, y_pred))

0.573873359954364 0.19763694951664876
[[914 672]
 [ 75  92]]
              precision    recall  f1-score   support

           0       0.92      0.58      0.71      1586
           1       0.12      0.55      0.20       167

    accuracy                           0.57      1753
   macro avg       0.52      0.56      0.45      1753
weighted avg       0.85      0.57      0.66      1753



### The Best Model is : CatBoostClassifier With Downsampling data