In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score,confusion_matrix

In [73]:
data = pd.read_csv("D:\Amzen Problem HFCL\\artifacts\\raw.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Pat_Id,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no
0,0,1,1745.71,2014-02-21,Y,Expired,Virtual,2014-03-23,1-1
1,1,2,35.0,2014-01-01,Y,Expired,Virtual,2014-01-31,1-2
2,2,3,50.0,2014-08-15,N,Active,Virtual,2014-09-14,1-3
3,3,3,50.0,2014-09-27,N,Active,Virtual,2014-12-26,1-3
4,4,3,50.0,2014-10-29,N,Active,Virtual,2015-01-27,1-3


## Create a new target variable 'Drop' based on 'Optin' and 'Card_Status' and 
###  We will assume that a patient is considered to have dropped if they have 'N' in 'Optin' or 'Expired' in 'Card_Status'
#### We will use 1 to indicate a drop and 0 to indicate no drop

In [74]:
data["Optin"]=data["Optin"].map({
    "N":1,
    "Y":0
})    


In [75]:
data.head()

Unnamed: 0.1,Unnamed: 0,Pat_Id,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no
0,0,1,1745.71,2014-02-21,0,Expired,Virtual,2014-03-23,1-1
1,1,2,35.0,2014-01-01,0,Expired,Virtual,2014-01-31,1-2
2,2,3,50.0,2014-08-15,1,Active,Virtual,2014-09-14,1-3
3,3,3,50.0,2014-09-27,1,Active,Virtual,2014-12-26,1-3
4,4,3,50.0,2014-10-29,1,Active,Virtual,2015-01-27,1-3


In [76]:
data.head()

Unnamed: 0.1,Unnamed: 0,Pat_Id,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no
0,0,1,1745.71,2014-02-21,0,Expired,Virtual,2014-03-23,1-1
1,1,2,35.0,2014-01-01,0,Expired,Virtual,2014-01-31,1-2
2,2,3,50.0,2014-08-15,1,Active,Virtual,2014-09-14,1-3
3,3,3,50.0,2014-09-27,1,Active,Virtual,2014-12-26,1-3
4,4,3,50.0,2014-10-29,1,Active,Virtual,2015-01-27,1-3


In [77]:
data['Drop'] = ((data['Optin'] == 'N') | (data['Card_Status'] == 'Expired')).astype(int)

In [78]:
data.head()

Unnamed: 0.1,Unnamed: 0,Pat_Id,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no,Drop
0,0,1,1745.71,2014-02-21,0,Expired,Virtual,2014-03-23,1-1,1
1,1,2,35.0,2014-01-01,0,Expired,Virtual,2014-01-31,1-2,1
2,2,3,50.0,2014-08-15,1,Active,Virtual,2014-09-14,1-3,0
3,3,3,50.0,2014-09-27,1,Active,Virtual,2014-12-26,1-3,0
4,4,3,50.0,2014-10-29,1,Active,Virtual,2015-01-27,1-3,0


In [79]:
data.drop(["Unnamed: 0"],axis=1)

Unnamed: 0,Pat_Id,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no,Drop
0,1,1745.71,2014-02-21,0,Expired,Virtual,2014-03-23,1-1,1
1,2,35.00,2014-01-01,0,Expired,Virtual,2014-01-31,1-2,1
2,3,50.00,2014-08-15,1,Active,Virtual,2014-09-14,1-3,0
3,3,50.00,2014-09-27,1,Active,Virtual,2014-12-26,1-3,0
4,3,50.00,2014-10-29,1,Active,Virtual,2015-01-27,1-3,0
...,...,...,...,...,...,...,...,...,...
55752,10005,35.00,2014-05-01,1,Active,Virtual,2014-05-31,2-10005,0
55753,10005,80.00,2014-07-01,1,Active,Virtual,2014-07-31,2-10005,0
55754,10005,45.00,2014-08-01,1,Active,Virtual,2014-08-31,2-10005,0
55755,10005,45.00,2014-09-01,1,Active,Virtual,2014-11-30,2-10005,0


In [80]:
data["Drop"].value_counts()

Drop
0    48003
1     7754
Name: count, dtype: int64

In [81]:
data["Drop"].value_counts()

Drop
0    48003
1     7754
Name: count, dtype: int64

### Separate features and target variable

In [82]:
X = data.drop(['Pat_Id','Drop'], axis=1)
y = data['Drop']

### Define numerical and categorical features

In [83]:
# Segregating numerical and categorical variables
categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns

In [84]:
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

In [85]:
X.columns

Index(['Unnamed: 0', 'Copay amount', 'Refill_date', 'Optin', 'Card_Status',
       'Channel', 'Next_refill_date', 'card no'],
      dtype='object')

In [86]:
numerical_features

Index(['Unnamed: 0', 'Copay amount', 'Optin'], dtype='object')

In [87]:
categorical_features

Index(['Refill_date', 'Card_Status', 'Channel', 'Next_refill_date', 'card no'], dtype='object')

In [88]:
X.head()

Unnamed: 0.1,Unnamed: 0,Copay amount,Refill_date,Optin,Card_Status,Channel,Next_refill_date,card no
0,0,1745.71,2014-02-21,0,Expired,Virtual,2014-03-23,1-1
1,1,35.0,2014-01-01,0,Expired,Virtual,2014-01-31,1-2
2,2,50.0,2014-08-15,1,Active,Virtual,2014-09-14,1-3
3,3,50.0,2014-09-27,1,Active,Virtual,2014-12-26,1-3
4,4,50.0,2014-10-29,1,Active,Virtual,2015-01-27,1-3


### Create the preprocessing pipelines for both numerical and categorical data

In [89]:
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())])
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [90]:
numerical_pipeline

In [91]:
categorical_pipeline

### Combine preprocessing steps

In [92]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)])


In [93]:
preprocessor

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [95]:
X_train=preprocessor.fit_transform(X_train)
X_test=preprocessor.transform(X_test)

In [96]:
pd.DataFrame(X_test)

Unnamed: 0,0
0,"(0, 0)\t-0.8522785967517718\n (0, 1)\t4.949..."
1,"(0, 0)\t0.9217834341959243\n (0, 1)\t-0.374..."
2,"(0, 0)\t-0.9570448155912512\n (0, 1)\t1.405..."
3,"(0, 0)\t-1.4422025637244609\n (0, 1)\t-0.30..."
4,"(0, 0)\t-1.5528133075318402\n (0, 1)\t-0.19..."
...,...
11147,"(0, 0)\t-0.25744274177414556\n (0, 1)\t-0.2..."
11148,"(0, 0)\t-1.3595087530084327\n (0, 1)\t-0.36..."
11149,"(0, 0)\t-0.7897919202332692\n (0, 1)\t-0.36..."
11150,"(0, 0)\t-1.6286677805194751\n (0, 1)\t-0.21..."


In [97]:
feature_names = preprocessor.get_feature_names_out()

In [98]:
feature_names

array(['num__Unnamed: 0', 'num__Copay amount', 'num__Optin', ...,
       'cat__card no_3-9874', 'cat__card no_4-1992',
       'cat__card no_4-9874'], dtype=object)

In [99]:
# X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
# X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

## Model Training Automation

In [100]:
models={
    'Random Forest':RandomForestClassifier(),
    'Logistic Regression':LogisticRegression(),
    'Decision Tree':DecisionTreeClassifier(),
    'KNeighbors Classifier':KNeighborsClassifier(n_neighbors=5,algorithm="auto"),
   }

In [101]:
def evaluate_model(X_train,y_train,X_test,y_test,models):
    report = {}
    for i in range(len(models)):
        model = list(models.values())[i]
        model.fit(X_train,y_train)
        y_test_pred =model.predict(X_test)
        test_model_score = accuracy_score(y_test,y_test_pred)
        report[list(models.keys())[i]] =  test_model_score
    return report

In [102]:
evaluate_model(X_train,y_train,X_test,y_test,models)

{'Random Forest': 1.0,
 'Logistic Regression': 1.0,
 'Decision Tree': 1.0,
 'KNeighbors Classifier': 0.9950681492109039}

In [103]:
classfier=RandomForestClassifier()

## we chose Random Forest classifier for model predection, It's very usefull for overfiting problems as compair to another models.

In [104]:
from sklearn.model_selection import RandomizedSearchCV

## Hypeparameter Tuning

In [105]:
params={'max_depth':[3,5,10,None],
              'n_estimators':[100,200,300],
               'criterion':['gini','entropy']
              }

In [106]:
cv=RandomizedSearchCV(classfier,param_distributions=params,scoring='accuracy',cv=5,verbose=3)
cv.fit(X_train,y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV 1/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=  25.7s
[CV 2/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=  25.1s
[CV 3/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=  23.6s
[CV 4/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=  22.4s
[CV 5/5] END criterion=entropy, max_depth=None, n_estimators=100;, score=1.000 total time=  19.6s
[CV 1/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.860 total time=   3.2s
[CV 2/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.860 total time=   3.1s
[CV 3/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.860 total time=   3.1s
[CV 4/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.860 total time=   3.3s
[CV 5/5] END criterion=entropy, max_depth=10, n_estimators=100;, score=0.860 total time=   3.3s
[CV 1/5] END criterion=entropy

In [107]:
cv.best_params_

{'n_estimators': 100, 'max_depth': None, 'criterion': 'entropy'}

In [108]:
model = RandomForestClassifier(n_estimators=100,criterion="gini")

In [109]:
model.fit(X_train,y_train)

### Evaluate the classifier

In [110]:
y_prediction=model.predict(X_test)
accuracy=accuracy_score(y_test, y_prediction)
classification_report=classification_report(y_test, y_prediction)
confusion_matrix=confusion_matrix(y_test,y_prediction)

In [111]:
print(f"Accuracy: {accuracy}")
print(f"classification report\n: {classification_report}")
print(f"confusion matrix\n: {confusion_matrix}")

Accuracy: 1.0
classification report
:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9632
           1       1.00      1.00      1.00      1520

    accuracy                           1.00     11152
   macro avg       1.00      1.00      1.00     11152
weighted avg       1.00      1.00      1.00     11152

confusion matrix
: [[9632    0]
 [   0 1520]]
