## Task **2.2** Machine **Learning**

We want to predict whether being in the exposed group has an impact on brand awareness.

The target – whether the conversion rate in the exposed group is greater than in the control group; the target variable is therefore binary, either "yes" or "no".
Variables/features – [experiment, device_make, day_of_week, hour, browser] are the features used to identify patterns that predict the target.

In [4]:
# import necessary library
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import preprocessing 
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
#import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
import sklearn.utils
import dvc.api
# suppress warnings 
import warnings
warnings.simplefilter("ignore")

In [5]:
# Resolve the DVC-tracked dataset at a pinned revision, then load and preview it.
# NOTE(review): the recorded run of this cell ended in PathMissingError for this
# path/repo pair, and `repo` is an absolute local path — confirm both.
path = 'data/AdSmartABdata2.csv'
repo = '/home/bethelhem/abtest-mlops'
version = 'v1'
data_url = dvc.api.get_url(path=path, repo=repo, rev=version)

df1 = pd.read_csv(data_url)
df1.head()

PathMissingError: The path 'data/AdSmartABdata2.csv' does not exist in the target repository '/home/bethelhem/abtest-mlops' neither as a DVC output nor as a Git-tracked file.

In [2]:
# Load the A/B test responses from the CSV (relative path) and preview the first rows.
df = pd.read_csv('../data/AdSmartABdata.csv')
df.head()

Unnamed: 0,auction_id,experiment,date,hour,device_make,browser,yes,no
0,0008ef63-77a7-448b-bd1e-075f42c55e39,exposed,2020-07-10,8,Generic Smartphone,Chrome Mobile,0,0
1,000eabc5-17ce-4137-8efe-44734d914446,exposed,2020-07-07,10,Generic Smartphone,Chrome Mobile,0,0
2,0016d14a-ae18-4a02-a204-6ba53b52f2ed,exposed,2020-07-05,2,E5823,Chrome Mobile WebView,0,1
3,00187412-2932-4542-a8ef-3633901c98d9,control,2020-07-03,15,Samsung SM-A705FN,Facebook,0,0
4,001a7785-d3fe-4e11-a344-c8735acacc2c,control,2020-07-03,15,Generic Smartphone,Chrome Mobile,0,0


##### preprocessing the data

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Parse the 'date' column into datetimes so the .dt accessor can be used in the next cell.
df['date']=pd.to_datetime(df['date'])
df.head()

In [None]:
# Derive the weekday name from 'date', append it as 'day_of_week',
# and discard the original 'date' column in the same step.
df = df.assign(day_of_week=df['date'].dt.day_name()).drop(columns='date')
df.head()

In [None]:
# Keep only the users who answered the questionnaire (either 'yes' or 'no' is 1).
answered_mask = (df['yes'] == 1) | (df['no'] == 1)
ml_data = df[answered_mask].reset_index(drop=True)
ml_data.shape

In [None]:
# Integer-encode each categorical column. The single encoder is re-fitted per
# column, so after this cell it only holds the mapping of the last column.
label_encoder = preprocessing.LabelEncoder()
for col in ('experiment', 'device_make', 'browser', 'day_of_week'):
    ml_data[col] = label_encoder.fit_transform(ml_data[col])


In [None]:
# Preview the frame after label encoding.
ml_data.head()

In [None]:
# The auction identifier is not used as a feature; remove the column.
ml_data = ml_data.drop(columns=['auction_id'])
ml_data.head()

In [None]:
# Min-max scale every column of ml_data and write the result back into the frame.
# NOTE(review): the scaler is fit on the whole frame before the train/validation/test
# split below, and the frame still contains the 'yes'/'no' answer columns at this
# point — confirm that fitting on all rows and all columns is intended.
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler()
ml_data[:] = scaling.fit_transform(ml_data[:])
ml_data.head()

In [None]:
# Shuffle the rows. A fixed seed (same value as the KFold splitter below) keeps
# the train/validation/test split reproducible across kernel restarts.
ml_data = sklearn.utils.shuffle(ml_data, random_state=0)


In [None]:
# Split the shuffled data into 70% training, 20% validation and 10% test sets.
# Positional slicing is used instead of np.split, which goes through
# DataFrame.swapaxes (deprecated in pandas 2.1) when handed a DataFrame.
n_rows = len(ml_data)
train_end = int(.7 * n_rows)
valid_end = int(.9 * n_rows)

train = ml_data.iloc[:train_end]
validate = ml_data.iloc[train_end:valid_end]
test = ml_data.iloc[valid_end:]

print(train.shape)
print(validate.shape)
print(test.shape)


In [None]:
# Feature columns that exist after preprocessing: 'date' was replaced by
# 'day_of_week' in an earlier cell, and the df.head() output shows no
# 'platform_os' column, so neither can be selected here.
feature_cols = ['experiment', 'day_of_week', 'hour', 'device_make', 'browser']
target_col = 'yes'

# Names used by the modelling, comparison and validation cells below.
train_invar, train_targ = train[feature_cols], train[target_col]
valid_invar, valid_targ = validate[feature_cols], validate[target_col]
test_invar, test_targ = test[feature_cols], test[target_col]

# Names originally defined by this cell, kept as aliases of the same objects.
X_train, Y_train = train_invar, train_targ
X_valid, Y_valid = valid_invar, valid_targ
X_test, Y_test = test_invar, test_targ

#### Modeling

In [None]:
# Shared 5-fold splitter (shuffled, fixed seed) passed to every cross-validation call below.
k_fold = KFold(n_splits=5, shuffle=True, random_state=0)

#### Logistic regression

In [None]:
from sklearn import linear_model
import scipy.stats as stat
import numpy as np
from sklearn.metrics import log_loss

class LogisticReg:
    """Logistic-regression wrapper that also reports per-coefficient z-scores and p-values.

    The statistics are derived in ``fit`` from the inverse of an information
    matrix evaluated at the fitted decision values.

    NOTE(review): the information matrix is built from the feature columns only
    (no intercept column) and the estimator is created with scikit-learn's
    default settings, so the p-values should be read as approximate — confirm
    before drawing conclusions from them.
    """

    def __init__(self):
        self.model = linear_model.LogisticRegression()

    def fit(self, X, y):
        """Fit the classifier and compute z-scores / p-values for its coefficients.

        Args:
            X: feature table. It must expose ``shape`` and support element-wise
                division and ``.T``; ``get_summary`` additionally reads ``X.columns``.
            y: target labels.

        Side effects:
            Stores ``X``, ``y``, ``z_scores``, ``p_values``, ``sigma_estimates``
            and ``F_ij`` on the instance.

        Raises:
            numpy.linalg.LinAlgError: if the information matrix is singular.
        """
        self.X = X
        self.y = y
        self.model.fit(X, y)

        # 2 * (1 + cosh(t)) equals 1 / (p * (1 - p)) for p = sigmoid(t), so dividing
        # X by it weights every row by the variance of its predicted probability.
        denom = 2.0 * (1.0 + np.cosh(self.model.decision_function(X)))
        denom = np.tile(denom, (X.shape[1], 1)).T
        F_ij = np.dot((X / denom).T, X)  # Fisher information matrix
        Cramer_Rao = np.linalg.inv(F_ij)  # inverse information matrix
        sigma_estimates = np.sqrt(np.diagonal(Cramer_Rao))
        z_scores = self.model.coef_[0] / sigma_estimates  # one z-score per coefficient
        p_values = [stat.norm.sf(abs(x)) * 2 for x in z_scores]  # two-tailed test

        self.z_scores = z_scores
        self.p_values = p_values
        self.sigma_estimates = sigma_estimates
        self.F_ij = F_ij

    def get_summary(self):
        """Return a frame of feature names, z-scores and p-values, and draw a p-value bar plot.

        Must be called after ``fit``.
        """
        summary = pd.DataFrame()
        summary["features"] = self.X.columns
        summary["z_score"] = self.z_scores
        summary["p_value"] = self.p_values
        # x / y are passed by keyword: seaborn 0.12+ no longer accepts them positionally.
        sns.barplot(x="features", y="p_value", data=summary)
        return summary

    def get_predicate(self, test):
        """Remember ``test`` on the instance and return the model's predictions for it."""
        self.test = test
        return self.model.predict(test)

    def get_accuracy(self, test_data, test_targ, k_fold):
        """Return the mean cross-validated accuracy on the given data."""
        return cross_val_score(self.model, test_data, test_targ, cv=k_fold, scoring='accuracy').mean()

    def get_loss(self, valid_data, valid_targ, k_fold):
        """Return the mean cross-validated log loss (sign flipped from 'neg_log_loss')."""
        return -(cross_val_score(self.model, valid_data, valid_targ, cv=k_fold, scoring='neg_log_loss').mean())

    def get_eff_model(self, test_data, test_targ, kfold):
        """Return a one-row frame with the mean cross-validated score for each scorer.

        Columns are ``model`` followed by one column per scorer name.
        """
        scoring = ["accuracy", "roc_auc", "neg_log_loss", "r2",
                   "neg_mean_squared_error", "neg_mean_absolute_error"]

        metrics = pd.DataFrame()
        metrics["model"] = ["Logistic regression"]
        for scor in scoring:
            # Uses the cross_val_score imported at the top of the notebook, so this
            # method does not depend on a `model_selection` import from a later cell.
            result = cross_val_score(estimator=self.model, X=test_data, y=test_targ, cv=kfold, scoring=scor)
            metrics[scor] = pd.Series([result.mean()])

        return metrics




#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
import scipy.stats as stat
import numpy as np
from sklearn.metrics import log_loss

class Decision_Tree:
    """Thin wrapper around DecisionTreeClassifier with cross-validation and importance helpers."""

    def __init__(self):
        self.model = DecisionTreeClassifier()

    def fit(self, X, y):
        """Fit the tree and keep ``X`` / ``y`` on the instance (``get_feature_impo`` reads ``X.columns``)."""
        self.X = X
        self.y = y
        self.model.fit(X, y)

    def get_predicate(self, test):
        """Remember ``test`` on the instance and return the model's predictions for it."""
        self.test = test
        return self.model.predict(test)

    def get_accuracy(self, pred, test_data, test_targ, k_fold):
        """Return the mean cross-validated accuracy on the given data.

        ``pred`` is accepted for call compatibility but is not used.
        """
        return cross_val_score(self.model, test_data, test_targ, cv=k_fold, scoring='accuracy').mean()

    def get_loss(self, valid_data, valid_targ, k_fold):
        """Return the mean cross-validated log loss (sign flipped from 'neg_log_loss')."""
        return -(cross_val_score(self.model, valid_data, valid_targ, cv=k_fold, scoring='neg_log_loss').mean())

    def get_eff_model(self, test_data, test_targ, kfold):
        """Return a one-row frame with the mean cross-validated score for each scorer.

        Columns are ``model`` followed by one column per scorer name.
        """
        scoring = ["accuracy", "roc_auc", "neg_log_loss", "r2",
                   "neg_mean_squared_error", "neg_mean_absolute_error"]

        metrics = pd.DataFrame()
        metrics["model"] = ["Decission Tree"]
        for scor in scoring:
            # Uses the cross_val_score imported at the top of the notebook, so this
            # method does not depend on a `model_selection` import from a later cell.
            result = cross_val_score(estimator=self.model, X=test_data, y=test_targ, cv=kfold, scoring=scor)
            metrics[scor] = pd.Series([result.mean()])

        return metrics

    def get_feature_impo(self):
        """Plot the fitted model's feature importances as a bar chart and return them as a Series.

        Must be called after ``fit``; the Series is indexed by ``self.X.columns``.
        """
        feat_importance = pd.Series(self.model.feature_importances_, index=self.X.columns)
        feat_importance.plot(kind='bar')
        plt.show()
        return feat_importance



#### XGBoost

In [None]:
import scipy.stats as stat
import numpy as np
from sklearn.metrics import log_loss

class XG_Boost:
    """Thin wrapper around xgboost's XGBClassifier with cross-validation and importance helpers."""

    def __init__(self):
        # Imported locally because the module-level `import xgboost as xgb` is
        # commented out in the imports cell; a missing package then fails here
        # with a clear ImportError instead of a NameError.
        import xgboost as xgb

        # The previous unused DMatrix built from notebook globals was dropped:
        # it made construction depend on variables defined in other cells.
        self.model = xgb.XGBClassifier()

    def fit(self, X, y):
        """Fit the classifier and keep ``X`` / ``y`` on the instance (``get_feature_impo`` reads ``X.columns``)."""
        self.X = X
        self.y = y
        self.model.fit(X, y)

    def get_predicate(self, test):
        """Remember ``test`` on the instance and return the model's predictions for it."""
        self.test = test
        return self.model.predict(test)

    def get_accuracy(self, pred, test_data, test_targ, k_fold):
        """Return the mean cross-validated accuracy on the given data.

        ``pred`` is accepted for call compatibility but is not used.
        """
        return cross_val_score(self.model, test_data, test_targ, cv=k_fold, scoring='accuracy').mean()

    def get_loss(self, valid_data, valid_targ, k_fold):
        """Return the mean cross-validated log loss (sign flipped from 'neg_log_loss')."""
        return -(cross_val_score(self.model, valid_data, valid_targ, cv=k_fold, scoring='neg_log_loss').mean())

    def get_eff_model(self, test_data, test_targ, kfold):
        """Return a one-row frame with the mean cross-validated score for each scorer.

        Columns are ``model`` followed by one column per scorer name.
        """
        scoring = ["accuracy", "roc_auc", "neg_log_loss", "r2",
                   "neg_mean_squared_error", "neg_mean_absolute_error"]

        metrics = pd.DataFrame()
        metrics["model"] = ["XG_Boost"]
        for scor in scoring:
            # Uses the cross_val_score imported at the top of the notebook, so this
            # method does not depend on a `model_selection` import from a later cell.
            result = cross_val_score(estimator=self.model, X=test_data, y=test_targ, cv=kfold, scoring=scor)
            metrics[scor] = pd.Series([result.mean()])

        return metrics

    def get_feature_impo(self):
        """Plot the fitted model's feature importances as a bar chart and return them as a Series.

        Must be called after ``fit``; the Series is indexed by ``self.X.columns``.
        """
        feat_importance = pd.Series(self.model.feature_importances_, index=self.X.columns)
        feat_importance.plot(kind='bar')
        plt.show()
        return feat_importance

### Analysis

In [None]:
# Build one wrapper per algorithm.
log_model = LogisticReg()
decision_tree = Decision_Tree()
xg_boost = XG_Boost()

# Fit each wrapper, in the same order, on the same training split.
for wrapper in (log_model, decision_tree, xg_boost):
    wrapper.fit(train_invar, train_targ)


#### importance of features

In [None]:
# Per-feature z-scores and p-values from the logistic regression (also draws a p-value bar plot).
log_model.get_summary()


In [None]:
# Feature importances of the decision tree (bar plot plus the returned Series).
decision_tree.get_feature_impo()

In [None]:
# Feature importances of the XGBoost model (bar plot plus the returned Series).
xg_boost.get_feature_impo()

#### Compare the efficiency of the models

In [None]:
from sklearn import model_selection

def eff_models(model, train_data, train_targ, kfold):
    """Tabulate mean cross-validated scores for several estimators.

    Args:
        model: sequence of estimators to evaluate; stored as-is in the ``model`` column.
        train_data: feature data handed to ``cross_val_score`` as ``X``.
        train_targ: target data handed to ``cross_val_score`` as ``y``.
        kfold: cross-validation strategy handed to ``cross_val_score`` as ``cv``.

    Returns:
        DataFrame with one row per estimator: a ``model`` column followed by one
        column per scorer holding the mean score over the folds.
    """
    table = pd.DataFrame()
    table["model"] = model

    for metric in ("accuracy", "roc_auc", "neg_log_loss", "r2"):
        fold_means = [
            model_selection.cross_val_score(
                estimator=est, X=train_data, y=train_targ, cv=kfold, scoring=metric
            ).mean()
            for est in model
        ]
        table[metric] = pd.Series(fold_means)

    return table

In [None]:

# Cross-validated comparison of the three fitted estimators on the training split.
# The comparison function defined in the cell above is `eff_models`; `error_metrics`
# is not defined in this notebook.
eff_models(model=[log_model.model, decision_tree.model, xg_boost.model], train_data=train_invar, train_targ=train_targ, kfold=k_fold)

#### validate the **data**

In [None]:
# Predict on the validation split and compare with the validation labels.
pred = log_model.get_predicate(valid_invar)

pre = pd.DataFrame()
pre["predicate"] = pred
# The predictions are for valid_invar, so they must be paired with valid_targ
# (not the training targets). np.asarray drops any index on the targets so the
# rows pair up by position rather than by index label.
pre["actual"] = np.asarray(valid_targ)

corr_mat = pre.corr()
plt.figure(figsize=(10, 10))
# plot heat map
sns.heatmap(corr_mat, annot=True)

log_model.get_eff_model(valid_invar, valid_targ, k_fold)


In [None]:
# Predict on the validation split and compare with the validation labels.
pred = decision_tree.get_predicate(valid_invar)

pre = pd.DataFrame()
pre["predicate"] = pred
# The predictions are for valid_invar, so they must be paired with valid_targ
# (not the training targets). np.asarray drops any index on the targets so the
# rows pair up by position rather than by index label.
pre["actual"] = np.asarray(valid_targ)

corr_mat = pre.corr()
plt.figure(figsize=(10, 10))
# plot heat map
sns.heatmap(corr_mat, annot=True)

decision_tree.get_eff_model(valid_invar, valid_targ, k_fold)

In [None]:
# Predict on the validation split and compare with the validation labels.
pred = xg_boost.get_predicate(valid_invar)

pre = pd.DataFrame()
pre["predicate"] = pred
# The predictions are for valid_invar, so they must be paired with valid_targ
# (not the training targets). np.asarray drops any index on the targets so the
# rows pair up by position rather than by index label.
pre["actual"] = np.asarray(valid_targ)

corr_mat = pre.corr()
plt.figure(figsize=(10, 10))
# plot heat map
sns.heatmap(corr_mat, annot=True)

xg_boost.get_eff_model(valid_invar, valid_targ, k_fold)

### Test the data

In [None]:
from sklearn import metrics
# NOTE(review): this section is titled "Test the data", but the prediction below is
# made on valid_invar — confirm whether the held-out test split was intended.
pred = log_model.get_predicate( valid_invar)

