In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score


import os
import pickle

## Module 1

In [3]:
class operationOS:
    
    def __init__(self):
        pass
    
    def currentDirec(self):
        self.path = os.getcwd()
        return self.path
    
    def newFolder(self,dirName):
        '''
        Return path for created new folder
        
        '''
        self.dirName = dirName
        self.path = operationOS().currentDirec()
        self.new_path = os.path.join(self.path,self.dirName)
        if not os.path.isdir(self.new_path):
            os.makedirs(self.new_path)
            
        return self.new_path
            
    

## Module 2

In [4]:
class preprocessing:
    
    def __init__(self):
        pass
    
    def removeColumn(self,data,columnName):
        """ 
        This mehod remove column mentioned from dataset name mentioned
        column : string indicating column name
        data : varibale name of dataframe
        
        """
        self.data = data
        self.column = columnName
        self.new_data = self.data.drop(self.column , axis = 1)
        
        return self.new_data
    
    def labelDefine(self,data,columnName,new_columnName):
        """
        This is method define label to affair dataset
        column : last columns from label should define
        data : dataset into which label should define
        """
        self.data = data
        self.column = columnName
        self.new_column = new_columnName
        self.data[self.new_column] = np.where(self.data[self.column]>0 ,1,0)
        
        return self.data
    
    def ReportMissingValue(self,FolderName,Filename,data):
        """
        This method make txt file in which information regrarding missing value 
        for each columns mentioned at given folder name and filename
        
        """
        self.FolderName = FolderName
        self.Filename   = Filename
        self.data = data
        
        self.objectFolder = operationOS().newFolder(self.FolderName)
        
        file = os.path.join(self.objectFolder,self.Filename)
        
        for k,i in enumerate(self.data.isnull().sum()):
            with open(file , "a") as f:
                f.write(self.data.isnull().sum().index[k] + "..........." + str(i) + "\n")
        
        
    def ReplaceMissing(self,data):
        """
        For now replacing missing values using median if any present
        
        """
        self.data = data
        for column in self.data.columns:
            self.data[column] = np.where(self.data[column] == np.nan , self.data[column].median() , self.data[column])
            
        return self.data
    
    
        

## Module 3 

In [5]:
class DataSplit:
    
    def __init__(self,model_folder,scaler_file):
        self.model_folder = model_folder
        self.scaler_file = scaler_file
        objectModel = operationOS()
        self.newFolder  = objectModel.newFolder(self.model_folder)
        self.filename = self.scaler_file + ".sav"
        self.modelpath = os.path.join(self.newFolder,self.filename)
        
    def TrainTest(self,data):
        self.data = data
        self.train , self.test = train_test_split(self.data , test_size = 0.30 , random_state = 42)
        return self.train , self.test
    
    def Featurelabel(self,data):
        self.data = data
        self.feature = self.data.iloc[:,0:self.data.shape[1]-1]
        self.label = self.data.iloc[:,-1]
        return self.feature, self.label
    
    def transferFeatureTrain(self,features):
        self.features = features
        scaller = StandardScaler()
        self.x_train = scaller.fit_transform(self.features)
        pickle.dump(scaller,open(self.modelpath ,"wb"))
        return self.x_train
    
    def transferFeatureTest(self,features):
        self.features = features
        scaller = pickle.load(open(self.modelpath,"rb"))
        self.x_test = scaller.transform(self.features)
        return self.x_test
         
        
        

## Module 4

In [6]:
class Training(DataSplit):
    
    def __init__(self,x_train,y_train,x_test,y_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
    
    def modelTraining(self):
        
        logi1  = LogisticRegression(verbose=1 ,n_jobs=4, penalty = 'l2',solver='lbfgs' )
        logi2  = LogisticRegression(verbose=1 ,n_jobs=4 ,penalty = 'l1',solver='liblinear' ) 
        logi3  = LogisticRegression(verbose=1 ,n_jobs=4 ,penalty = 'l2',solver='liblinear' )
        logi4  = LogisticRegression(verbose=1 ,n_jobs=4 ,solver='newton-cg' )
        logi5  = LogisticRegression(verbose=1 ,n_jobs=4 ,solver='liblinear' )
        logi6  = LogisticRegression(verbose=1 ,n_jobs=4 ,penalty = 'none',solver='sag')
        
        logi1.fit(self.x_train,self.y_train)
        logi2.fit(self.x_train,self.y_train)
        logi3.fit(self.x_train,self.y_train)
        logi4.fit(self.x_train,self.y_train)
        logi5.fit(self.x_train,self.y_train)
        logi6.fit(self.x_train,self.y_train)
        
        y_predi_logi1 = logi1.predict(self.x_test)
        y_predi_logi2 = logi2.predict(self.x_test)
        y_predi_logi3 = logi3.predict(self.x_test)
        y_predi_logi4 = logi4.predict(self.x_test)
        y_predi_logi5 = logi5.predict(self.x_test)
        y_predi_logi6 = logi6.predict(self.x_test)
        
        auc_logi1 = roc_auc_score(y_test, y_predi_logi1)
        auc_logi2 = roc_auc_score(y_test, y_predi_logi2)
        auc_logi3 = roc_auc_score(y_test, y_predi_logi3)
        auc_logi4 = roc_auc_score(y_test, y_predi_logi4)
        auc_logi5 = roc_auc_score(y_test, y_predi_logi5)
        auc_logi6 = roc_auc_score(y_test, y_predi_logi6)
        
        list_model_auc = [auc_logi1,auc_logi2,auc_logi3,auc_logi4,auc_logi5,auc_logi6]
        list_model = [logi1,logi2,logi3,logi4,logi5,logi6]
        
        finalized_model = list_model[list_model_auc.index(max(list_model_auc))]
        
        super().__init__("Models","Logistic")
        
        pickle.dump(finalized_model,open(self.modelpath ,"wb"))
        
        return "AUC SCORE = {}".format(max(list_model_auc)) + "......" + "Accuracy = {}".format(finalized_model.score(self.x_test, self.y_test))
        
        

## Module 5

In [7]:
class Prediction(DataSplit):
    
    def __init__(self,x_train,y_train,x_test,y_test):
        self.x_train = x_train
        self.y_train = y_train
        self.x_test = x_test
        self.y_test = y_test
        
    def RandomData(self,featuresList):
        self.featuresList = featuresList
        predictObject = DataSplit("Models","scaler")
        featureVector = predictObject.transferFeatureTest(self.featuresList)
        
        super().__init__("Models","Logistic")
        
        modelPredict = pickle.load(open(self.modelpath ,"rb"))
        output = modelPredict.predict(featureVector)
        
        if output == 0:
            return "You are safe! , No affairs"
        else:
            return "Ohh jesus!! Blessed me"

## app.py

#### Data preprocessing

In [8]:
dta = sm.datasets.fair.load_pandas().data

In [9]:
data_1 = preprocessing().labelDefine(dta,"affairs","affair")

In [10]:
data_2 = preprocessing().removeColumn(dta,"affairs")

In [11]:
preprocessing().ReportMissingValue("Report","MissingInfo",data_2)

In [12]:
data_3 = preprocessing().ReplaceMissing(data_2)

#### Preparing Feature abd label vectro for training and testing

In [13]:
train , test = DataSplit("Models","scaler").TrainTest(data_3)

In [14]:
X_train , y_train= DataSplit("Models","scaler").Featurelabel(train)
X_test , y_test = DataSplit("Models","scaler").Featurelabel(test)

In [15]:
x_train = DataSplit("Models","scaler").transferFeatureTrain(X_train)
x_test = DataSplit("Models","scaler").transferFeatureTest(X_test)

#### Training model as per following

In [16]:
trainingModel = Training(x_train,y_train,x_test,y_test)
trainingModel.modelTraining()

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


[LibLinear][LibLinear][LibLinear]convergence after 33 epochs took 0 seconds


[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 out of   1 | elapsed:    0.0s finished


'AUC SCORE = 0.6336699089084454......Accuracy = 0.7214659685863875'

#### Prediction of random featurevector as per given order of feture inside main data

In [17]:
predictModel = Prediction(x_train,y_train,x_test,y_test)
predictModel.RandomData(dta.iloc[3:4,:dta.shape[1]-2].values)

'You are safe! , No affairs'