This project focuses on building a machine learning workflow that runs autonomously on a CSV file and returns the best-performing model.

##### Skills Used: XGBoost, Logistic Regression, KNN, SVM, EDA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

In [2]:
#Reading the datasets.
#Raw strings are used so the backslashes in the Windows paths stay literal
#('\A' and '\T' are invalid escape sequences in a normal string literal).

df1 = pd.read_csv(r'C:\Automation\TelcomCustomer-Churn_1.csv')
df2 = pd.read_csv(r'C:\Automation\TelcomCustomer-Churn_2.csv')
df3 = pd.merge(df1, df2, on="customerID")

#Dropping the last column considering it to be the target variable column in csv file.
#drop() returns a new dataframe, so df3 keeps the target column.

df_original = df3.drop(columns=df3.columns[-1])

In [3]:
#We define a class, Workflow, that bundles the whole pipeline.
#Its main function runs the preprocessing steps on the feature dataframe.
#Each model is a separate function of the class, independent of main, and returns its accuracy score.

class Workflow:
  """Preprocess a tabular dataset and report the most accurate classifier.

  Usage: call initialize() with the full dataframe (target in the last
  column) and the feature-only dataframe, then main() to clean the features
  in place, then best_model() to train every model and print the winner.

  The two dataframes are stored in the module-level globals ``df_all`` and
  ``df`` (kept for backward compatibility with existing notebook cells).
  """

  #Column-name suffixes that mark identifier/index columns, which carry no predictive value.
  #NOTE: this is a plain suffix match, so a name such as "paid" also matches "id".
  ID_SUFFIXES = ("ID", "id", "Id", "iD", "Index", "INDEX", "index")

  #A text column is converted to numeric only when more than this fraction of
  #its values are numeric (missing values count as convertible).
  NUMERIC_THRESHOLD = 0.8

  def initialize(self,df3,df_original):
    """Register the dataframes used by the rest of the workflow.

    df3 contains all the columns, with the target variable as the last one.
    df_original contains all the columns except the target variable.
    """
    global df_all
    df_all = df3
    global df
    df = df_original

  def main(self):
    """Clean the registered feature dataframe in place so it is ready for the models.

    Steps, in order: drop ID/index columns, convert mostly-numeric text
    columns to numbers, fill missing values, standardize numeric columns.
    """
    self._remove_cols(df)
    self._data_types(df)
    self._remove_null(df)
    self._standard(df)

  def _remove_cols(self, df):
    """Drop, in place, every column whose name ends with one of ID_SUFFIXES."""
    id_like = [name for name in df.columns if str(name).endswith(self.ID_SUFFIXES)]
    df.drop(columns=id_like, inplace=True)

  def _data_types(self, df):
    """Convert, in place, text columns that mostly hold numbers to a numeric dtype.

    A column is converted when more than NUMERIC_THRESHOLD of its values are
    either numeric or missing; the remaining values become NaN and are
    handled by the null-filling step. Columns that are already numeric are
    left untouched.
    """
    for col in df.columns:
      series = df[col]
      if not (pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)):
        continue
      #The same parser is used for the check and for the conversion, so they cannot disagree.
      parsed = pd.to_numeric(series, errors='coerce')
      convertible = parsed.notna() | series.isna()
      if convertible.mean() > self.NUMERIC_THRESHOLD:
        df[col] = parsed

  def _remove_null(self, df):
    """Fill missing values in place: mean for numeric columns, most frequent value otherwise."""
    null_counts = df.isnull().sum()
    for col in null_counts[null_counts > 0].index:
      series = df[col]
      if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        fill_value = series.mean()
        if pd.isna(fill_value):
          #Column has no observed values at all, nothing sensible to fill with.
          continue
      else:
        #A median is undefined for text categories, so the mode is used instead.
        modes = series.mode(dropna=True)
        if modes.empty:
          continue
        fill_value = modes.iloc[0]
      #Assigning back (instead of fillna(inplace=True) on the column) also works under copy-on-write.
      df[col] = series.fillna(fill_value)

  def _standard(self, df):
    """Standardize numeric columns in place with the Z-Score method (mean 0, standard deviation 1).

    Columns with zero (or undefined) standard deviation are only centered, to
    avoid dividing by zero and filling the column with NaN.
    NOTE: the statistics are computed over the whole dataframe, i.e. before
    the train/test split performed by the model functions.
    """
    for col in df.select_dtypes(include='number').columns:
      centered = df[col] - df[col].mean()
      spread = df[col].std()
      if pd.isna(spread) or spread == 0:
        df[col] = centered
      else:
        df[col] = centered / spread

  def _split(self, df, scale=False, encode_target=False):
    """One-hot encode df, pair it with the target and return an 80/20 train/test split.

    df            -- feature dataframe; it is not modified.
    scale         -- apply scipy's zscore to every encoded feature column.
    encode_target -- label-encode the target to integers (needed by XGBoost).

    The target is the last column of the registered full dataframe (df_all).
    Returns X_train, X_test, Y_train, Y_test.
    """
    from sklearn.model_selection import train_test_split

    #get_dummies encodes the object/string/category columns and returns a new dataframe;
    #float output keeps the features numeric regardless of the pandas version.
    features = pd.get_dummies(df, dtype=float)
    target = df_all.iloc[:, -1].to_numpy()

    if scale:
      from scipy.stats import zscore
      #A constant column has zero variance and z-scores to NaN; treat it as all zeros.
      features = features.apply(zscore).fillna(0.0)

    if encode_target:
      from sklearn.preprocessing import LabelEncoder
      target = LabelEncoder().fit_transform(target)

    return train_test_split(features, target, test_size=0.20, random_state=1)

  def _fit_and_score(self, model, df, **split_options):
    """Train model on the training split of df and return its accuracy on the test split."""
    from sklearn.metrics import accuracy_score

    X_train, X_test, Y_train, Y_test = self._split(df, **split_options)
    model.fit(X_train, Y_train)
    return accuracy_score(Y_test, model.predict(X_test))

  #Now we will define different models.

  def xg_boost(self,df):
    """Return the test accuracy of an XGBClassifier with default settings.

    The target labels are label-encoded because the classifier works with numeric classes.
    """
    from xgboost import XGBClassifier

    return self._fit_and_score(XGBClassifier(), df, encode_target=True)

  def log_reg(self,df):
    """Return the test accuracy of a LogisticRegression with default settings."""
    from sklearn.linear_model import LogisticRegression

    return self._fit_and_score(LogisticRegression(), df)

  def knn_classifier(self,df):
    """Return the test accuracy of a 5-nearest-neighbours classifier on z-scored features."""
    from sklearn.neighbors import KNeighborsClassifier

    return self._fit_and_score(KNeighborsClassifier(n_neighbors = 5), df, scale=True)

  def svm_classifier(self,df):
    """Return the test accuracy of a support vector classifier (gamma=0.025, C=3)."""
    from sklearn import svm

    return self._fit_and_score(svm.SVC(gamma=0.025,C=3), df)

  def best_model(self):
    """Train every model on the registered feature dataframe and print the best one.

    Each model's accuracy score is collected and compared with the maximum;
    if several models tie for the maximum, one line is printed for each.
    """
    scores = {
      "Logistic Regression": self.log_reg(df),
      "XGBoost": self.xg_boost(df),
      "KNN": self.knn_classifier(df),
      "SVM": self.svm_classifier(df),
    }
    top = max(scores.values())
    for name, score in scores.items():
      if score == top:
        print("The best model for the given data is ",name," with ",round(score*100,2),"% accuracy",sep='')

In [4]:
#Creating an object of the class Workflow and passing the dataframes.
#initialize() registers the merged dataframe (df3) and the feature-only dataframe (df_original).
#main() runs the preprocessing steps on the feature dataframe, modifying it in place.
#best_model() trains every model and prints the one(s) with the highest accuracy score.

ob = Workflow()
ob.initialize(df3,df_original)
ob.main()
ob.best_model()

The best model for the given data is Logistic Regression with 81.19% accuracy
