# **Auto Learn**

I started this project to show beginner data scientists a simple way to profile a dataset, visualise it, and train baseline machine learning models automatically, without spending large amounts of time on boilerplate code.


### Importing the necessary libraries

In [1]:
#Importing all the needed Libraries
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# This notebook was developed and run with plotly 4.14.3.
# Install that version if the figures do not render as expected.
import plotly
import plotly.graph_objects as go
print(plotly.__version__)
from pylab import rcParams
rcParams['figure.figsize'] = 12, 6
plt.rcParams['xtick.labelsize']= 10
plt.rcParams['ytick.labelsize']= 10
import seaborn as sns
%matplotlib inline

4.14.3


In [2]:
class Data_Quality_Report(object):
    """Build a per-feature data-quality summary for a pandas DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to profile. None of the methods below modify it.
    threshold : int
        A column with at most this many distinct values is treated as
        categorical, whatever its dtype.
    """

    # Statistics emitted by ``DataFrame.describe(include='all')``. Re-indexing
    # on this list keeps the report layout stable even when the frame holds
    # only numeric or only categorical columns (describe() then omits rows).
    _DESCRIBE_STATS = ['count', 'unique', 'top', 'freq', 'mean', 'std',
                       'min', '25%', '50%', '75%', 'max']

    _STAT_LABELS = {'count': 'Records', 'mean': 'Mean', 'std': 'Std Dev',
                    'min': 'Minimum', '25%': '25th Quartile', '50%': 'Median',
                    '75%': '75th Quartile', 'max': 'Maximum'}

    _OUTLIER_COLUMNS = ['Feature name', 'Unique', 'Outliers', 'Common_value', 'Frequency']

    _REPORT_COLUMNS = ['Feature name', 'Variable type', 'Records', 'Unique', 'Missing',
                       'Missing Percentage', 'Common_value', 'Frequency', 'Outliers',
                       'Mean', 'Std Dev', 'Minimum', '25th Quartile', 'Median',
                       '75th Quartile', 'Maximum']

    def __init__(self, data, threshold):
        self.threshold = threshold
        self.data = data

    def Categorical_Detector(self):
        """Return a copy of the data with categorical-looking columns cast to ``category``.

        A column is cast when it has at most ``self.threshold`` distinct values
        or when its dtype is ``object``.
        """
        df = self.data.copy()
        for name in df.columns.to_list():
            if df[name].nunique() <= self.threshold or df[name].dtype == 'object':
                df[name] = df[name].astype('category')
        return df

    def OUTLIER_DETECTOR(self):
        """Summarise every ``float64``/``int64`` column of the original data.

        Returns
        -------
        pandas.DataFrame
            One row per numeric column with the columns ``Feature name``,
            ``Unique`` (distinct values), ``Outliers`` (values outside
            ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``), ``Common_value`` (first mode)
            and ``Frequency`` (occurrences of that mode). The frame is empty,
            but still carries these columns, when there is no numeric column.
        """
        numeric = self.data.select_dtypes(include=['float64', 'int64'])
        records = []
        for name in numeric.columns.to_list():
            column = numeric[name]
            modes = column.mode()
            if modes.empty:
                # An all-null column has no mode.
                common_value, frequency = np.nan, 0
            else:
                # Keep the mode as-is: casting it to int would report e.g. 0.7 as 0.
                common_value = modes.iloc[0]
                frequency = int((column == common_value).sum())
            q1 = column.quantile(0.25)
            q3 = column.quantile(0.75)
            iqr = q3 - q1
            is_outlier = (column < (q1 - 1.5 * iqr)) | (column > (q3 + 1.5 * iqr))
            records.append({'Feature name': name,
                            'Unique': column.nunique(),
                            'Outliers': int(is_outlier.sum()),
                            'Common_value': common_value,
                            'Frequency': frequency})
        # Build the frame once instead of appending row by row
        # (DataFrame.append no longer exists in pandas 2).
        return pd.DataFrame(records, columns=self._OUTLIER_COLUMNS)

    def QUALITY_DATA_ASSESSMENT(self):
        """Return the full quality report, one row per feature.

        Categorical features (as decided by ``Categorical_Detector``) take
        ``Unique``/``Common_value``/``Frequency`` from ``describe()``; the other
        features take them, together with ``Outliers``, from
        ``OUTLIER_DETECTOR``. Rows are sorted by ``Unique`` in descending order
        and every statistic that is not available for a feature is shown as
        ``'*'``. A one-line note explaining the ``'*'`` marker is printed.
        """
        data = self.Categorical_Detector()
        n_rows = len(data)

        described = data.describe(include='all').T.reindex(columns=self._DESCRIBE_STATS)
        report = described.rename_axis('Feature name').reset_index()
        report = report.rename(columns=self._STAT_LABELS)

        report['Variable type'] = report['Feature name'].apply(lambda x: data[x].dtype)
        report['Missing'] = report['Feature name'].apply(lambda x: data[x].isnull().sum())
        report['Missing Percentage'] = (report['Missing'] / n_rows * 100).round(2)

        numeric_stats = self.OUTLIER_DETECTOR()
        if numeric_stats.empty:
            for column in self._OUTLIER_COLUMNS[1:]:
                report[column] = np.nan
        else:
            report = report.merge(numeric_stats, how='left', on='Feature name')

        # isinstance() rather than ``== 'category'``: comparing a NumPy dtype
        # with an arbitrary string is not reliable across NumPy versions.
        is_categorical = report['Variable type'].apply(
            lambda dtype: isinstance(dtype, pd.CategoricalDtype)).astype(bool)

        report['Unique'] = report['unique'].where(is_categorical, report['Unique'])
        report['Common_value'] = report['top'].where(is_categorical, report['Common_value'])
        report['Frequency'] = report['freq'].where(is_categorical, report['Frequency'])
        # Outlier counts are not reported for categorical features.
        report['Outliers'] = report['Outliers'].where(~is_categorical)

        final = report[self._REPORT_COLUMNS]
        # Sort before inserting the '*' placeholder, otherwise the column
        # would mix numbers and strings and could not be ordered.
        final = final.sort_values(by=['Unique'], ascending=False, kind='mergesort')
        final = final.reset_index(drop=True)
        final = final.replace(np.nan, '*')

        name = "*"
        print(f" Note that {name} indicate that no calculation record can be found for that field")
        return final

    def Impute_Missing_data(self, path=None):
        """Fill missing values according to an Excel rule sheet.

        The workbook at ``path`` must provide the columns ``Feature name``,
        ``IMPUTE NULLS`` and ``METHOD``; a ``LEGEND`` column is ignored when
        present. For every feature whose ``IMPUTE NULLS`` cell equals ``'YES'``
        the value found in ``METHOD`` is used as the literal fill value.

        Returns
        -------
        pandas.DataFrame
            A new frame restricted to the features listed in the workbook,
            with the requested nulls filled.

        Raises
        ------
        ValueError
            If ``path`` is not given.
        """
        if path is None:
            raise ValueError("path to the imputation workbook is required")

        imputation = pd.read_excel(path)
        imputation = imputation.drop(columns=['LEGEND'], errors='ignore')

        # Work on an explicit copy so the assignment below never targets a
        # view of self.data.
        data = self.data[imputation['Feature name'].to_list()].copy()

        # When a feature is listed more than once, the first rule applies.
        rules = imputation.drop_duplicates(subset='Feature name', keep='first')
        for name, flag, value in zip(rules['Feature name'], rules['IMPUTE NULLS'], rules['METHOD']):
            if flag == 'YES':
                data[name] = data[name].fillna(value)
        return data

In [3]:
class Explorer(object):
    """Exploratory helpers (completeness gauge, missing-data chart, outlier
    treatment, sparse-feature removal, correlation views) for one DataFrame.

    ``remove_outlier`` and ``feature_removal`` modify ``self.data`` in place;
    the other methods only read it.

    The plotting methods rely on the notebook-level imports ``plt``
    (matplotlib.pyplot), ``sns`` (seaborn) and ``go`` (plotly.graph_objects).
    """

    def __init__(self, data):
        self.data = data

    def completeness(self):
        """Return a plotly gauge showing the share of non-null cells (0-100).

        The gauge is red for scores up to 50, orange below 80 and green from
        80 upwards.

        Raises
        ------
        ValueError
            If the DataFrame has no cells, because the score is undefined.
        """
        df = self.data
        if df.size == 0:
            raise ValueError("completeness is undefined for an empty DataFrame")

        score = round(100 - df.isnull().sum().sum() / df.size * 100)
        # Exhaustive chain, so both colours are always defined.
        if score <= 50:
            color, background = 'darkred', 'red'
        elif score < 80:
            color, background = 'orange', 'orangered'
        else:
            color, background = 'rgb(112,130,56)', 'darkseagreen'

        fig = go.Figure(
            go.Indicator(mode="gauge+number",
                         value=score,
                         domain={'x': [0, 1], 'y': [0, 1]},
                         title={'text': "Data Completeness Score"},
                         gauge={
                             'bar': {'color': color},
                             'axis': {
                                 'range': [None, 100],
                                 'tickwidth': 1,
                                 'tickcolor': "darkblue"
                             },
                             'steps': [{
                                 'range': [0, 100]
                             }, {
                                 'range': [0, 100],
                                 'color': background
                             }]
                         }))
        return fig

    def Missing_Data(self):
        """Draw a bar chart of the percentage of missing values per feature.

        Bars are sorted in ascending order and every non-zero bar is labelled
        with its value. Returns whatever ``plt.show()`` returns.
        """
        df = self.data
        missing = df.isnull().sum() / df.shape[0] * 100
        fig, ax = plt.subplots(figsize=(15, 7))
        missing.sort_values().plot(kind='bar', ax=ax)
        ax.set_title("Percentage of Missing Data")
        ax.set_xlabel('Feature')
        ax.set_ylabel("% of Missing Data")
        for bar in ax.patches:
            height = bar.get_height()
            if height == 0:
                continue
            ax.annotate(format(height, '.1f'),
                        (bar.get_x() + bar.get_width() / 2., height),
                        ha='center',
                        va='center',
                        color='white',
                        xytext=(0, -12),
                        rotation=90,
                        textcoords='offset points')
        return plt.show()

    def outliers_analysis(self):
        """Return the inter-quartile range of every numeric column as a Series.

        Non-numeric columns are skipped; asking pandas for the quantile of a
        text column raises on recent versions.
        """
        numeric = self.data.select_dtypes(include='number')
        if numeric.shape[1] == 0:
            return pd.Series(dtype='float64')
        return numeric.quantile(0.75) - numeric.quantile(0.25)

    def remove_outlier(self, lower_quantile=0.10, upper_quantile=0.90):
        """Cap every numeric column at the given quantiles, in place.

        Values below the lower quantile are raised to it and values above the
        upper quantile are lowered to it (no rows are removed).

        Parameters
        ----------
        lower_quantile, upper_quantile : float
            Quantiles used as the caps; the defaults are 0.10 and 0.90.

        Returns
        -------
        pandas.DataFrame
            Columns ``Feature``, ``skew_before`` and ``skew_after``, one row
            per treated column.
        """
        df = self.data
        rows = []
        for col in self.outliers_analysis().index:
            skew_before = df[col].skew()
            lower = df[col].quantile(lower_quantile)
            upper = df[col].quantile(upper_quantile)
            df[col] = df[col].clip(lower=lower, upper=upper)
            rows.append({"Feature": col,
                         "skew_before": skew_before,
                         "skew_after": df[col].skew()})
        return pd.DataFrame(rows, columns=["Feature", "skew_before", "skew_after"])

    def feature_removal(self, max_missing=0.75):
        """Drop, in place, every column whose share of nulls exceeds ``max_missing``.

        Returns None; the effect is on ``self.data``.
        """
        df = self.data
        missing_share = df.isnull().mean()
        # Decide first, drop once: avoids mutating the frame while looping over it.
        sparse_columns = missing_share[missing_share > max_missing].index.to_list()
        df.drop(columns=sparse_columns, inplace=True)

    def correlative(self):
        """Show a heatmap of the Spearman correlation between numeric columns."""
        numeric = self.data.select_dtypes(include=['number', 'bool'])
        corr = numeric.corr(method='spearman')
        fig, ax = plt.subplots(figsize=(15, 7))
        sns.heatmap(corr, cmap='Greens', ax=ax)
        return plt.show()

    def correlative_features(self, threshold=60):
        """Return the Spearman correlation matrix restricted to correlated features.

        A feature is kept when its absolute correlation with at least one
        other feature, expressed in percent, exceeds ``threshold``. Both
        members of such a pair are kept, in their original column order.

        Parameters
        ----------
        threshold : float
            Cut-off in percent (default 60).
        """
        numeric = self.data.select_dtypes(include=['number', 'bool'])
        matrix = numeric.corr(method='spearman')
        flagged = set()
        for i in range(matrix.shape[1]):
            for j in range(i):
                if abs(matrix.iloc[i, j]) * 100 > threshold:
                    flagged.update((i, j))
        positions = sorted(flagged)
        # pandas computes each coefficient pairwise, so slicing the full matrix
        # gives the same numbers as recomputing it on the selected columns.
        return matrix.iloc[positions, positions]

### Data Feature Detection

In [4]:
def autoplot(df, feature, threshold=8):
    """Plot one feature with a chart type chosen from its cardinality and dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds the feature.
    feature : str
        Name of the column to plot.
    threshold : int
        Columns with at most this many distinct values get an annotated count
        plot. Float and integer columns above the threshold get a density
        histogram with a KDE curve. Any other column leaves the axes empty.
    """
    column = df[feature]
    fig, ax = plt.subplots(figsize=(14, 7))
    if column.nunique() <= threshold:
        sns.countplot(x=feature, data=df, ax=ax)
        for bar in ax.patches:
            height = bar.get_height()
            ax.annotate(round(height),
                        (bar.get_x() + bar.get_width() / 2, height),
                        ha='center', va='center', size=15,
                        xytext=(0, 8), textcoords='offset points')
    elif (pd.api.types.is_float_dtype(column.dtype)
          or pd.api.types.is_integer_dtype(column.dtype)):
        # histplot(kde=True, stat='density') is the maintained replacement for
        # the deprecated distplot and draws the same histogram + KDE overlay.
        sns.histplot(column, kde=True, stat='density', ax=ax)
        ax.set_title(f'Distribution for {feature}')
        ax.set_xlabel('Bins')
    
        

In [11]:
class AutoClassifier:
    """Fit every available classifier on one dataset and rank them by test score.

    Parameters
    ----------
    data : pandas.DataFrame
        Features and target in a single frame.
    Target : str
        Name of the target column in ``data``.
    stratified : bool, optional (default=True)
        Stratify the train/test split on the target.
    test_split : float, optional (default=0.3)
        Fraction of rows held out for testing.
    ignore_warnings : bool, optional (default=True)
        When False, the reason a model failed to fit or score is printed;
        when True, failing models are skipped silently.
    verbose : int, optional (default=0)
        Any positive value prints the scores of each model as soon as they
        are available.

    Examples
    --------
    >>> AutoClassifier(data=frame, Target='label', test_split=0.2).fit()
    """
    # TODO: add a balanced/sampling option for imbalanced or very large data.
    # TODO: report the top-N models with cross-validated scores.

    # Meta-estimators need a base estimator and GaussianProcessClassifier is
    # very slow, so they are left out. Filtering by name means an estimator
    # missing from the installed scikit-learn version is not an error.
    _EXCLUDED = frozenset({
        'GaussianProcessClassifier', 'ClassifierChain', 'MultiOutputClassifier',
        'OneVsOneClassifier', 'OneVsRestClassifier', 'OutputCodeClassifier',
        'StackingClassifier', 'VotingClassifier',
    })
    # Models that get absolute-valued numeric inputs at fit time.
    _ABS_INPUT_MODELS = frozenset({'CategoricalNB', 'ComplementNB', 'MultinomialNB'})
    _MAX_ROWS = 100000
    _SEED = 101

    def __init__(self, data, Target: str, stratified=True, test_split=0.3, ignore_warnings=True, verbose=0):
        self.data = data
        self.Target = Target
        self.stratified = stratified
        self.test_split = test_split
        self.ignore_warnings = ignore_warnings
        self.verbose = verbose

    @staticmethod
    def _optional_classifier(module_name, class_name):
        """Return ``module_name.class_name``, or None when the package cannot be loaded."""
        import importlib
        try:
            module = importlib.import_module(module_name)
        except (ImportError, OSError) as error:
            print(f"{module_name} is unavailable ({error}); skipping {class_name}")
            return None
        return getattr(module, class_name)

    def classifer_load(self):
        """Return the ``(name, class)`` pairs of all classifiers to evaluate.

        The list holds every scikit-learn classifier except those in
        ``_EXCLUDED``, followed by ``LGBMClassifier`` and ``XGBClassifier``
        when lightgbm / xgboost are installed.

        Raises
        ------
        ImportError
            If scikit-learn is not installed.
        """
        from sklearn.utils import all_estimators
        from sklearn.base import ClassifierMixin

        classifiers = [
            (name, estimator)
            for name, estimator in all_estimators()
            if issubclass(estimator, ClassifierMixin) and name not in self._EXCLUDED
        ]
        for module_name, class_name in (('lightgbm', 'LGBMClassifier'),
                                        ('xgboost', 'XGBClassifier')):
            estimator = self._optional_classifier(module_name, class_name)
            if estimator is not None:
                classifiers.append((class_name, estimator))
        return classifiers

    def datasplit(self):
        """Return ``(X, y)``; frames above 100,000 rows are down-sampled first.

        The sample is seeded so repeated runs evaluate the same rows.
        """
        df = self.data
        if df.shape[0] > self._MAX_ROWS:
            df = df.sample(n=self._MAX_ROWS, random_state=self._SEED)
        X = df.drop(self.Target, axis=1)
        y = df[self.Target]
        return X, y

    def train_split(self):
        """Return ``X_train, X_test, y_train, y_test`` from a seeded split."""
        from sklearn.model_selection import train_test_split

        X, y = self.datasplit()
        return train_test_split(X, y,
                                test_size=self.test_split,
                                stratify=y if self.stratified else None,
                                random_state=self._SEED)

    def numeric_transformer(self):
        """Return the numeric pipeline: median imputation, then standard scaling."""
        from sklearn.preprocessing import StandardScaler
        from sklearn.pipeline import Pipeline
        from sklearn.impute import SimpleImputer

        return Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                               ('scaler', StandardScaler())])

    def categorical_transformer(self):
        """Return the categorical pipeline: most-frequent imputation, then dense one-hot encoding."""
        from sklearn.preprocessing import OneHotEncoder
        from sklearn.pipeline import Pipeline
        from sklearn.impute import SimpleImputer

        try:
            # scikit-learn >= 1.2 spells the option ``sparse_output``.
            encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
        except TypeError:
            # Older releases only know ``sparse``.
            encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
        return Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent')),
                               ('encoding', encoder)])

    def _training_input(self, name, X_train, numeric_features):
        """Return the frame used to fit model ``name``.

        For the models in ``_ABS_INPUT_MODELS`` the numeric columns are replaced
        by their absolute values; text columns are left untouched so the
        operation cannot fail on them.
        """
        # NOTE(review): the numeric pipeline standardises after this step, so
        # negative inputs can still reach these models - confirm the intent.
        if name not in self._ABS_INPUT_MODELS or len(numeric_features) == 0:
            return X_train
        X_fit = X_train.copy()
        X_fit[numeric_features] = X_fit[numeric_features].abs()
        return X_fit

    def _score(self, name, y_true, y_pred, fbeta_average):
        """Compute the metrics of one model and return them as a dict (one report row)."""
        from sklearn.metrics import (accuracy_score, balanced_accuracy_score,
                                     fbeta_score, f1_score, roc_auc_score)

        scores = {
            "Model": name,
            "Accuracy": accuracy_score(y_true, y_pred, normalize=True),
            "Balanced Accuracy": balanced_accuracy_score(y_true, y_pred),
            "F1 Score": f1_score(y_true, y_pred, average='weighted'),
            "F Beta": fbeta_score(y_true, y_pred, beta=0.5, average=fbeta_average),
        }
        try:
            roc_auc = roc_auc_score(y_true, y_pred)
            gini = (2 * roc_auc - 1) * 100
        except Exception as exception:
            # ROC AUC on hard labels is not defined for every target; the
            # model still gets a row, with the two values left empty.
            roc_auc = np.nan
            gini = np.nan
            if not self.ignore_warnings:
                print("ROC AUC couldn't be calculated for " + name)
                print(exception)
        scores["ROC AUC"] = roc_auc
        scores["GINI Coefficient"] = gini
        return scores

    def fit(self):
        """Train and score every classifier; return the ranked score table.

        Each model is wrapped in a pipeline that imputes and scales numeric
        columns and imputes and one-hot encodes ``object`` columns. A model
        that fails to fit, predict or score is skipped.

        Returns
        -------
        pandas.DataFrame
            Indexed by model name, sorted by ``Accuracy`` then
            ``GINI Coefficient`` in descending order, with the columns
            ``Accuracy``, ``Balanced Accuracy``, ``F1 Score``, ``F Beta``,
            ``ROC AUC`` and ``GINI Coefficient``.

        Raises
        ------
        ImportError
            If scikit-learn is not installed.
        """
        import warnings
        from sklearn.compose import ColumnTransformer
        from sklearn.pipeline import Pipeline
        try:
            from tqdm import tqdm
        except ImportError:
            # The progress bar is cosmetic; run without it.
            def tqdm(iterable):
                return iterable

        X_train, X_test, y_train, y_test = self.train_split()

        numeric_features = X_train.select_dtypes(include=[np.number]).columns
        categorical_features = X_train.select_dtypes(include=['object']).columns
        preprocessor = ColumnTransformer(
            transformers=[
                ('numeric', self.numeric_transformer(), numeric_features),
                ('categorical', self.categorical_transformer(), categorical_features)
            ])

        # Binary averaging needs a two-class target containing the label 1.
        # Anything else (multiclass, string labels) would make every model
        # fail at scoring time, so weighted averaging is used instead.
        labels = set(pd.unique(y_train))
        fbeta_average = 'binary' if len(labels) <= 2 and 1 in labels else 'weighted'

        results = []
        # Silence library warnings only for the duration of the run instead of
        # changing the interpreter-wide filters for the rest of the session.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            for name, model in tqdm(self.classifer_load()):
                try:
                    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                                           ('classifier', model())])
                    pipe.fit(self._training_input(name, X_train, numeric_features), y_train)
                    scores = self._score(name, y_test, pipe.predict(X_test), fbeta_average)
                except Exception as exception:
                    if not self.ignore_warnings:
                        print(name + " model failed to execute")
                        print(exception)
                    continue

                results.append(scores)
                if self.verbose > 0:
                    print({"Model": name,
                           "Accuracy": scores["Accuracy"],
                           "Balanced Accuracy": scores["Balanced Accuracy"],
                           "F Beta": scores["F Beta"],
                           "F1 Score": scores["F1 Score"],
                           "ROC AUC": scores["ROC AUC"],
                           "Gini Coefficient": scores["GINI Coefficient"]})

        table = pd.DataFrame(results, columns=["Model", "Accuracy", "Balanced Accuracy",
                                               "F1 Score", "F Beta", "ROC AUC",
                                               "GINI Coefficient"])
        return table.sort_values(
            by=['Accuracy', 'GINI Coefficient'], ascending=False).set_index('Model')

    def cross_validation(self):
        """Placeholder: cross-validated scoring is not implemented yet."""
        return "Coming Sooon....."

In [3]:
# Load the wine sample; the relative path means wine.csv must sit in the
# notebook's working directory.
train=pd.read_csv("wine.csv")

In [4]:
# Preview the first rows to check that the file parsed as expected.
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,bad
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,bad
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,good
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,bad


In [5]:
# Encode the target as integers (bad -> 0, good -> 1). The result is assigned
# back to the column: an in-place replace on `train['quality']` acts on a
# selection and is not guaranteed to write through to `train` on recent pandas.
# The explicit cast keeps the target numeric and makes the cell safe to re-run.
train['quality'] = train['quality'].replace({'bad': 0, 'good': 1}).astype('int64')
train.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [6]:
# (rows, columns) of the wine frame.
train.shape

(1599, 12)

In [12]:
# Hold out 10% of the rows for testing; the split stays stratified on the
# target because `stratified` is left at its default (True).
model= AutoClassifier(data=train,Target='quality', test_split=0.1)

In [13]:
# Train every available classifier and show the score table, best accuracy first.
model.fit()

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:10<00:00,  3.40it/s]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,F1 Score,F Beta,ROC AUC,GINI Coefficient
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ExtraTreesClassifier,0.85625,0.854965,0.856177,0.864055,0.854965,70.993086
XGBClassifier,0.84375,0.840509,0.843225,0.844444,0.840509,68.101823
HistGradientBoostingClassifier,0.81875,0.81631,0.818429,0.825792,0.81631,63.262099
RandomForestClassifier,0.80625,0.805625,0.806334,0.821596,0.805625,61.125079
LGBMClassifier,0.80625,0.804683,0.806151,0.817972,0.804683,60.936518
NuSVC,0.8,0.799811,0.800157,0.817536,0.799811,59.962288
BaggingClassifier,0.78125,0.78237,0.78155,0.804878,0.78237,56.473916
LabelPropagation,0.78125,0.780484,0.781344,0.798122,0.780484,56.096794
LabelSpreading,0.78125,0.780484,0.781344,0.798122,0.780484,56.096794
SVC,0.78125,0.779541,0.781138,0.794931,0.779541,55.908234


In [50]:
# cross_validation() is still a placeholder and only returns a "coming soon" string.
model.cross_validation()

'Coming Sooon.....'

In [14]:
# Load the loan-default data; train.csv must sit in the notebook's working directory.
df = pd.read_csv('train.csv')

In [15]:
# Work on 10,000 rows to keep the model sweep tractable. The sample is seeded
# so that the class balance and scores shown below can be reproduced.
df = df.sample(n=10000, random_state=101)
df.shape

(10000, 41)

In [16]:
# Class balance of the target, as a percentage of the sampled rows.
df.loan_default.value_counts()/len(df) *100

0    78.92
1    21.08
Name: loan_default, dtype: float64

In [17]:
# Hold out 20% of the rows for testing; the split stays stratified by default.
modeller = AutoClassifier(data=df, Target='loan_default',test_split=0.2)

In [18]:
# Slow cell: the recorded run took about 35 minutes for 35 models.
# In the recorded output, balanced accuracy sits at about 0.5 for every model
# while accuracy matches the majority-class share, so read the accuracy column
# together with balanced accuracy and F Beta.
modeller.fit()

100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [35:35<00:00, 61.01s/it]


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,F1 Score,F Beta,ROC AUC,GINI Coefficient
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BernoulliNB,0.789,0.505208,0.701464,0.06383,0.505208,1.041573
CalibratedClassifierCV,0.789,0.5,0.695943,0.0,0.5,0.0
DummyClassifier,0.789,0.5,0.695943,0.0,0.5,0.0
SVC,0.789,0.5,0.695943,0.0,0.5,0.0
LogisticRegressionCV,0.7885,0.499683,0.695696,0.0,0.499683,-0.063371
RandomForestClassifier,0.788,0.501102,0.697317,0.022422,0.501102,0.220448
ExtraTreesClassifier,0.7875,0.504257,0.700681,0.062241,0.504257,0.851459
GradientBoostingClassifier,0.7875,0.499917,0.696139,0.011312,0.499917,-0.016519
AdaBoostClassifier,0.787,0.50828,0.704735,0.104563,0.50828,1.656065
RidgeClassifierCV,0.7855,0.499518,0.696057,0.021459,0.499518,-0.096409
