In [124]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn import preprocessing

from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC


from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score


In [111]:
class pred_analysis:
  """Helper routines for a tabular classification workflow.

  The methods cover inspection (`details`, `check_outliers`), cleaning
  (`preprocess`, `remove_outliers`, `encoder`, `balance`), splitting
  (`data_split`) and model evaluation (`metrics`, `model_execute`).
  The class keeps no state; every method works on the arguments it is given.
  """

  def __init__(self):
    pass

  def _numeric_columns(self,df):
    """Return the labels of the columns whose dtype is numeric."""
    # Selecting on "number" (instead of "not object") keeps bool, datetime,
    # category and dedicated string dtypes out of the quantile arithmetic.
    return df.select_dtypes(include="number").columns

  def details(self,df,target):                         #No return
    """Print shape, duplicate-row count, per-column nulls and class balance.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.
    target : column label
        Column whose value counts are printed under "balance=".
    """
    print("shape=",df.shape)
    print("\n")
    print("duplicates=")
    # Number of rows flagged as repeats of an earlier row.
    print(df.duplicated().sum())
    print("\n")
    print("Nulls=")
    print(df.isnull().sum())
    print("\n")
    print("balance=")
    print(df[target].value_counts())

  def balance(self,df,target):
    """Down-sample every class of `target` to the size of the smallest class.

    Classes larger than the smallest one are sampled without replacement
    (seeded with 42), so no row is duplicated; classes already at the minimum
    size are kept untouched. Rows whose target is missing are not returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    target : column label
        Column holding the class labels.

    Returns
    -------
    pandas.DataFrame
        The per-class pieces concatenated in the order `groupby` yields them.
        The original index labels are preserved.
    """
    counts=df[target].value_counts()
    # Ignore categories that are declared but never observed; otherwise the
    # minimum would be 0 and every class would be emptied.
    counts=counts[counts>0]
    if counts.empty:
      return df
    mini=int(counts.min())

    pieces=[]
    for _,group in df.groupby(target):
      if len(group)==0:
        continue
      if len(group)>mini:
        group=resample(group,random_state=42,replace=False,n_samples=mini)
      pieces.append(group)
    return pd.concat(pieces)

  def preprocess(self,df,target):
    """Drop the "Unnamed: 0" column if present, forward-fill nulls, de-duplicate.

    The caller's frame is never modified; use the returned frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.
    target : column label
        Not used by this method; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame. A null in the very first row has nothing to be filled
        from and therefore stays null.
    """
    if "Unnamed: 0" in df.columns:
      df=df.drop("Unnamed: 0",axis=1)
    # `ffill()` replaces the deprecated `fillna(method="ffill")` spelling.
    return df.ffill().drop_duplicates()

  def check_outliers(self,df):
    """Show one box plot per numeric column of `df`, titled with the column name."""
    for col in self._numeric_columns(df):
      # Pass the data by keyword so the plot does not depend on how a given
      # seaborn release interprets the first positional argument.
      sns.boxplot(x=df[col])
      plt.title(col)
      plt.show()


  def remove_outliers(self,df,exclude=None):
    """Replace IQR outliers in numeric columns with the column median.

    A value is treated as an outlier when it lies below q1 - 1.5*iqr or above
    q3 + 1.5*iqr. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean (modified in place).
    exclude : column label or iterable of labels, optional
        Columns to leave untouched. Pass the target column here: for a
        column whose q1 equals q3 the IQR is 0, so every value different from
        that quartile is overwritten by the median.

    Returns
    -------
    pandas.DataFrame
        The same `df` object.
    """
    if exclude is None:
      skipped=set()
    elif isinstance(exclude,str):
      skipped={exclude}
    else:
      skipped=set(exclude)

    for col in self._numeric_columns(df):
      if col in skipped:
        continue
      q1=df[col].quantile(0.25)
      q3=df[col].quantile(0.75)
      iqr=q3-q1
      lower=q1-(1.5*iqr)
      upper=q3+(1.5*iqr)
      med=df[col].median()
      is_outlier=(df[col]<lower) | (df[col]>upper)
      df[col]=np.where(is_outlier,med,df[col])
    return df

  def encoder(self,df,lst):                                 #  No return
    """Label-encode the columns listed in `lst`, in place.

    Each column gets its own freshly fitted `LabelEncoder`; the encoders are
    not kept.
    """
    for col in lst:
      df[col]=preprocessing.LabelEncoder().fit_transform(df[col])

  def data_split(self,df,target,size):
    """Split `df` into train/test features and labels and print the sizes.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set including the target column.
    target : column label
        Column used as `y`; every other column goes into `X`.
    size : float or int
        Forwarded to `train_test_split` as `test_size`.

    Returns
    -------
    tuple
        `(x_train, x_test, y_train, y_test)`, split with `random_state=42`.
    """
    X=df.drop(columns=[target])
    y=df[target]
    x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=size,random_state=42)
    print(f'Records in dataset: {df.shape[0]}')
    print(f'Records in train dataset: {x_train.shape[0]}')
    print(f'Records in test dataset: {x_test.shape[0]}')
    return (x_train,x_test,y_train,y_test)

  def metrics(self,y_test,res):
    """Print accuracy, precision, recall and F1 (all as percentages) and plot
    the confusion matrix as an annotated heat map.

    Parameters
    ----------
    y_test : array-like
        True labels.
    res : array-like
        Predicted labels.
    """
    print(f"Accuracy score={accuracy_score(y_test,res)*100}")
    # Scaled by 100 like the other scores so all four lines share one unit.
    print(f"precision={precision_score(y_test,res)*100}")
    print(f"Recall={recall_score(y_test,res)*100}")
    print(f"F1 score={f1_score(y_test,res)*100}")
    print("Confusion Matrix=")
    conf_matrix=confusion_matrix(y_test,res)
    # fmt="d" prints the cell counts as plain integers.
    sns.heatmap(conf_matrix,annot=True,fmt="d")
    plt.show()

  def model_execute(self,models,tup):
    """Fit every model on the training split and report its test metrics.

    Parameters
    ----------
    models : dict
        Maps a name to an estimator exposing `fit` and `predict`.
    tup : tuple
        `(x_train, x_test, y_train, y_test)` as returned by `data_split`.
    """
    x_train,x_test,y_train,y_test=tup
    for clf in models.values():
      clf.fit(x_train,y_train)
      res=clf.predict(x_test)
      print(f"{clf}")
      print("\n")
      self.metrics(y_test,res)

In [None]:
# Candidate classifiers keyed by a short name. `dec` is used both as the
# stand-alone "dt" model and as the base learner handed to AdaBoost.
# The tree and the forest are seeded with 42, matching the other seeded calls
# in this notebook, so repeated runs give the same models.
dec=DecisionTreeClassifier(random_state=42)
models={
    "Logreg":LogisticRegression(),
    "Rf":RandomForestClassifier(random_state=42),
    "dt":dec,
    "knn":KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, while the first positional slot stayed the same.
    "ada":AdaBoostClassifier(dec,n_estimators=50,random_state=42),
    "xgb":XGBClassifier(),
}