In [59]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier

In [60]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [69]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto', probability=True),
        'params': {
            'svc__C': [1, 10, 100, 1000],
            'svc__kernel': ['rbf', 'linear']
        }
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params': {
            'randomforestclassifier__n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'logisticregression__C': [1, 5, 10]
        }
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'decisiontreeclassifier__max_depth': [None, 5, 10, 20],
            'decisiontreeclassifier__min_samples_split': [2, 5, 10],
            'decisiontreeclassifier__min_samples_leaf': [1, 2, 4]
        }
    }
}

In [84]:
def model_compare(X_train,y_train):
    scores = []
    best_estimators = {}
    import pandas as pd
    for algo, mp in model_params.items():
        pipe = make_pipeline(StandardScaler(), mp['model'])
        clf =  GridSearchCV(pipe, mp['params'], cv=5, return_train_score=False)
        clf.fit(X_train, y_train)
        scores.append({
            'model': algo,
            'best_score': clf.best_score_,
            'best_params': clf.best_params_
        })
        best_estimators[algo] = clf.best_estimator_
    models = pd.DataFrame(scores,columns=['model','best_score','best_params'])
    return best_estimators,models

In [85]:
def makePrediction(csv, userInputs):
    # Read the csv file
    df = pd.read_csv(csv)

    # Drop rows with NaN values in the target variable
    df = df.dropna(subset=[df.columns[-1]])

    # Getting Independent features
    x_cols = df.iloc[:, :-1]

    # Handling missing data for numerical columns
    numerical_columns = x_cols.select_dtypes(include=['int64', 'float64'])
    for column in numerical_columns.columns:
        x_cols.loc[:, column].fillna(x_cols[column].mean(), inplace=True)

    # Handling missing data and encoding categorical columns
    categorical_columns = x_cols.select_dtypes(include=['object'])
    for column in categorical_columns.columns:
        x_cols.loc[:, column].fillna(x_cols[column].mode().iloc[0], inplace=True)
        x_cols = pd.get_dummies(x_cols, columns=[column])

    # Encode boolean columns
    x_cols = x_cols.replace({True: 1, False: 0})

    # Feature Scaling
    sc = StandardScaler()
    x_cols_scaled = sc.fit_transform(x_cols)

    # Model Training (Decision Tree)
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(x_cols_scaled, df.iloc[:, -1])
    
    # Preprocessing the user input
    user_inputs_dict = {}
    for i, col in enumerate(df.columns[:-1]):
        user_inputs_dict[col] = float(userInputs[i]) if i < len(userInputs) else 0
    
    user_inputs_df = pd.DataFrame([user_inputs_dict])
    user_inputs_scaled = sc.transform(user_inputs_df)

    # Predicting the result
    prediction = classifier.predict(user_inputs_scaled)
    best_estimators,models = model_compare(x_cols_scaled, df.iloc[:, -1])
    
    return best_estimators,models,prediction[0]

In [86]:
csv = pd.read_csv("bank-loan.csv")
csv.head()

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


In [87]:
# Bank Loan Dataset : 1 - approved, 0 - Not Approved
user_input = ["41", "3", "17", "12", "176", "9.3", "11.35", "5.00"]
best_estimators,models,prediction = makePrediction("bank-loan.csv", user_input)
print("Prediction:", prediction)

Prediction: 1.0


In [88]:
models

Unnamed: 0,model,best_score,best_params
0,svm,0.808571,"{'svc__C': 10, 'svc__kernel': 'linear'}"
1,random_forest,0.782857,{'randomforestclassifier__n_estimators': 10}
2,logistic_regression,0.81,{'logisticregression__C': 5}
3,decision_tree,0.76,"{'decisiontreeclassifier__max_depth': 5, 'deci..."


In [89]:
best_estimators

{'svm': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('svc',
                  SVC(C=10, gamma='auto', kernel='linear', probability=True))]),
 'random_forest': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_estimators=10))]),
 'logistic_regression': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('logisticregression',
                  LogisticRegression(C=5, solver='liblinear'))]),
 'decision_tree': Pipeline(steps=[('standardscaler', StandardScaler()),
                 ('decisiontreeclassifier',
                  DecisionTreeClassifier(max_depth=5, min_samples_leaf=2))])}

In [None]:
# best_estimators['svm'].score(X_test,y_test)

In [90]:
# Iris Dataset
user_input = ["5.9","3","5.1","1.8"]
best_estimators,models,prediction = makePrediction("iris.csv", user_input)
print("Prediction:", prediction)

Prediction: Virginica


In [91]:
models

Unnamed: 0,model,best_score,best_params
0,svm,0.973333,"{'svc__C': 10, 'svc__kernel': 'rbf'}"
1,random_forest,0.966667,{'randomforestclassifier__n_estimators': 5}
2,logistic_regression,0.94,{'logisticregression__C': 10}
3,decision_tree,0.966667,"{'decisiontreeclassifier__max_depth': None, 'd..."


In [92]:
# Diabetes Dataset : 1 - Diabetes , 0 - Not Diabetes
user_input = ["3","78","50","32","88","31","0.248","26"]
best_estimators,models,prediction = makePrediction("diabetes.csv", user_input)
print("Prediction:", prediction)

Prediction: 1


In [93]:
models

Unnamed: 0,model,best_score,best_params
0,svm,0.773483,"{'svc__C': 1, 'svc__kernel': 'linear'}"
1,random_forest,0.744818,{'randomforestclassifier__n_estimators': 10}
2,logistic_regression,0.770885,{'logisticregression__C': 1}
3,decision_tree,0.751379,"{'decisiontreeclassifier__max_depth': 5, 'deci..."
