In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error , r2_score , mean_absolute_error , accuracy_score , precision_score , recall_score
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor

In [None]:
def load_data():
    """Prompt for a data file path and load the file into a DataFrame.

    The path is read from standard input. Surrounding whitespace and wrapping
    quote characters are removed before use, and the extension is matched
    case-insensitively, so ``"  C:\\data\\SALES.CSV  "`` is accepted.

    Returns:
        pandas.DataFrame: contents of the ``.csv`` (via ``pd.read_csv``) or
        ``.xlsx`` (via ``pd.read_excel``) file.

    Raises:
        ValueError: if the path does not end in ``.csv`` or ``.xlsx``.
    """
    raw_path = input("Enter the path to the data file: ")
    # Pasted paths frequently carry stray spaces or wrapping quotes.
    file_path = raw_path.strip().strip('"\'').strip()
    suffix_probe = file_path.lower()
    if suffix_probe.endswith('.csv'):
        df = pd.read_csv(file_path)
    elif suffix_probe.endswith('.xlsx'):
        df = pd.read_excel(file_path)
    else:
        raise ValueError(f"Unsupported file type: {file_path}")
    return df

In [None]:
# Data preprocessing
def data_preprocessing(df):
    """Fill missing values and label-encode object columns of ``df``.

    Columns containing missing values are filled with the column mean when the
    dtype is ``int64``/``float64``, and with the most frequent value otherwise.
    Every ``object`` column is then replaced by integer codes produced by a
    ``LabelEncoder`` fitted on that column alone.

    Args:
        df: pandas DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after imputation and encoding.
    """
    na_cols = [col for col in df.columns if df[col].isna().any()]

    for col in na_cols:
        if df[col].dtype in ['int64', 'float64']:
            fill_value = df[col].mean()
        else:
            # mode() returns a Series (there may be ties); passing that Series
            # to fillna would align on index labels instead of filling every
            # gap, so take the first mode as a scalar.
            modes = df[col].mode()
            if modes.empty:
                # Column is entirely missing: there is nothing to fill with.
                continue
            fill_value = modes.iloc[0]
        # Assign the result back rather than calling fillna(inplace=True) on
        # df[col], which may operate on a temporary copy and leave df unchanged.
        df[col] = df[col].fillna(fill_value)

    # Encoding: a fresh encoder per column keeps each column's classes separate.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    return df

In [None]:
def data_info(df):
    """Print the dimensions of ``df`` followed by its ``describe()`` table.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. Everything is written to standard output.
    """
    row_count, column_count = df.shape

    # Dataset dimensions, one line each.
    for label, count in (("rows", row_count), ("columns", column_count)):
        print(f"==> Number of {label}: {count}")

    # Descriptive statistics as produced by DataFrame.describe().
    print("==> Data Summary:\n")
    summary = df.describe()
    print(summary)

In [None]:
def choose_features(df):
    """Interactively pick the feature columns and the target column.

    The column index of ``df`` is printed, then the user is prompted until a
    valid, non-empty, comma-separated list of feature names is entered, and
    prompted again until a valid target name is entered. Names are stripped of
    surrounding whitespace; blank entries (e.g. from a trailing comma) and
    repeated names are dropped, keeping first-seen order.

    Args:
        df: pandas DataFrame whose columns are offered for selection.

    Returns:
        tuple: ``(features, target)`` where ``features`` is a list of column
        names and ``target`` is a single column name.
    """
    print('\n==> Columns In Dataset:')
    print(df.columns)
    print('*********************************')

    while True:
        x_input = input('Please choose columns to assign to x (comma separated): ')
        stripped = (name.strip() for name in x_input.split(','))
        # dict.fromkeys removes duplicates while preserving the entered order.
        features = list(dict.fromkeys(name for name in stripped if name))
        # An empty selection must be rejected explicitly: all([]) is True.
        if features and all(col in df.columns for col in features):
            break
        print('Invalid columns, please try again')

    while True:
        # Strip the target the same way the feature names are stripped.
        target = input('Please choose a column to assign to y: ').strip()
        if target in df.columns:
            break
        print('Invalid column, please try again')

    return features, target

In [None]:
def model_and_algorithm(x, y, df):
    """Ask which model family and algorithm to run, then run it.

    The user first chooses ``regression`` or ``classification`` and is shown
    the matching menu, then chooses an algorithm by number (1-5). Invalid
    answers are re-prompted. If the chosen algorithm raises, the error is
    printed and the algorithm prompt is shown again.

    Only the algorithm run itself is wrapped in the error handler: an
    ``EOFError`` from ``input`` (closed stdin) propagates to the caller instead
    of being swallowed and re-prompted forever, and a ``ValueError`` raised
    inside an algorithm is reported as an error rather than as a bad number.

    Args:
        x: list of feature column names.
        y: name of the target column.
        df: pandas DataFrame holding the data.

    Returns:
        None, once an algorithm has run to completion.
    """
    menus = {
        'regression': (
            'These are the main algorithms in regression:',
            '1.Linear Regression \n2.Polynomial_Features\n3.Regularization\n4.Gradient_Descent\n5.Multilinear Regression',
        ),
        'classification': (
            'These are the main algorithms in classification:',
            '1.Logistic Regression\n2.Support Vector Machine\n3.K Nearest Neighbor (KNN)\n4.Decision Tree\n5.Random Forest',
        ),
    }
    # Menu number -> name of the method to call on the runner object.
    methods = {
        'regression': {
            1: 'linear_regression',
            2: 'polynomial_features',
            3: 'Regularization',
            4: 'gradient_descent',
            5: 'multilinear_regression',
        },
        'classification': {
            1: 'logistic_regression',
            2: 'support_vector_machine',
            3: 'k_nearest_neighbor',
            4: 'decision_tree',
            5: 'random_forest',
        },
    }

    while True:
        model = input('Please choose machine learning model (regression/classification): ').strip().lower()
        if model in menus:
            heading, options = menus[model]
            print('#############################################################################')
            print(heading)
            print(options)
            break
        print('Wrong type, please choose a valid model')

    while True:
        raw_choice = input(f'Please type the number of the {model} algorithm you want to apply: ')
        try:
            algorithm = int(raw_choice)
        except ValueError:
            print('Invalid input, please enter a valid number')
            continue
        # Validate before building the runner so a bad number costs nothing.
        if algorithm not in methods[model]:
            print('Invalid number, please try again')
            continue

        try:
            if model == 'regression':
                runner = Regression(x, y, df)
            else:
                # Classification uses every column except the target as a
                # feature, so hand it only the columns the user selected.
                feature_cols = [col for col in x if col != y]
                runner = Classification(df[feature_cols + [y]], y)
            getattr(runner, methods[model][algorithm])()
        except Exception as e:
            print(f'Error: {e}')
            continue
        return

In [None]:
class Regression:
    """Single-feature regression demonstrations.

    Each public method loops over the selected feature columns and, for every
    column on its own, fits one or more models of the target against that
    column, prints scores and shows a scatter plot with the fitted curve.
    Models are fitted and scored on the same rows (there is no train/test
    split), so the printed scores are in-sample.

    Args:
        x: iterable of feature column names.
        y: name of the target column.
        df: pandas DataFrame containing those columns.
    """

    # Regularisation strengths tried by Regularization(), in reporting order.
    _ALPHAS = (0.00001, 0.0001, 0.001, 0.01, 0.1, 0, 1, 10, 100, 1000)
    _RULE = '*********************************'

    def __init__(self, x, y, df):
        self.df = df
        self.x = [str(col) for col in x]
        self.y = self.df[y]

    # ------------------------------------------------------------------ helpers

    def _column(self, col):
        """Return column ``col`` as an ``(n_samples, 1)`` array for scikit-learn."""
        return self.df[col].values.reshape(-1, 1)

    @staticmethod
    def _scaled_poly(x_features, degree):
        """Expand ``x_features`` to polynomial terms of ``degree`` and standardise them."""
        x_poly = PolynomialFeatures(degree=degree).fit_transform(x_features)
        return StandardScaler().fit_transform(x_poly)

    @staticmethod
    def _sort_by_x(x_features, preds):
        """Return ``(x, preds)`` reordered by ascending x.

        A line plot joins points in the order given; sorting makes a fitted
        curve trace left to right instead of zig-zagging between rows.
        """
        flat_x = np.asarray(x_features).ravel()
        order = np.argsort(flat_x, kind='stable')
        return flat_x[order], np.asarray(preds)[order]

    def _plot_fit(self, x_features, *curves):
        """Scatter the data, overlay each ``(preds, color)`` curve, and show the figure."""
        plt.scatter(x_features, self.y)
        for preds, color in curves:
            xs, ys = self._sort_by_x(x_features, preds)
            plt.plot(xs, ys, color=color)
        plt.show()

    def _print_errors(self, preds):
        """Print MSE, R^2 and MAE of ``preds`` against the target."""
        print(f'Mean Squared Error = {mean_squared_error(self.y, preds)}')
        print(f'R^2 Score = {r2_score(self.y, preds)}')
        print(f'Mean Absolute Error = {mean_absolute_error(self.y, preds)}')

    def _alpha_sweep(self, estimator_cls, design, rule_after_heading):
        """Fit ``estimator_cls(alpha=...)`` for every alpha and column, reporting R^2 and a plot.

        ``design`` maps a column name to ``(x_features, x_scale)``.
        """
        for alpha in self._ALPHAS:
            for col in self.x:
                x_features, x_scale = design[col]
                model = estimator_cls(alpha=alpha)
                model.fit(x_scale, self.y)
                preds_poly = model.predict(x_scale)
                print(f'Results for column {col} with alpha = {alpha}')
                if rule_after_heading:
                    print(self._RULE)
                print('R^2 Score after poly:', r2_score(self.y, preds_poly))
                self._plot_fit(x_features, (preds_poly, 'r'))
                print(self._RULE)

    # ------------------------------------------------------------- algorithms

    def linear_regression(self):
        """Fit an ordinary least-squares line per feature column and report it."""
        print('Linear Regression Algorithm')
        print(self._RULE)
        for col in self.x:
            x_features = self._column(col)
            lr = LinearRegression()
            lr.fit(x_features, self.y)
            print(f'Results for {col}:')
            print(self._RULE)
            print(f'Coefficients = {lr.coef_}')
            print(f'Intercept = {lr.intercept_}')
            prds = lr.predict(x_features)
            # Show one figure per column; without this every column would be
            # drawn onto the same axes.
            self._plot_fit(x_features, (prds, 'r'))
            self._print_errors(prds)
            print(self._RULE)

    def polynomial_features(self):
        """Compare a straight-line fit with a degree-3 polynomial fit per column.

        "before poly" is the R^2 of a plain linear fit on the raw column;
        "after poly" is the R^2 of a linear model on standardised degree-3
        polynomial terms. The printed coefficients, intercept and error
        metrics describe the polynomial model.
        """
        print('Polynomial Features Algorithm')
        print(self._RULE)
        for col in self.x:
            x_features = self._column(col)

            # Baseline: straight line on the untransformed feature.
            lr = LinearRegression()
            lr.fit(x_features, self.y)
            prds = lr.predict(x_features)

            # Cubic model on standardised polynomial terms.
            x_scale = self._scaled_poly(x_features, degree=3)
            lr_poly = LinearRegression()
            lr_poly.fit(x_scale, self.y)
            preds_poly = lr_poly.predict(x_scale)

            print(f'Results for {col}:')
            print(self._RULE)
            print(f'Coefficients = {lr_poly.coef_}')
            print(f'Intercept = {lr_poly.intercept_}')
            print('R^2 Score before poly:', r2_score(self.y, prds))
            print('R^2 Score after poly:', r2_score(self.y, preds_poly))
            # Green: baseline line. Red: polynomial fit.
            self._plot_fit(x_features, (prds, 'g'), (preds_poly, 'r'))
            self._print_errors(preds_poly)
            print(self._RULE)

    def Regularization(self):
        """Sweep Lasso, then Ridge, over a fixed alpha grid on degree-15 polynomial terms."""
        print('Regularization Algorithm')
        print(self._RULE)
        # The design matrix depends only on the column, not on alpha or the
        # estimator, so build it once per column.
        design = {}
        for col in self.x:
            x_features = self._column(col)
            design[col] = (x_features, self._scaled_poly(x_features, degree=15))

        print('Regularization By Lasso')
        print(self._RULE)
        self._alpha_sweep(Lasso, design, rule_after_heading=False)

        print('Regularization By Ridge ')
        print(self._RULE)
        self._alpha_sweep(Ridge, design, rule_after_heading=True)

    def gradient_descent(self):
        """Fit ``SGDRegressor`` per column, first on the standardised raw
        feature and then on standardised degree-15 polynomial terms."""
        print('Gradient Descent Algorithm')
        print(self._RULE)
        for col in self.x:
            x_features = self._column(col)

            # Pass 1: standardised raw feature.
            x_scaled_1 = StandardScaler().fit_transform(x_features)
            sgdr = SGDRegressor()
            sgdr.fit(x_scaled_1, self.y)
            preds_sgdr = sgdr.predict(x_scaled_1)
            self._plot_fit(x_features, (preds_sgdr, 'r'))
            print(f'Results for {col}')
            print(self._RULE)
            self._print_errors(preds_sgdr)

            # Pass 2: standardised degree-15 polynomial terms.
            x_scale = self._scaled_poly(x_features, degree=15)
            lr_poly = SGDRegressor()
            lr_poly.fit(x_scale, self.y)
            preds_poly = lr_poly.predict(x_scale)
            self._plot_fit(x_features, (preds_poly, 'r'))
            print(f'Results for {col}')
            self._print_errors(preds_poly)
            print(self._RULE)

    def multilinear_regression(self):
        """Per column: linear fit on the min-max-scaled feature, then a linear
        fit on min-max-scaled degree-4 polynomial terms of that scaled feature.

        NOTE(review): despite the name, each column is still fitted on its own
        rather than all selected columns jointly - confirm this is intended.
        """
        print('Multilinear Regression Algorithm')
        print(self._RULE)
        msc = MinMaxScaler()
        for col in self.x:
            x_features = self._column(col)
            x_scale = msc.fit_transform(x_features)
            print(f'Results for {col} after scaling:')
            print(self._RULE)
            print('x after scaling = ', x_scale[0:5])
            lr = LinearRegression()
            lr.fit(x_scale, self.y)
            preds = lr.predict(x_scale)
            print('R^2 Score =', r2_score(self.y, preds))
            self._plot_fit(x_features, (preds, 'blue'))

            x_poly = PolynomialFeatures(degree=4).fit_transform(x_scale)
            # fit_transform refits the scaler, so reusing msc here is safe.
            x_scaled_poly = msc.fit_transform(x_poly)
            lr.fit(x_scaled_poly, self.y)
            preds_poly = lr.predict(x_scaled_poly)
            print(f'Results for column {col} after polynomial features:')
            print('R^2 Score =', r2_score(self.y, preds_poly))
            self._plot_fit(x_features, (preds_poly, 'r'))
            print(self._RULE)

In [None]:
class Classification:
    """Classifier demonstrations on a standardised feature matrix.

    Every column of ``df`` except ``target`` is used as a feature. Each public
    method splits the data into train and test parts, fits one estimator, and
    for both parts shows a confusion matrix and prints accuracy, precision
    and recall.

    NOTE(review): the scaler is fitted on all rows before the split, so test
    rows influence the scaling statistics - confirm this is acceptable.

    Args:
        df: pandas DataFrame with feature columns and the target column.
        target: name of the target column.
        test_size: fraction of rows held out for testing (default 0.2).
        random_state: seed forwarded to ``train_test_split`` and to the tree
            and forest estimators; ``None`` (default) leaves them unseeded.
    """

    _RULE = '*********************************'

    def __init__(self, df, target, test_size=0.2, random_state=None):
        self.df = df
        self.target = target
        self.test_size = test_size
        self.random_state = random_state
        self.x = StandardScaler().fit_transform(self.df.drop(self.target, axis=1))
        self.y = self.df[self.target]

    # ------------------------------------------------------------------ helpers

    def _score_average(self):
        """Return the ``average`` mode for precision/recall.

        ``'binary'`` (the scikit-learn default) is kept for targets with at
        most two classes; ``'weighted'`` is used otherwise, because the binary
        mode raises on multiclass targets.
        """
        return 'binary' if self.y.nunique() <= 2 else 'weighted'

    def _report(self, split_name, cmap, y_true, y_pred):
        """Show the confusion matrix and print the scores for one split."""
        average = self._score_average()
        disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_true, y_pred))
        disp.plot(cmap=cmap)
        plt.title(f'Confusion matrix for {split_name}')
        plt.show()
        print(f'Accuracy Score for {split_name}', accuracy_score(y_true, y_pred))
        print(f'Precision Score for {split_name}', precision_score(y_true, y_pred, average=average))
        print(f'Recall Score for {split_name}', recall_score(y_true, y_pred, average=average))

    def _run(self, title, clf):
        """Print ``title``, fit ``clf`` on a fresh split and report both splits."""
        print(title)
        print(self._RULE)
        x_train, x_test, y_train, y_test = train_test_split(
            self.x, self.y,
            test_size=self.test_size,
            shuffle=True,
            random_state=self.random_state,
        )
        clf.fit(x_train, y_train)
        self._report('train', 'viridis', y_train, clf.predict(x_train))
        self._report('test', 'Blues', y_test, clf.predict(x_test))
        print(self._RULE)

    # ------------------------------------------------------------- algorithms

    def logistic_regression(self):
        """Fit and evaluate a ``LogisticRegression`` classifier."""
        self._run('Logistic Regression Algorithm', LogisticRegression())

    def support_vector_machine(self):
        """Fit and evaluate an ``SVC`` classifier."""
        self._run('Support Vector Machine Algorithm', SVC())

    def k_nearest_neighbor(self):
        """Fit and evaluate a ``KNeighborsClassifier``."""
        self._run('K Nearest Neighbor Algorithm', KNeighborsClassifier())

    def decision_tree(self):
        """Fit and evaluate a ``DecisionTreeClassifier``."""
        self._run(
            'Decision Tree Algorithm',
            DecisionTreeClassifier(random_state=self.random_state),
        )

    def random_forest(self):
        """Fit and evaluate a ``RandomForestClassifier``."""
        self._run(
            'Random Forest Algorithm',
            RandomForestClassifier(random_state=self.random_state),
        )

In [None]:
def main():
    """Run the interactive workflow from start to finish.

    Loads a data file, preprocesses it, prints a summary, asks for the
    feature and target columns, then asks for and runs a model.
    """
    print('Welcome to the simplified version of pycaret')
    dataset = data_preprocessing(load_data())
    data_info(dataset)
    features, target = choose_features(dataset)
    model_and_algorithm(features, target, dataset)


if __name__ == "__main__":
    main()