In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_excel("satisfaction_2015.xlsx")

In [4]:
df.drop(columns="id",inplace=True)

In [13]:
df.dropna(inplace=True)

In [5]:
def One_Hot_Encoder(df,to_be_encoded):

    ### Input variable checks & errors

    wrong_column_name=[]
    
    if isinstance(df, pd.core.frame.DataFrame)==False:
        print("The first variable is not a pandas data frame, please enter a pandas data frame")
        return
    
    if len(df)==0:
        print("The data frame is empty, please check your data frame.")
        return
    
    if len(to_be_encoded)==0:
        print("The list of columns, to be encoded, is empty. Please check your list.")
        return
    
    for i in to_be_encoded:
        if i not in df.columns.tolist():
            wrong_column_name.append(i)
    
    if len(wrong_column_name)>0:
        if len(wrong_column_name)==1:
            print("There is no such a column as: {}".format(wrong_column_name))
        if len(wrong_column_name)>1:
            print("There are no columns as: {}".format(wrong_column_name))
        return
    
    ### N-1 Cluster Encoding

    for i in to_be_encoded:
        dummies=pd.get_dummies(df[i],drop_first=True,prefix=i).astype(int)
        df=pd.concat([df,dummies],axis=1)
        df.drop(columns=i,inplace=True)
    
    print("encoded_df is prepared")
    return df

In [6]:
a=["satisfaction_v2","Gender","Customer Type","Class","Type of Travel"]

In [15]:
encoded_df=One_Hot_Encoder(df,a)

encoded_df is prepared


In [16]:
encoded_df

Unnamed: 0,Age,Flight Distance,Inflight wifi service,Departure/Arrival time convenient,Ease of Online booking,Gate location,Food and drink,Online boarding,Seat comfort,Inflight entertainment,...,Inflight service,Cleanliness,Departure Delay in Minutes,Arrival Delay in Minutes,satisfaction_v2_satisfied,Gender_Male,Customer Type_disloyal Customer,Class_Eco,Class_Eco Plus,Type of Travel_Personal Travel
0,56,369,0,2,0,4,3,0,3,3,...,4,3,0,0.0,1,1,1,1,0,1
1,49,2486,0,2,1,4,2,1,3,2,...,3,2,0,0.0,1,1,1,1,0,1
2,55,1448,0,3,0,4,3,0,3,3,...,3,3,0,0.0,1,1,1,1,0,1
3,36,1501,0,4,0,3,4,0,4,4,...,5,4,0,0.0,1,0,1,1,0,1
4,55,577,0,5,0,3,3,5,3,3,...,4,3,0,0.0,1,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,35,2592,5,5,5,5,4,2,4,5,...,5,4,0,0.0,1,0,0,0,0,0
129876,11,610,5,1,1,1,5,5,5,5,...,5,5,82,65.0,1,0,0,1,0,0
129877,46,86,1,1,1,1,5,5,5,5,...,2,2,0,0.0,1,1,0,0,0,0
129878,35,83,1,1,1,1,5,5,5,5,...,5,5,0,0.0,1,0,0,0,0,0


## __Reference Modeling__

In [9]:
def Ref_Modeling(df, target_column, feature_columns, regression_type):
    """
    Perform regression based on user input.

    Parameters:
    - df (pandas.core.frame.DataFrame): Input DataFrame.
    - target_column (str): Name of the target variable to predict.
    - feature_columns (list): List of feature variables.
    - regression_type (str): Type of regression. Options: "lr" (linear regression), "bc" (binary classification), "mcc" (multiclass classification).
    - dataframe must not have any missing value
    - it would be better to use an encoded data frame for categoric features
    
    Returns:
    - None
    """
    #Import necessary libraries
    from sklearn.model_selection import train_test_split
    from sklearn.linear_model import LinearRegression
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
    import matplotlib.pyplot as plt


    # Input variable checks & errors
    if not isinstance(df, pd.core.frame.DataFrame):
        print("The input is not a pandas data frame. Please enter a pandas data frame.")
        return
    
    if len(df) == 0:
        print("The data frame is empty. Please check your data frame.")
        return
    
    if target_column not in df.columns or not set(feature_columns).issubset(df.columns):
        print("Target or feature column(s) not found in the DataFrame. Please check your column names.")
        return
    
    if regression_type not in ["lr", "bc", "mcc"]:
        print("Invalid regression type. Please choose one of: 'lr' (linear regression), 'bc' (binary classification), 'mcc' (multiclass classification).")
        return
    
    # Split the data into training and testing sets
    X = df[feature_columns]
    y = df[target_column]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train the regression model based on user input
    if regression_type == "lr":
        model = LinearRegression()
    elif regression_type == "bc":
        model = LogisticRegression()
    elif regression_type == "mcc":
        model = LogisticRegression(multi_class="auto")

    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate and display model metrics
    print("\nModel Evaluation Metrics:")
    if regression_type == "lr":
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print("Mean Squared Error: {:.2f}".format(mse))
        print("R-squared: {:.2f}".format(r2))
    elif regression_type == "bc" or regression_type == "mcc":
        accuracy = accuracy_score(y_test, y_pred)
        print("Accuracy: {:.2f}%".format(accuracy * 100))
        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

    #Plot the regression line for linear regression
    if regression_type == "lr":
        plt.figure(figsize=(10, 6))
        plt.scatter(X_test.iloc[:, 0], y_test, color='blue', label='Actual')
        plt.plot(X_test.iloc[:, 0], y_pred, color='red', linewidth=2, label='Regression Line')
        plt.title('Linear Regression')
        plt.xlabel(feature_columns[0])
        plt.ylabel(target_column)
        plt.legend()
        plt.show()

# Kullanım örneği:
# perform_regression(df, target_column="Target", feature_columns=["Feature1", "Feature2"], regression_type="lr")

In [19]:
Ref_Modeling(encoded_df,"satisfaction_v2_satisfied",encoded_df.drop(columns="satisfaction_v2_satisfied").columns,"bc")


Model Evaluation Metrics:
Accuracy: 80.89%

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.81      0.83     14701
           1       0.76      0.81      0.79     11197

    accuracy                           0.81     25898
   macro avg       0.81      0.81      0.81     25898
weighted avg       0.81      0.81      0.81     25898



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
