In [None]:
import html
import io
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import ydata_profiling as pf
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
warnings.filterwarnings('ignore')

from ipywidgets import widgets
from IPython.display import display, HTML, clear_output

class LoanFraudDetectionNotebook:
    """Widget-based notebook interface for analysing an uploaded loan CSV.

    Constructing an instance displays a file-upload button, a "Process Data"
    button and an output area.  Once a CSV has been uploaded and the button
    is clicked, the data is summarised, profiled, plotted and used to fit an
    ``XGBClassifier`` that predicts the ``not.fully.paid`` column.

    The plotting and training steps read the columns ``purpose``,
    ``not.fully.paid``, ``credit.policy`` and ``fico`` from the uploaded data.
    """

    # Display names for the values of the 'not.fully.paid' column
    # (1 is plotted as "Not Fully Paid", 0 as "Fully Paid").
    REPAYMENT_LABELS = {0: 'Paid', 1: 'Not Paid'}

    def __init__(self):
        # Populated by on_upload_change / process_data respectively.
        self.file_name = None
        self.df = None
        self.create_widgets()

    def create_widgets(self):
        """Create the upload/process widgets and display them."""
        self.upload_button = widgets.FileUpload(accept='.csv', multiple=False)
        self.upload_button.observe(self.on_upload_change, names='value')
        self.process_button = widgets.Button(description="Process Data")
        self.process_button.on_click(self.on_process_click)
        # Processing is only enabled once a file has been selected.
        self.process_button.disabled = True
        self.output = widgets.Output()

        display(HTML("<h1>Loan Fraud Detection System</h1>"))
        display(self.upload_button, self.process_button, self.output)

    @staticmethod
    def _extract_upload(value, file_name=None):
        """Return ``(name, content_bytes)`` for one uploaded file.

        Supports both shapes of ``FileUpload.value``: a dict keyed by file
        name whose entries hold ``'content'``, and a sequence of dicts that
        each carry ``'name'`` and ``'content'``.

        Args:
            value: The ``value`` of the upload widget.
            file_name: Preferred file name; the first upload is used when it
                is ``None`` or not present in ``value``.

        Raises:
            ValueError: If ``value`` contains no uploaded file.
        """
        if not value:
            raise ValueError("No file has been uploaded.")
        if isinstance(value, dict):
            name = file_name if file_name in value else next(iter(value))
            content = value[name]['content']
        else:
            entries = list(value)
            entry = next(
                (item for item in entries if item['name'] == file_name),
                entries[0],
            )
            name = entry['name']
            content = entry['content']
        # The content may be a memoryview; normalise it to bytes.
        return name, bytes(content)

    def _numeric_corr(self):
        """Return the correlation matrix of the numeric/boolean columns.

        Text columns such as ``purpose`` are excluded so that ``corr`` does
        not fail trying to convert strings to floats.
        """
        return self.df.select_dtypes(include=['number', 'bool']).corr()

    def on_upload_change(self, change):
        """Remember the uploaded file's name and enable processing."""
        if change['new']:
            self.file_name, _ = self._extract_upload(change['new'])
            self.process_button.disabled = False
            with self.output:
                clear_output()
                print(f"Selected file: {self.file_name}")

    def on_process_click(self, b):
        """Button callback: clear the output area and run the pipeline."""
        with self.output:
            clear_output()
            print("Processing data...")
            self.process_data()

    def process_data(self):
        """Load the uploaded CSV and run overview, EDA, plots and training.

        Any exception raised along the way is reported as a printed message
        rather than propagated, so a failure does not break the widget UI.
        """
        try:
            _, content = self._extract_upload(
                self.upload_button.value, self.file_name
            )
            self.df = pd.read_csv(io.BytesIO(content))
            self.display_data_overview()
            self.perform_eda()
            self.create_visualizations()
            self.train_models()
            print("Data processing completed successfully!")
        except Exception as e:
            print(f"An error occurred: {str(e)}")

    def display_data_overview(self):
        """Display head, shape, null counts, ``info()`` and ``describe()``."""
        display(HTML("<h2>Data Overview</h2>"))
        display(HTML("<h3>First Five Rows:</h3>"))
        display(self.df.head())
        display(HTML(f"<p>Shape: {self.df.shape}</p>"))
        display(HTML("<h3>Null Values:</h3>"))
        display(self.df.isnull().sum())
        display(HTML("<h3>DataFrame Info:</h3>"))
        buffer = io.StringIO()
        self.df.info(buf=buffer)
        # info() output begins with "<class '...'>", which would be parsed as
        # an HTML tag unless it is escaped.
        display(HTML(f"<pre>{html.escape(buffer.getvalue())}</pre>"))
        display(HTML("<h3>Descriptive Statistics:</h3>"))
        display(self.df.describe().T)

    def perform_eda(self):
        """Write a profiling report to disk and plot a correlation matrix."""
        display(HTML("<h2>Exploratory Data Analysis</h2>"))
        profile = pf.ProfileReport(self.df)
        profile.to_file("EDA_Report.html")
        print("EDA Report generated and saved as 'EDA_Report.html'")

        plt.figure(figsize=(10, 8))
        sns.heatmap(self._numeric_corr(), cmap='coolwarm', annot=True)
        plt.title('Correlation Matrix')
        plt.show()

    def create_visualizations(self):
        """Render every plot of the visualisation section, in order."""
        display(HTML("<h2>Visualizations</h2>"))
        self.plot_loan_purpose_repayment()
        self.plot_credit_policy_fico()
        self.plot_loan_repayment_fico()
        self.plot_loan_purpose_pie()
        self.plot_loan_repayment_pie()
        self.plot_correlation_heatmap()

    def plot_loan_purpose_repayment(self):
        """Count plot of ``purpose`` split by ``not.fully.paid``."""
        plt.figure(figsize=(12, 6))
        sns.countplot(x='purpose', hue='not.fully.paid', data=self.df)
        plt.title('Count of Customer based on Loan Purpose and Repayment')
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    def plot_credit_policy_fico(self):
        """Histograms of ``fico`` for ``credit.policy`` equal to 1 and 0."""
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
        self.df[self.df['credit.policy'] == 1]['fico'].hist(ax=ax1, bins=30, alpha=0.5)
        ax1.set_title('Distribution of Credit Policy [1] & FICO')
        self.df[self.df['credit.policy'] == 0]['fico'].hist(ax=ax2, bins=30, alpha=0.5)
        ax2.set_title('Distribution of Credit Policy [0] & FICO')
        plt.tight_layout()
        plt.show()

    def plot_loan_repayment_fico(self):
        """Histograms of ``fico`` for ``not.fully.paid`` equal to 1 and 0."""
        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 8))
        self.df[self.df['not.fully.paid'] == 1]['fico'].hist(ax=ax1, bins=30, alpha=0.5)
        ax1.set_title('Distribution of Not Fully Paid & FICO')
        self.df[self.df['not.fully.paid'] == 0]['fico'].hist(ax=ax2, bins=30, alpha=0.5)
        ax2.set_title('Distribution of Fully Paid & FICO')
        plt.tight_layout()
        plt.show()

    def plot_loan_purpose_pie(self):
        """Pie chart of the share of each ``purpose`` value."""
        plt.figure(figsize=(10, 8))
        self.df['purpose'].value_counts().plot.pie(autopct='%1.1f%%')
        plt.title('Pie Representation on Percentage of Loan Purpose')
        plt.show()

    def plot_loan_repayment_pie(self):
        """Pie chart of the share of paid versus not-paid loans."""
        plt.figure(figsize=(10, 8))
        counts = self.df['not.fully.paid'].value_counts()
        # value_counts() orders by frequency, so labels are derived from the
        # actual values instead of assuming that 0 comes first.
        labels = [self.REPAYMENT_LABELS.get(value, str(value)) for value in counts.index]
        counts.plot.pie(autopct='%1.1f%%', labels=labels)
        plt.title('Proportion of Customer with Paid and Not Paid')
        plt.show()

    def plot_correlation_heatmap(self):
        """Annotated heat map of the numeric-column correlations."""
        plt.figure(figsize=(12, 10))
        sns.heatmap(self._numeric_corr(), cmap='coolwarm', annot=True)
        plt.title('Correlation Heat Map')
        plt.show()

    def train_models(self):
        """Fit an XGBoost classifier, report accuracy and save predictions.

        ``purpose`` is one-hot encoded, 30% of the rows are held out for the
        accuracy figure, feature importances are plotted, and predictions for
        every row are written to ``Loan_Status_Prediction.csv``.
        """
        display(HTML("<h2>Model Results</h2>"))
        df1 = pd.get_dummies(self.df, columns=['purpose'], drop_first=True)
        X = df1.drop(['not.fully.paid'], axis=1)
        y = df1['not.fully.paid']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
        model = XGBClassifier(random_state=42)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"Model Accuracy: {accuracy:.2%}")

        plt.figure(figsize=(10, 6))
        feature_importance = model.feature_importances_
        feature_names = X.columns
        # Ascending sort so the most important feature ends up at the top of
        # the horizontal bar chart.
        sorted_idx = np.argsort(feature_importance)
        pos = np.arange(sorted_idx.shape[0]) + .5
        plt.barh(pos, feature_importance[sorted_idx], align='center')
        plt.yticks(pos, feature_names[sorted_idx])
        plt.xlabel('Feature Importance')
        plt.title('XGBoost Feature Importance')
        plt.show()

        # NOTE: these predictions cover the full data set, including the rows
        # the model was trained on.
        final_predictions = model.predict(X)
        final_df = self.df.copy()
        final_df['Predicted_Loan_Status'] = final_predictions
        final_df.to_csv('Loan_Status_Prediction.csv', index=False)
        print("Predictions saved to 'Loan_Status_Prediction.csv'")

# Create the notebook interface. Constructing the object displays the upload
# button, the "Process Data" button and the output area.
loan_fraud_detection = LoanFraudDetectionNotebook()


In [None]:
# Instantiate the interface again in this cell. This rebinds
# `loan_fraud_detection` and displays a fresh set of widgets.
loan_fraud_detection = LoanFraudDetectionNotebook()