# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport
from dataprep.eda import create_report

# Load Data

In [None]:
FILE_PATH = ''  # TODO: Add file path

# Fail early with an actionable message while the placeholder above is still
# empty, instead of letting read_csv fail on an empty path.
if not FILE_PATH:
    raise ValueError("FILE_PATH is empty; set it to the path of the CSV file to load.")

# Load the CSV into the DataFrame analysed by the rest of the notebook
df = pd.read_csv(FILE_PATH)

# Data Overview

## General

In [None]:
# Display the first 5 rows of the DataFrame
print(df.head())

# Display a concise summary of the DataFrame.
# info() writes the summary itself and returns None, so wrapping it in
# print() would append a stray "None" line to the output.
df.info()

# Generate descriptive statistics of the DataFrame
print(df.describe())

## AutoEDA

### [Pandas Profiling](https://docs.profiling.ydata.ai/4.6/)

**Pandas Profiling** (now distributed as `ydata-profiling`) is a relatively mature tool that produces a complete analysis report directly from a DataFrame. The results are displayed in HTML format, and the analysis functions are quite powerful.

* Features: Field type analysis, variable distribution analysis, correlation analysis, missing value analysis, duplicate row analysis

* Time consumption: short

![Pandas Profiling](../data/img/ydata-profiling.gif)

In [None]:
# Build the profiling report for the whole DataFrame
profile = ProfileReport(df, title="Profiling Report")

# Render the report inline in the notebook; the assignment above on its own
# displays nothing.
profile.to_notebook_iframe()

### [Dataprep](https://dataprep.ai/)

**Dataprep** is a flexible and powerful tool. It lets you select specific columns for analysis and also supports interactive analysis inside the notebook.

* Features: Field type analysis, variable distribution analysis, correlation analysis, missing value analysis, interactive analysis.
* Time consumption: Longer

![Dataprep](../data/img/dataprep.png)

In [None]:
# Create the Dataprep EDA report for the whole DataFrame
report = create_report(df)

# Display the report through its show_browser() method
report.show_browser()

# Data Cleaning

* Check for missing values and handle them appropriately.
* Check for duplicate entries and decide whether to keep them or not.
* Check for data types of the columns and make necessary changes.

In [None]:
# Count the missing values in every column
print(df.isna().sum())

# Optional: fill missing values with the mean of each numeric column
# df.fillna(df.mean(numeric_only=True), inplace=True)

In [None]:
# Count the rows that are flagged as duplicates
print(df.duplicated().sum())

# Optional: remove the duplicate rows in place
# df.drop_duplicates(inplace=True)

In [None]:


# Show the data type of every column
print(df.dtypes)

# Optional: convert a column to the correct data type
# (example: 'column_name' to datetime)
# df['column_name'] = pd.to_datetime(df['column_name'])

In [None]:
# Check for outliers using the IQR method.
# Only numeric columns are considered: the quantiles and the < / > comparisons
# below are not defined for text columns and would raise on them.
numeric_df = df.select_dtypes(include='number')

Q1 = numeric_df.quantile(0.25)
Q3 = numeric_df.quantile(0.75)
IQR = Q3 - Q1

# Flag values lying more than 1.5 * IQR below Q1 or above Q3
condition = (numeric_df < (Q1 - 1.5 * IQR)) | (numeric_df > (Q3 + 1.5 * IQR))

# Print the number of outliers in each numeric column
print(condition.sum())

# Univariate Analysis

In [None]:
def perform_univariate_analysis(df):
    """
    Perform univariate analysis on all columns of a DataFrame.

    Numeric columns are shown as a histogram and a box plot of their
    non-missing values. String columns and pandas categorical columns are
    shown as a bar chart of their value counts. Columns of any other type
    are skipped.

    :param df: pandas DataFrame
    :return: None; progress messages are printed and figures are shown
    """
    for column in df.columns:
        series = df[column]

        # If the column is numerical
        if pd.api.types.is_numeric_dtype(series):
            print(f"\nPerforming univariate analysis on numerical column: {column}")

            # plt.boxplot does not ignore NaN, so plot only the observed values.
            values = series.dropna()
            if values.empty:
                # Nothing to draw for a column that is entirely missing.
                print(f"Skipping plots for {column}: no non-missing values")
                continue

            # Plot a histogram
            plt.figure(figsize=(10, 6))
            plt.hist(values, bins=30, color='skyblue')
            plt.title(f'Histogram of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')
            plt.show()

            # Plot a box plot
            plt.figure(figsize=(10, 6))
            plt.boxplot(values)
            plt.title(f'Box Plot of {column}')
            plt.ylabel(column)
            plt.show()

        # If the column is categorical: plain strings, or an explicit pandas
        # categorical dtype (which is_string_dtype does not report as string)
        elif pd.api.types.is_string_dtype(series) or isinstance(series.dtype, pd.CategoricalDtype):
            print(f"\nPerforming univariate analysis on categorical column: {column}")

            # Plot a bar chart
            plt.figure(figsize=(10, 6))
            series.value_counts().plot(kind='bar', color='skyblue')
            plt.title(f'Bar Chart of {column}')
            plt.xlabel(column)
            plt.ylabel('Frequency')
            plt.show()

# Run the univariate analysis on the loaded DataFrame
perform_univariate_analysis(df)

# Bivariate Analysis

In [None]:
def perform_bivariate_analysis(df):
    """
    Compare every unordered pair of columns in a DataFrame.

    A pair of numeric columns is drawn as a scatter plot; a pair of string
    columns is summarised as a printed two-way frequency table. Any other
    combination of column types is left out.

    :param df: pandas DataFrame
    """
    is_numeric = pd.api.types.is_numeric_dtype
    is_text = pd.api.types.is_string_dtype
    names = df.columns.tolist()

    # Pair each column only with the columns that come after it, so every
    # combination is visited exactly once.
    for position, column1 in enumerate(names):
        for column2 in names[position + 1:]:
            first = df[column1]
            second = df[column2]

            # Numeric vs. numeric: scatter plot
            if is_numeric(first) and is_numeric(second):
                print(f"\nPerforming bivariate analysis on numerical columns: {column1} and {column2}")

                plt.figure(figsize=(10, 6))
                plt.scatter(first, second, color='skyblue')
                plt.title(f'Scatter Plot of {column1} and {column2}')
                plt.xlabel(column1)
                plt.ylabel(column2)
                plt.show()
                continue

            # String vs. string: two-way frequency table
            if is_text(first) and is_text(second):
                print(f"\nPerforming bivariate analysis on categorical columns: {column1} and {column2}")
                print(pd.crosstab(first, second))

# Run the bivariate analysis on the loaded DataFrame
perform_bivariate_analysis(df)

# Multivariate Analysis

In [None]:
def perform_multivariate_analysis(df):
    """
    Perform multivariate analysis on the numeric columns of a DataFrame.

    Shows a seaborn pair plot of the DataFrame, then a heatmap of the
    correlation matrix computed from the numeric columns. If the DataFrame
    has no numeric columns, a message is printed and nothing is plotted.

    :param df: pandas DataFrame
    :return: None
    """
    # Correlation is only computed on numeric data; selecting those columns
    # up front keeps corr() from raising on text columns.
    numeric_df = df.select_dtypes(include='number')
    if numeric_df.shape[1] == 0:
        print("No numeric columns available for multivariate analysis.")
        return

    # Plot a pair plot for all numerical variables
    sns.pairplot(df)
    plt.show()

    # Calculate the correlation matrix
    corr = numeric_df.corr()

    # Plot a heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr, annot=True)
    plt.show()

# Run the multivariate analysis on the loaded DataFrame
perform_multivariate_analysis(df)

# Correlation Analysis

In [None]:
# NOTE: this definition reuses the name of the multivariate helper defined
# earlier in the file, so running this cell rebinds that name.
def perform_multivariate_analysis(df, threshold=0.75):
    """
    Perform correlation analysis on the numeric columns of a DataFrame.

    Shows a seaborn pair plot of the DataFrame, prints the correlation matrix
    of the numeric columns, lists the column pairs whose absolute correlation
    exceeds ``threshold`` (each pair once), and shows a heatmap of the matrix.
    If the DataFrame has no numeric columns, a message is printed and nothing
    else is done.

    :param df: pandas DataFrame
    :param threshold: absolute correlation above which a pair is reported
    :return: None
    """
    # Correlation is only computed on numeric data; selecting those columns
    # up front keeps corr() from raising on text columns.
    numeric_df = df.select_dtypes(include='number')
    if numeric_df.shape[1] == 0:
        print("No numeric columns available for correlation analysis.")
        return

    # Plot a pair plot for all numerical variables
    sns.pairplot(df)
    plt.show()

    # Calculate the correlation matrix
    corr = numeric_df.corr()

    # Print the correlation matrix
    print("\nCorrelation Matrix:")
    print(corr)

    # Find pairs of variables that have a high correlation. The matrix is
    # symmetric, so only pairs above the diagonal are scanned; this reports
    # (a, b) once instead of also listing (b, a).
    names = corr.columns.tolist()
    high_corr_pairs = [
        (col1, col2)
        for index, col1 in enumerate(names)
        for col2 in names[index + 1:]
        if abs(corr.loc[col1, col2]) > threshold
    ]
    print("\nPairs of variables with high correlation:")
    for pair in high_corr_pairs:
        print(pair)

    # Plot a heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(corr, annot=True)
    plt.show()

# Run the correlation analysis (the definition directly above, which shares
# its name with the earlier multivariate helper) on the loaded DataFrame
perform_multivariate_analysis(df)