In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy.stats import chi2_contingency
from IPython.display import display
from itertools import combinations
from scipy.stats import kstest, spearmanr, pearsonr
warnings.filterwarnings("ignore")  # NOTE(review): silences every warning from here on, including deprecations
pd.set_option('display.max_columns', None) # show every column when a DataFrame is displayed


class AutoEDA:

    def __init__(self):
        self.colors = ["#2146B2", "#E0CA27", "#F8C895", "#D98162", "#F2EFEB", "#26261B"]

    def read_file(self, file_path):
        """
        Reads a file and returns a pandas DataFrame.
        
        Parameters:
        - file_path (str): The path to the file.
        
        Returns:
        - DataFrame: A pandas DataFrame.
        """
        # Determine the file extension
        file_extension = file_path.split('.')[-1].lower()
        
        # Read the file based on the file extension
        try:
            if file_extension == 'csv':
                return pd.read_csv(file_path, index_col=0)
            elif file_extension in ['xls', 'xlsx']:
                return pd.read_excel(file_path)
            elif file_extension == 'json':
                return pd.read_json(file_path)
            elif file_extension == 'pkl':
                return pd.read_pickle(file_path)
            else:
                raise ValueError("Reading this format is not yet implemented.")
        except Exception as e:
            return f"File reading failed, error: {e}."


    def explo_df(self, DataFrame, column=None):
        """
        Explores a DataFrame or a specific column and prints various statistics.
        
        Parameters:
        - DataFrame: The DataFrame to explore.
        - column (str or list, optional): The column or columns to explore. If None, explore the entire DataFrame.
        """
        if column is None:
            # General DataFrame exploration
            print("DataFrame Information:")
            display(DataFrame.info())
            print("\nFirst 10 rows of the DataFrame:")
            display(DataFrame.head(10))
            print("\nLast 10 rows of the DataFrame:")
            display(DataFrame.tail(10))
            print("\nStatistical description of the DataFrame (numeric):")
            display(DataFrame.describe().T)
            print("\nStatistical description of the DataFrame (categorical):")
            display(DataFrame.describe(include='object').T)
            print("\nCount of null values per column:")
            display(DataFrame.isnull().sum())
            print("\nPercentage of null values per column (only columns with nulls):")
            null_percentage = round(DataFrame.isnull().sum()/DataFrame.shape[0]*100, 2)
            display(null_percentage[null_percentage > 0])
            print("\nRows with all values as null:")
            all_null_rows = DataFrame[DataFrame.isnull().all(axis=1)]
            if not all_null_rows.empty:
                display(all_null_rows)
            else:
                print("There is no rows with all values as null.")
            print("\nCount of duplicate rows:")
            display(DataFrame.duplicated().sum())
        else:
            # Column(s) exploration
            if isinstance(column, str):
                column = [column]  # Convert to list if a single column is passed as a string
            for col in column:
                print(f"\nExploration of the column: {col}")
                if DataFrame[col].dtype in ['int64', 'float64']:
                    print("\nStatistical description (numeric):")
                    display(DataFrame[col].describe())
                else:
                    print("\nStatistical description (categorical):")
                    display(DataFrame[col].describe(include='object'))
                print(f"\nCount of null values: {DataFrame[col].isnull().sum()}")
                print(f"\nUnique values: {DataFrame[col].unique()}")
                print(f"\nValue Counts: {DataFrame[col].value_counts()}")
                print(f"\nCount of duplicates in the column: {DataFrame.duplicated(subset=[col]).sum()}")

    
    def __identify_linearity(self, dataframe, column_combinations_list):
        """
        Identifies if the relationships between pairs of variables in a DataFrame are linear or not.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the variables to be analyzed.

        column_combinations_list : list of tuples
            A list of tuples where each tuple contains two column names from the DataFrame to be analyzed.

        Returns:
        --------
        linear_relationships : list of tuples
            A list of tuples containing the names of the columns that have a linear relationship.

        non_linear_relationships : list of tuples
            A list of tuples containing the names of the columns that do not have a linear relationship.
        """
        linear_relationships = []
        non_linear_relationships = []

        for pair in column_combinations_list: 
            # Perform the normality test
            _, p_value1 = kstest(dataframe[pair[0]], "norm")
            _, p_value2 = kstest(dataframe[pair[1]], "norm")

            if p_value1 > 0.05 and p_value2 > 0.05:
                linear_relationships.append(pair)
            else:
                non_linear_relationships.append(pair)

        return linear_relationships, non_linear_relationships


    def numeric_correlations(self, dataframe):
        """
        Identifies correlations among numeric columns in the dataframe using Pearson or Spearman methods.

        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the variables to analyze.

        Returns:
        --------
        results : dict
            A dictionary containing the correlation DataFrames. The keys are 'pearson' and 'spearman'.
            If all relationships are either linear or non-linear, only one key will be present.
        """
        # Select numeric columns
        numerics = dataframe.select_dtypes(include=np.number).columns
        
        # Generate all possible combinations of numeric columns
        num_combinations = list(combinations(numerics, 2))
        
        # Identify if the relationships are linear or non-linear
        linear, non_linear = self.__identify_linearity(dataframe, num_combinations)
        
        # Initialize the results dictionary
        results = {}

        if linear:
            # Apply Pearson correlation for linear relationships
            linear_columns = set([item for sublist in linear for item in sublist])
            df_pearson = dataframe[list(linear_columns)].corr(method="pearson")
            results['pearson'] = df_pearson

        if non_linear:
            # Apply Spearman correlation for non-linear relationships
            non_linear_columns = set([item for sublist in non_linear for item in sublist])
            df_spearman = dataframe[list(non_linear_columns)].corr(method="spearman")
            results['spearman'] = df_spearman
        
        return results


    def classify_correlations(self, correlation_df):
        """
        Classify the correlations in the given DataFrame into weak, moderate, and strong correlations.
        
        Parameters:
        -----------
        correlation_df : pandas.DataFrame
            DataFrame containing the correlation values between pairs of variables.
        
        Returns:
        --------
        None
        """
        weak_correlations = []
        moderate_correlations = []
        strong_correlations = []

        # To avoid duplicates, use a set to register processed pairs
        processed_pairs = set()

        for row in correlation_df.index:
            for col in correlation_df.columns:
                if row != col and (col, row) not in processed_pairs:
                    corr_value = correlation_df.at[row, col]
                    processed_pairs.add((row, col))
                    processed_pairs.add((col, row))

                    if 0.1 <= abs(corr_value) < 0.3:
                        weak_correlations.append((row, col, corr_value))
                    elif 0.3 <= abs(corr_value) < 0.7:
                        moderate_correlations.append((row, col, corr_value))
                    elif abs(corr_value) >= 0.7:
                        strong_correlations.append((row, col, corr_value))

        # Print the results
        print("Weak Correlations:")
        for item in weak_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nModerate Correlations:")
        for item in moderate_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nStrong Correlations:")
        for item in strong_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        # Return None, as we're only printing the results
        return None
        # return weak_correlations, moderate_correlations, strong_correlations


    def identify_categorical_cols(self, df):
        return df.select_dtypes(include='O').columns
 
    def __cramers_v(self, confusion_matrix):
        chi2 = chi2_contingency(confusion_matrix)[0]
        n = confusion_matrix.sum().sum()
        r, k = confusion_matrix.shape
        return np.sqrt(chi2 / (n * (min(k, r) - 1)))

    def categorical_correlations(self, df, categorical_columns):
        correlations = []
        processed_pairs = set()  # Para rastrear los pares ya procesados

        
        for col1 in categorical_columns:
            for col2 in categorical_columns:
                if col1 != col2 and (col2, col1) not in processed_pairs:
                    confusion_matrix = pd.crosstab(df[col1], df[col2])
                    correlation = self.__cramers_v(confusion_matrix)
                    correlations.append((col1, col2, correlation))
                    processed_pairs.add((col1, col2))  # Añadir el par a los procesados

        weak_correlations = [item for item in correlations if 0.1 <= item[2] < 0.3]
        moderate_correlations = [item for item in correlations if 0.3 <= item[2] < 0.5]
        strong_correlations = [item for item in correlations if item[2] >= 0.5]
    
        # Print the results
        print("Weak Correlations:")
        for item in weak_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nModerate Correlations:")
        for item in moderate_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        print("\nStrong Correlations:")
        for item in strong_correlations:
            print(f"Between {item[0]} and {item[1]}: {item[2]:.2f}")

        # Return None, as we're only printing the results
        return None


    def plot_histogram(self, df, column, bins=10, title=None, xlabel=None, ylabel='Frequency'):
        """
        Plot a histogram for a given column in the DataFrame.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        column : str
            The column for which the histogram is to be plotted.
        bins : int, optional (default=10)
            Number of bins for the histogram.
        title : str, optional
            Title of the plot.
        xlabel : str, optional
            Label for the x-axis.
        ylabel : str, optional (default='Frequency')
            Label for the y-axis.
        """
        color = self.colors[0] # Choose a color from the color palette
        plt.figure(figsize=(8, 4))
        plt.hist(df[column].dropna(), bins=bins, edgecolor='k', color=color)
        plt.title(title if title else f'Histogram of {column}')
        plt.xlabel(xlabel if xlabel else column)
        plt.ylabel(ylabel)
        plt.show()


    def plot_scatter(self, df, x_column, y_column, title=None, xlabel=None, ylabel=None):
        """
        Plot a scatter plot for two given columns in the DataFrame.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        x_column : str
            The column for the x-axis.
        y_column : str
            The column for the y-axis.
        title : str, optional
            Title of the plot.
        xlabel : str, optional
            Label for the x-axis.
        ylabel : str, optional
            Label for the y-axis.
        """
        color = self.colors[1] # Choose a color from the color palette
        plt.figure(figsize=(8, 4))
        plt.scatter(df[x_column], df[y_column], color=color)
        plt.title(title if title else f'Scatter Plot of {x_column} vs {y_column}')
        plt.xlabel(xlabel if xlabel else x_column)
        plt.ylabel(ylabel if ylabel else y_column)
        plt.show()


    def plot_boxplot(self, df, column1, column2=None, title=None, xlabel=None, ylabel='Value'):
        """
        Plot a boxplot for one column or a comparison of two columns in the DataFrame.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data.
        column1 : str
            The primary column for which the boxplot is to be plotted.
        column2 : str, optional
            The secondary column to compare with the primary column (for grouped boxplot).
        title : str, optional
            Title of the plot.
        xlabel : str, optional
            Label for the x-axis.
        ylabel : str, optional (default='Value')
            Label for the y-axis.
        """
        if column2 is None:
            # Single column boxplot
            plt.figure(figsize=(8, 4))
            sns.boxplot(y=df[column1], color=self.colors[3])
            plt.title(title if title else f'Boxplot of {column1}')
            plt.ylabel(ylabel)
            plt.show()
        else:
            # Comparison of two columns
            plt.figure(figsize=(10, 6))
            sns.boxplot(x=df[column2], y=df[column1], palette=self.colors)
            plt.title(title if title else f'Boxplot of {column1} by {column2}')
            plt.xlabel(column2)
            plt.ylabel(ylabel)
            plt.show()


    def visualize_pairplot(self, dataframe, columns, hue=None, height=5):
        """
        Visualize pair plots for selected columns of the DataFrame.
        
        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the data to be visualized.
        
        columns : list of str
            List of column names to be included in the pair plot.
        
        hue : str, optional, default: None
            Column name to be used for color encoding.
        
        height : int, optional, default: 5
            Height of each facet in inches.
        
        Returns:
        --------
        None
        """
        if not all(col in dataframe.columns for col in columns):
            raise ValueError("One or more columns are not in the DataFrame.")
        
        # Create the pairplot with the specified hue and color palette
        pairplot = sns.pairplot(dataframe[columns + [hue]] if hue else dataframe[columns], 
                               hue=hue, palette=self.colors, height=height)

        plt.tight_layout()
        plt.show()

    def visualize_categorical_counts(self, dataframe, categorical_cols):
        """
        Visualize count plots for categorical columns in the DataFrame.
        
        Parameters:
        -----------
        dataframe : pandas.DataFrame
            The DataFrame containing the data to be visualized.
        
        categorical_cols : list of str
            List of categorical column names to be included in the count plots.
        
        Returns:
        --------
        None
        """
        num_cols = len(categorical_cols)
        num_rows = (num_cols + 1) // 2  # Calculate number of rows needed for subplots
        fig, ax = plt.subplots(num_rows, 2, figsize=(10, num_rows * 4))
        fig.subplots_adjust(hspace=0.5)
        
        # Flatten the axes array for easier indexing
        if num_rows > 1:
            ax = ax.flatten()
        else:
            ax = [ax]
        
        def count_plotter(ax, col, data, colors):
            counted = data[col].value_counts()
            palette = colors[:len(counted)]  # Use only as many colors as there are categories
            sns.barplot(ax=ax, x=counted.index, y=counted.values, width=0.9, palette=palette)
            ax.set_title(f"{col} count graph")
            if col in ['JobRole', 'EducationField']:
                ax.set_xticklabels(labels=counted.index, rotation=90, fontsize=6)
            else:
                ax.set_xticklabels(labels=counted.index, fontsize=8)

        for i, category in enumerate(categorical_cols):
            if i < len(ax):
                count_plotter(ax[i], category, data=dataframe, colors=self.colors)
            else:
                # Hide unused subplots
                ax[i].axis('off')

        plt.show()

    def visualize_facet_grid(self, df, col_names, x_values):
        """
        Create FacetGrid plots for given categorical columns and numerical values.

        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data to be visualized.
        
        col_names : list of str
            List of categorical column names to be used for facets.
        
        x_values : list of str
            List of numerical column names to be plotted on the x-axis.
        
        hue : str, optional, default: 'Gender'
            Column name to be used for color encoding.
        
        Returns:
        --------
        None
        """
        if len(col_names) != len(x_values):
            raise ValueError("Length of col_names and x_values must be the same.")
        
        for col_name, x_value in zip(col_names, x_values):
            facet = sns.FacetGrid(df, col=col_name, hue=hue, aspect=1, palette=self.colors, col_wrap=3)
            facet.map(sns.kdeplot, x_value, fill=True)
            facet.set(xlim=[0, df[x_value].max()])
            facet.add_legend(label_order=df[hue].unique())
            plt.tight_layout()
            plt.show()

    def visualize_general_statistics(self, df, df_heatmap1, df_heatmap2):
        """
        Create a series of general statistics visualizations using subplots.
        
        Parameters:
        -----------
        df : pandas.DataFrame
            The DataFrame containing the data to be visualized.
        
        df_heatmap1 : pandas.DataFrame
            DataFrame for the first heatmap.
        
        df_heatmap2 : pandas.DataFrame
            DataFrame for the second heatmap.
        
        Returns:
        --------
        None
        """
        fig, ax = plt.subplots(3, 2, figsize=(15, 13))
        fig.suptitle('General Statistics')
        fig.subplots_adjust(wspace=0.4, hspace=0.5)
        
        # Boxplot for Department vs. Total Working Years
        sns.boxplot(ax=ax[0, 0], data=df, y='department', x='total_working_years', hue='gender', palette=self.colors[:2])
        ax[0, 0].set_title('Ages by Department', fontsize=14)
        
        # Boxplot for Education Field vs. Age
        sns.boxplot(ax=ax[0, 1], data=df, y='education_field', x='age', hue='gender', palette=self.colors[:2])
        ax[0, 1].set_title('Ages by Education Field', fontsize=14)
        
        # Heatmap for Job Role-Satisfaction Mapping
        sns.heatmap(ax=ax[1, 0], data=df_heatmap1, square=True, linewidth=1, cmap='Reds')
        ax[1, 0].set_title('Job Role-Satisfaction Mapping', fontsize=14)
        
        # Heatmap for Job Level-Involvement Mapping
        sns.heatmap(ax=ax[1, 1], data=df_heatmap2, square=True, linewidth=1, cmap='Blues')
        ax[1, 1].set_title('Job Level-Involvement Mapping', fontsize=14)
        
        # Histogram for Distribution of Salary Percent Hike
        sns.histplot(ax=ax[2, 0], data=df, x='percent_salary_hike', hue='gender', multiple='stack', palette=self.colors)
        ax[2, 0].set_title('Distribution of Salary Percent Hike', fontsize=14)
        
        # Histogram for Distribution of Years at Company
        sns.histplot(ax=ax[2, 1], data=df, x='years_at_company', hue='gender', multiple='stack', palette=self.colors)
        ax[2, 1].set_title('Distribution of Years at Company', fontsize=14)
        
        plt.show()


    def pieplot(self, data, columns, titles=None, explode_ratio=0.05):
        """
        Generates pie plots for specified columns in the dataframe and arranges them in a single row.

        Parameters:
        - data: pd.DataFrame, the dataframe containing the data.
        - columns: list of str, the names of the columns to plot.
        - titles: list of str, optional, titles for each pie plot.
        - explode_ratio: float, optional, the fraction by which to offset each wedge.

        Returns:
        - None, displays the pie plots.
        """
        def categorize_value(val):
            if val == 1:
                return 'Bajo'
            elif val == 2:
                return 'Medio Bajo'
            elif val == 3:
                return 'Medio Alto'
            elif val == 4:
                return 'Alto'

        # self.colors = ["#2146B2", "#E0CA27", "#F8C895", "#D98162", "#F2EFEB", "#26261B"]


        # Define a consistent color mapping for the categories
        color_mapping = {
            'Bajo': '#2146B2',        
            'Medio Bajo': '#E0CA27', 
            'Medio Alto': '#F8C895', 
            'Alto': '#D98162'      
        }

        num_columns = len(columns)
        
        if titles is None:
            titles = [None] * num_columns
        
        if num_columns == 0:
            raise ValueError("The 'columns' list must contain at least one column.")
        
        # Determine layout for subplots
        fig, axs = plt.subplots(1, num_columns, figsize=(num_columns * 5, 5))
        
        # Handle the case where there is only one column
        if num_columns == 1:
            axs = [axs]
        
        for ax, col, title in zip(axs, columns, titles):
            # Apply the categorization function to convert numeric values to categories
            categorized_data = data[col].apply(categorize_value).value_counts()
            
            # Create the explode configuration
            explode = [explode_ratio] * len(categorized_data)
            
            # Generate the pie plot
            wedges, texts, autotexts = ax.pie(
                categorized_data,
                labels=categorized_data.index,
                autopct='%1.1f%%',
                startangle=140,
                colors=[color_mapping[label] for label in categorized_data.index],
                explode=explode,
                shadow=True
            )
            
            # Set title if provided
            if title:
                ax.set_title(title, fontsize=16)
        
        plt.tight_layout()
        plt.show()


    def boxplot_distribution(self, data, category_column, value_column, title=None):    
        """
        Generates a boxplot to show the distribution of a numerical value across different categories.

        Parameters:
        - data: pd.DataFrame, the dataframe containing the data.
        - category_column: str, the name of the categorical column (e.g., products).
        - value_column: str, the name of the numerical column (e.g., price).
        - title: str, optional, the title of the plot.

        Returns:
        - None, displays the boxplot.
        """
        plt.figure(figsize=(12, 6))
        
        # Create the boxplot using seaborn
        sns.boxplot(x=data[category_column], y=data[value_column], palette=self.colors)
        
        # Add title if provided
        if title:
            plt.title(title, fontsize=16)
        
        # Add labels
        plt.xlabel(category_column.capitalize())
        plt.ylabel(value_column.capitalize())
        
        plt.xticks(rotation=45)  # Rotate x-axis labels if needed for better readability
        plt.tight_layout()
        plt.show()











In [4]:
# Load the restaurants CSV; the relative path is resolved against the current working directory
df_eda = pd.read_csv("tripadvisor_european_restaurants.csv")


Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,...,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
0,g10001637-d10002227,Le 147,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Ha...",France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,"10 Maison Neuve, 87510 Saint-Jouvent France",45.961674,1.169131,...,2.0,0.0,0.0,0.0,0.0,4.0,4.5,4.0,,
1,g10001637-d14975787,Le Saint Jouvent,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Ha...",France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,"16 Place de l Eglise, 87510 Saint-Jouvent France",45.95704,1.20548,...,2.0,2.0,1.0,0.0,0.0,,,,,
2,g10002858-d4586832,Au Bout du Pont,"[""Europe"", ""France"", ""Centre-Val de Loire"", ""B...",France,Centre-Val de Loire,Berry,Rivarennes,"2 rue des Dames, 36800 Rivarennes France",46.635895,1.386133,...,3.0,1.0,0.0,0.0,0.0,,,,,
3,g10002986-d3510044,Le Relais de Naiade,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Co...",France,Nouvelle-Aquitaine,Correze,Lacelle,"9 avenue Porte de la Correze 19170, 19170 Lace...",45.64261,1.82446,...,1.0,0.0,0.0,0.0,0.0,4.5,4.5,4.5,,
4,g10022428-d9767191,Relais Du MontSeigne,"[""Europe"", ""France"", ""Occitanie"", ""Aveyron"", ""...",France,Occitanie,Aveyron,Saint-Laurent-de-Levezou,"route du Montseigne, 12620 Saint-Laurent-de-Le...",44.20886,2.96047,...,4.0,7.0,0.0,0.0,0.0,4.5,4.5,4.5,,


In [6]:
# Preview the first five rows of the loaded data
df_eda.head(5)

Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
0,g10001637-d10002227,Le 147,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Ha...",France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,"10 Maison Neuve, 87510 Saint-Jouvent France",45.961674,1.169131,Claimed,,#1 of 2 Restaurants in Saint-Jouvent,#1 of 2 places to eat in Saint-Jouvent,"Cheap Eats, French",€,,"Lunch, Dinner",French,,"Reservations, Seating, Wheelchair Accessible, ...",N,N,N,,,,,4.0,36.0,English,2.0,2.0,0.0,0.0,0.0,0.0,4.0,4.5,4.0,,
1,g10001637-d14975787,Le Saint Jouvent,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Ha...",France,Nouvelle-Aquitaine,Haute-Vienne,Saint-Jouvent,"16 Place de l Eglise, 87510 Saint-Jouvent France",45.95704,1.20548,Unclaimed,,#2 of 2 Restaurants in Saint-Jouvent,#2 of 2 places to eat in Saint-Jouvent,Cheap Eats,€,,,,,,N,N,N,,,,,4.0,5.0,All languages,5.0,2.0,2.0,1.0,0.0,0.0,,,,,
2,g10002858-d4586832,Au Bout du Pont,"[""Europe"", ""France"", ""Centre-Val de Loire"", ""B...",France,Centre-Val de Loire,Berry,Rivarennes,"2 rue des Dames, 36800 Rivarennes France",46.635895,1.386133,Claimed,,#1 of 1 Restaurant in Rivarennes,#1 of 1 places to eat in Rivarennes,"Cheap Eats, French, European",€,,"Dinner, Lunch, Drinks","French, European",,"Reservations, Seating, Table Service, Wheelcha...",N,N,N,,,,,5.0,13.0,English,4.0,3.0,1.0,0.0,0.0,0.0,,,,,
3,g10002986-d3510044,Le Relais de Naiade,"[""Europe"", ""France"", ""Nouvelle-Aquitaine"", ""Co...",France,Nouvelle-Aquitaine,Correze,Lacelle,"9 avenue Porte de la Correze 19170, 19170 Lace...",45.64261,1.82446,Claimed,,#1 of 1 Restaurant in Lacelle,#1 of 1 places to eat in Lacelle,"Cheap Eats, French",€,,"Lunch, Dinner",French,,"Reservations, Seating, Serves Alcohol, Table S...",N,N,N,,,,,4.0,34.0,English,1.0,1.0,0.0,0.0,0.0,0.0,4.5,4.5,4.5,,
4,g10022428-d9767191,Relais Du MontSeigne,"[""Europe"", ""France"", ""Occitanie"", ""Aveyron"", ""...",France,Occitanie,Aveyron,Saint-Laurent-de-Levezou,"route du Montseigne, 12620 Saint-Laurent-de-Le...",44.20886,2.96047,Unclaimed,,#1 of 1 Restaurant in Saint-Laurent-de-Levezou,#1 of 1 places to eat in Saint-Laurent-de-Levezou,"Mid-range, French",€€-€€€,,"Lunch, Dinner",French,,"Reservations, Seating, Wheelchair Accessible, ...",N,N,N,,,,,4.5,11.0,All languages,11.0,4.0,7.0,0.0,0.0,0.0,4.5,4.5,4.5,,


In [11]:
# Keep only the rows whose country is "Spain"; the original row labels are kept
spain = df_eda[df_eda["country"] == "Spain"]
# (rows, columns) of the Spanish subset
spain.shape

(157479, 42)

In [14]:
# NOTE(review): `df` holds the AutoEDA helper object, not a DataFrame
df = AutoEDA()

In [15]:
# Whole-frame exploration report (no `column` argument) for the Spanish subset
df.explo_df(spain)

DataFrame Information:
<class 'pandas.core.frame.DataFrame'>
Index: 157479 entries, 320900 to 478378
Data columns (total 42 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   restaurant_link                    157479 non-null  object 
 1   restaurant_name                    157479 non-null  object 
 2   original_location                  157479 non-null  object 
 3   country                            157479 non-null  object 
 4   region                             157477 non-null  object 
 5   province                           127909 non-null  object 
 6   city                               54595 non-null   object 
 7   address                            157479 non-null  object 
 8   latitude                           155116 non-null  float64
 9   longitude                          155116 non-null  float64
 10  claimed                            157193 non-null  object 
 11  awards          

None


First 10 rows of the DataFrame:


Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
320900,g10021880-d13763192,Taberna La Sacristia,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 19, 41870 Aznalcollar Spain",37.51928,-6.26885,Unclaimed,,#4 of 5 Restaurants in Aznalcollar,#4 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,3.0,1.0,English,1.0,0.0,0.0,1.0,0.0,0.0,,,,,
320901,g10021880-d15758746,Tasca el Capricho,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 9, 41870 Aznalcollar S...",37.52065,-6.26822,Unclaimed,,#3 of 5 Restaurants in Aznalcollar,#3 of 6 places to eat in Aznalcollar,Spanish,,,,Spanish,,,N,N,N,,,,,5.0,2.0,All languages,2.0,2.0,0.0,0.0,0.0,0.0,,,,,
320902,g10021880-d19332558,Bar Las Adelfas,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle Perdon N° 23 Capilla de La Cruz, 41870 A...",37.52428,-6.27144,Claimed,,#5 of 5 Restaurants in Aznalcollar,#5 of 6 places to eat in Aznalcollar,"Mediterranean, Spanish, Grill, Diner",,,"Breakfast, Lunch, Dinner, Brunch, Drinks","Mediterranean, Spanish, Grill, Diner, Dining bars",,,N,N,N,"{""Mon"": [], ""Tue"": [""19:30-23:45""], ""Wed"": [""1...",6.0,51.0,6.0,3.0,2.0,All languages,2.0,1.0,0.0,0.0,0.0,1.0,,,,,
320903,g10021880-d19468788,El Rincon nº 7,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del Concejo 60, 41870 Aznalcollar Spain",37.51714,-6.2686,Claimed,,#1 of 5 Restaurants in Aznalcollar,#1 of 6 places to eat in Aznalcollar,"Mid-range, Steakhouse, Cafe, Spanish",€€-€€€,€2-€18,"Lunch, Dinner, Drinks","Steakhouse, Cafe, Dining bars, Spanish",,,N,N,N,"{""Mon"": [], ""Tue"": [], ""Wed"": [], ""Thu"": [""20:...",4.0,23.0,7.0,5.0,18.0,All languages,18.0,17.0,1.0,0.0,0.0,0.0,,,,,
320904,g10021880-d19847377,Nuevo jacaranda,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,"Calle del 28 de Febrero 2, 41870 Aznalcollar S...",37.52088,-6.26844,Claimed,,,,,,,,,,Reservations,N,N,N,,,,,,,,,,,,,,,,,,
320905,g10021880-d23180773,Bar Restaurante El Mena,"[""Europe"", ""Spain"", ""Andalucia"", ""Province of ...",Spain,Andalucia,Province of Seville,,Calle de Guillermo Gutierrez Vidal 2 Frente A ...,37.522625,-6.268723,Claimed,,#2 of 5 Restaurants in Aznalcollar,#2 of 6 places to eat in Aznalcollar,"Mid-range, Cafe, Mediterranean, Spanish",€€-€€€,€8-€20,,"Dining bars, Cafe, Mediterranean, Spanish, Hea...",,,N,N,N,,,,,5.0,7.0,All languages,7.0,6.0,1.0,0.0,0.0,0.0,,,,,
320906,g10052043-d10454176,Tu Mateix Bar,"[""Europe"", ""Spain"", ""Catalonia"", ""Province of ...",Spain,Catalonia,Province of Barcelona,,"Calle Conca 21 Tda 1, 08026 Ullastrell Spain",41.41357,2.18,Unclaimed,,,,"Cheap Eats, Bar, Pizza, Cafe",€,,,"Bar, Pizza, Cafe, Fast food, Pub",,,N,N,N,,,,,,,,,,,,,,,,,,
320907,g10052043-d12414243,Casal M&M,"[""Europe"", ""Spain"", ""Catalonia"", ""Province of ...",Spain,Catalonia,Province of Barcelona,,"c/josep Fornells s/n Casal Del Poble, 08231 Ul...",41.52691,1.95851,Unclaimed,,#2 of 3 Restaurants in Ullastrell,#2 of 5 places to eat in Ullastrell,,,,"Breakfast, Lunch, Dinner",,,,N,N,N,"{""Mon"": [""08:00-20:00""], ""Tue"": [""08:00-23:00""...",7.0,105.0,7.0,5.0,4.0,All languages,4.0,3.0,1.0,0.0,0.0,0.0,,,,,
320908,g10052043-d23131396,La Panxa Del Bou,"[""Europe"", ""Spain"", ""Catalonia"", ""Province of ...",Spain,Catalonia,Province of Barcelona,,"Arquitecte Alsius 1, 08231 Ullastrell Spain",41.525303,1.956932,Unclaimed,,#3 of 3 Restaurants in Ullastrell,#3 of 5 places to eat in Ullastrell,"Mid-range, Steakhouse, Mediterranean, Barbecue",€€-€€€,€13-€50,"Breakfast, Lunch","Steakhouse, Mediterranean, Barbecue, Catalan",,,N,N,N,"{""Mon"": [""12:00-17:00""], ""Tue"": [""13:00-17:00""...",6.0,41.0,6.0,3.5,9.0,All languages,9.0,5.0,0.0,0.0,1.0,3.0,,,,,
320909,g10052043-d23236236,Casal D'ullastrell,"[""Europe"", ""Spain"", ""Catalonia"", ""Province of ...",Spain,Catalonia,Province of Barcelona,,"Carretera D'olesa, S/n, 08231 Ullastrell Spain",41.52775,1.958487,Unclaimed,,,,"Cheap Eats, Fast food, Mediterranean",€,,,"Fast food, Mediterranean",,,N,N,N,,,,,,,,,,,,,,,,,,



Last 10 rows of the DataFrame:


Unnamed: 0,restaurant_link,restaurant_name,original_location,country,region,province,city,address,latitude,longitude,claimed,awards,popularity_detailed,popularity_generic,top_tags,price_level,price_range,meals,cuisines,special_diets,features,vegetarian_friendly,vegan_options,gluten_free,original_open_hours,open_days_per_week,open_hours_per_week,working_shifts_per_week,avg_rating,total_reviews_count,default_language,reviews_count_in_default_language,excellent,very_good,average,poor,terrible,food,service,value,atmosphere,keywords
478369,g9862963-d14107896,Pizzeria Donatos,"[""Europe"", ""Spain"", ""Balearic Islands"", ""Major...",Spain,Balearic Islands,Majorca,Son Caliu,"Calle Jardiel Poncela s/n No Local 7, 07181 So...",39.52738,2.545634,Unclaimed,,,,"Mid-range, Italian, Mediterranean",€€-€€€,€5-€14,,"Italian, Mediterranean",,,N,N,N,,,,,,0.0,,,,,,,,,,,,
478370,g9862963-d15050503,Hang Zhou,"[""Europe"", ""Spain"", ""Balearic Islands"", ""Major...",Spain,Balearic Islands,Majorca,Son Caliu,"C. de Jardiel Poncela, 7 local 14, 07181 Son C...",,,Unclaimed,,#6 of 6 Restaurants in Son Caliu,#6 of 7 places to eat in Son Caliu,,,,"Lunch, Dinner",,,,N,N,N,"{""Mon"": [""12:00-23:30""], ""Tue"": [""18:00-23:30""...",6.0,57.0,6.0,5.0,3.0,English,3.0,3.0,0.0,0.0,0.0,0.0,,,,,
478371,g9862963-d17704414,Trattoria pizzeria Da Bruno,"[""Europe"", ""Spain"", ""Balearic Islands"", ""Major...",Spain,Balearic Islands,Majorca,Son Caliu,"Carrer de Roses Bermejo, 24 Local 1, 07181 Son...",39.52691,2.540814,Claimed,,#4 of 6 Restaurants in Son Caliu,#4 of 7 places to eat in Son Caliu,"Mid-range, Italian, Bar, Cafe",€€-€€€,€10-€50,"Dinner, Drinks","Italian, Bar, Cafe, Pub",,,N,N,N,"{""Mon"": [], ""Tue"": [""11:00-17:00"", ""19:00-22:3...",6.0,53.5,11.0,5.0,9.0,English,7.0,7.0,0.0,0.0,0.0,0.0,,,,,
478372,g9862963-d8426564,Archies Bar,"[""Europe"", ""Spain"", ""Balearic Islands"", ""Major...",Spain,Balearic Islands,Majorca,Son Caliu,"Calle Jardiel Poncela 7, 07181 Son Caliu, Calv...",39.52739,2.54574,Claimed,,#2 of 6 Restaurants in Son Caliu,#2 of 7 places to eat in Son Caliu,"Cheap Eats, Bar, British",€,€3-€10,"Drinks, Breakfast, Lunch, Dinner","Bar, British",,,N,N,N,"{""Mon"": [""10:00-01:00""], ""Tue"": [""10:00-01:00""...",7.0,105.0,7.0,4.5,27.0,English,25.0,19.0,3.0,1.0,0.0,2.0,5.0,4.5,4.5,,
478373,g9871604-d11777202,El Caserio de Tion,"[""Europe"", ""Spain"", ""Galicia"", ""Province of A ...",Spain,Galicia,Province of A Coruna,,"Lugar Os Chas 5, 15316, Coiros Spain",43.256844,-8.168913,Unclaimed,,#2 of 4 Restaurants in Coiros,#2 of 5 places to eat in Coiros,"Mid-range, Spanish, Vegan Options",€€-€€€,,"Dinner, Lunch",Spanish,Vegan Options,,N,Y,N,,,,,4.0,45.0,English,1.0,1.0,0.0,0.0,0.0,0.0,4.5,4.0,4.0,,
478374,g9871604-d13931661,Mesón La Parrillada,"[""Europe"", ""Spain"", ""Galicia"", ""Province of A ...",Spain,Galicia,Province of A Coruna,,"Salida de la autovia, Coiros Spain",,,Unclaimed,,#4 of 4 Restaurants in Coiros,#4 of 5 places to eat in Coiros,Spanish,,,,Spanish,,Reservations,N,N,N,,,,,3.5,9.0,All languages,9.0,5.0,1.0,0.0,1.0,2.0,,,,,
478375,g9871604-d8738469,Restaurante La Paz,"[""Europe"", ""Spain"", ""Galicia"", ""Province of A ...",Spain,Galicia,Province of A Coruna,,"Lg. Espenuca, 3, 15316, Coiros Spain",43.26089,-8.156289,Unclaimed,,#3 of 4 Restaurants in Coiros,#3 of 5 places to eat in Coiros,"Mid-range, Spanish",€€-€€€,€10-€100,"Lunch, Dinner",Spanish,,,N,N,N,,,,,4.0,33.0,All languages,33.0,12.0,11.0,4.0,2.0,4.0,4.0,4.0,4.0,,
478376,g9871604-d9812919,Parrillada Barral,"[""Europe"", ""Spain"", ""Galicia"", ""Province of A ...",Spain,Galicia,Province of A Coruna,,"Rua Santa Maria De Ois, S/n Parada N 9, 15316,...",43.234463,-8.13047,Claimed,"Travellers' Choice, Certificate of Excellence ...",#1 of 4 Restaurants in Coiros,#1 of 5 places to eat in Coiros,"Mid-range, Steakhouse, Barbecue, Spanish",€€-€€€,€12-€20,,"Steakhouse, Barbecue, Spanish",Gluten Free Options,,N,N,Y,"{""Mon"": [""07:00-23:59""], ""Tue"": [""07:00-23:59""...",6.0,100.4,6.0,4.0,77.0,All languages,77.0,41.0,19.0,7.0,3.0,7.0,4.5,4.0,4.0,,
478377,g9871607-d20375801,BP,"[""Europe"", ""Spain"", ""Galicia"", ""Province of A ...",Spain,Galicia,Province of A Coruna,,"Carretera Nacional Vi Km. 567, 15316 Coiros de...",43.24839,-8.161417,Unclaimed,,,,,,,"Breakfast, Lunch, Dinner",,,,N,N,N,"{""Mon"": [""07:00-23:00""], ""Tue"": [""07:00-23:00""...",7.0,112.0,7.0,,0.0,,,,,,,,,,,,
478378,g9984094-d15652066,Restaurante Fito Mar,"[""Europe"", ""Spain"", ""Asturias"", ""Caravia Munic...",Spain,Asturias,Caravia Municipality,Playa de La Espasa,"Calle Espasa, La Bajo, 33343 Playa de La Espas...",43.47393,-5.211904,Unclaimed,,#1 of 1 Restaurant in Playa de La Espasa,#1 of 1 places to eat in Playa de La Espasa,Spanish,,,"Dinner, Lunch",Spanish,,Table Service,N,N,N,"{""Mon"": [""10:00-23:59""], ""Tue"": [""10:00-23:59""...",7.0,96.383333,7.0,3.0,17.0,All languages,17.0,2.0,8.0,2.0,2.0,3.0,,,,,



Statistical description of the DataFrame (numeric):


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
latitude,155116.0,39.175707,3.715078,27.64031,37.60078,40.299978,41.50419,43.82233
longitude,155116.0,-3.183414,4.797339,-18.105625,-5.159328,-3.529207,0.12014,5.9956
open_days_per_week,75730.0,6.297108,0.995965,1.0,6.0,7.0,7.0,7.0
open_hours_per_week,75730.0,66.489934,31.677816,0.0,42.0,64.5,90.0,168.0
working_shifts_per_week,75730.0,7.851169,2.755509,1.0,6.0,7.0,9.0,15.0
avg_rating,142668.0,3.959886,0.758236,1.0,3.5,4.0,4.5,5.0
total_reviews_count,148611.0,98.696227,269.714383,0.0,6.0,24.0,88.0,33731.0
reviews_count_in_default_language,142988.0,29.361646,104.581429,1.0,2.0,6.0,20.0,8337.0
excellent,142988.0,16.810711,67.755788,0.0,1.0,3.0,9.0,4790.0
very_good,142988.0,6.865003,24.316725,0.0,0.0,1.0,5.0,2377.0



Statistical description of the DataFrame (categorical):


Unnamed: 0,count,unique,top,freq
restaurant_link,157479,157479,g10021880-d13763192,1
restaurant_name,157479,134666,Burger King,636
original_location,157479,6702,"[""Europe"", ""Spain"", ""Community of Madrid"", ""Ma...",12134
country,157479,1,Spain,157479
region,157477,19,Andalucia,29562
province,127909,135,Province of Barcelona,18952
city,54595,1732,Madrid,12134
address,157479,152311,Barcelona Spain,17
claimed,157193,2,Unclaimed,88094
awards,35870,510,"Travellers' Choice, Certificate of Excellence ...",3123



Count of null values per column:


restaurant_link                           0
restaurant_name                           0
original_location                         0
country                                   0
region                                    2
province                              29570
city                                 102884
address                                   0
latitude                               2363
longitude                              2363
claimed                                 286
awards                               121609
popularity_detailed                   14501
popularity_generic                    14842
top_tags                              15633
price_level                           40671
price_range                          109167
meals                                 70765
cuisines                              22514
special_diets                        115423
features                             106563
vegetarian_friendly                       0
vegan_options                   


Percentage of null values per column (only columns with nulls):


province                             18.78
city                                 65.33
latitude                              1.50
longitude                             1.50
claimed                               0.18
awards                               77.22
popularity_detailed                   9.21
popularity_generic                    9.42
top_tags                              9.93
price_level                          25.83
price_range                          69.32
meals                                44.94
cuisines                             14.30
special_diets                        73.29
features                             67.67
original_open_hours                  51.91
open_days_per_week                   51.91
open_hours_per_week                  51.91
working_shifts_per_week              51.91
avg_rating                            9.41
total_reviews_count                   5.63
default_language                      9.20
reviews_count_in_default_language     9.20
excellent  


Rows with all values as null:
There is no rows with all values as null.

Count of duplicate rows:


0

In [18]:
# Sanity check: count rows of `spain` that repeat an earlier row across every
# column (pandas' default `duplicated()` keeps the first occurrence unflagged).
spain.duplicated().sum()

0

In [17]:
# Persist `spain` as spain.csv (path is relative to the working directory).
# The index is written as the first column (pandas default), which matches
# AutoEDA.read_file, since that loader reads CSV files with index_col=0.
spain.to_csv("spain.csv")