In [7]:
#importing the required packages
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")   #optional to ignore the warnings 

In [24]:
#reading the data
#data = pd.read_csv(r"C:\Users\anjit\Documents\cars.csv")

**Problem Statement**

Exploratory Data Analysis, or EDA, is an important step in any Data Analysis or Data Science project. EDA is the process of investigating a dataset to discover patterns and anomalies (outliers) and to form hypotheses based on our understanding of the data. EDA involves generating summary statistics for the numerical data in the dataset and creating various graphical representations to understand the data better.

Although it is one of the most important steps, it is also a very lengthy process, and creating the plots takes a considerable amount of time. Automating the plotting of the graphs could save a lot of time in the EDA process and significantly reduce the effort required of a data scientist.

**Objective**

We intend to solve this problem by creating a function 'Graphs' that takes the dataframe, a list of required columns, and the directory in which to store the generated plots. The function separates the selected columns into two new dataframes: one with the numerical columns and one with the categorical columns.
For each numerical column, the function saves a histogram and a boxplot together as a single plot; for each categorical column, it saves a bar graph.

The Seaborn package is used to plot the histogram and boxplot for each numerical column; the bar graphs for the categorical columns are drawn with pandas' built-in plotting.


**Expectation**

The function should automate the process of plotting the graphs and save each resulting graph as a PNG file in the specified directory. We must get a boxplot and a histogram for every column with numeric values and a bar plot for every column with categorical values. The histogram and boxplot of a column should be saved in one PNG file as subplots. If the directory is not specified, the plots must be saved in the current working directory.

In [8]:
def Graphs(data, cols=None, directory=None, max_bars=20):
    """Save exploratory plots for the selected columns of ``data`` as PNG files.

    For every numerical column a single figure holding a boxplot (top) and a
    histogram (bottom) on a shared x-axis is saved. For every categorical
    column (``object`` or ``category`` dtype) a bar graph of value counts is
    saved. Each file is named ``<column name>.png``. Columns of any other
    dtype (for example booleans or datetimes) are skipped.

    Parameters
    ----------
    data : pandas.DataFrame
        The dataframe to explore.
    cols : list-like of column labels, str, or None, optional
        Columns to plot. ``None`` (the default) selects every column of
        ``data``. A single column name may be passed as a plain string.
    directory : str or path-like, optional
        Folder the PNG files are written to. ``None`` (the default) means the
        working directory at the time of the call. The folder is created if
        it does not exist.
    max_bars : int, optional
        Maximum number of categories shown in a bar graph (default 20). The
        most frequent categories are kept to avoid cluttered plots.

    Returns
    -------
    None
        The function is called for its side effect of writing image files.
    """
    # Identity test on purpose: `cols == None` is evaluated elementwise for a
    # pandas Index / numpy array and its truth value is ambiguous.
    if cols is None:
        cols = data.columns
    elif isinstance(cols, str):
        # A bare string would select a Series instead of a DataFrame below.
        cols = [cols]

    # Resolve the default at call time so a later os.chdir() is honoured.
    if directory is None:
        directory = os.getcwd()
    os.makedirs(directory, exist_ok=True)

    # Separate the selected columns into numerical and categorical dataframes.
    selected = data[list(cols)]
    df_numerics = selected.select_dtypes('number')
    df_objs = selected.select_dtypes(['object', 'category'])

    # Numerical columns: boxplot and histogram stacked in one figure.
    for col in df_numerics:
        label = str(col)
        values = df_numerics[col]
        # Two rows of equal height sharing the x-axis, so the box lines up
        # with the histogram underneath it.
        fig, (ax_box, ax_hist) = plt.subplots(
            2, sharex=True, gridspec_kw={"height_ratios": (.95, .95)}
        )
        try:
            # Pass the values as `x=` so the box is drawn horizontally along
            # the shared x-axis regardless of how seaborn treats positionals.
            sns.boxplot(x=values, ax=ax_box)
            ax_box.set_title('Boxplot and Histogram of %s' % label,
                             fontdict={'fontsize': 14, 'fontweight': 'bold'})

            sns.histplot(x=values, ax=ax_hist)
            ax_hist.set_xlabel(label, fontsize=12)
            ax_hist.set_ylabel("   Frequency   ", fontsize=12)

            # File name of the form <columnname>.png inside `directory`.
            fig.savefig(os.path.join(directory, label + ".png"), format="png")
        finally:
            # Always release the figure, even if plotting or saving failed.
            plt.close(fig)

    # Categorical columns: bar graph of the most frequent categories.
    for col in df_objs:
        label = str(col)
        # value_counts() is ordered most-frequent first; keep the top
        # `max_bars` categories, then order the bars ascending for display.
        counts = df_objs[col].value_counts().head(max_bars).sort_values()
        if counts.empty:
            # Column holds no non-missing values, so there is nothing to draw.
            continue

        fig, ax = plt.subplots(figsize=(8, 10))
        try:
            counts.plot(kind='bar', ax=ax, color="coral", fontsize=13)
            ax.set_xlabel(label, fontsize=12)
            ax.set_ylabel("%s Counts" % label, fontsize=12)
            ax.set_title("Barplot of %s" % label, fontsize=16)

            # File name of the form <columnname>.png inside `directory`.
            fig.savefig(os.path.join(directory, label + ".png"))
        finally:
            plt.close(fig)

In [36]:
#Example call of Graphs: pass the dataframe, optionally the list of required columns, and the directory in which to store the generated plots.
#A combined boxplot/histogram is saved for each numerical column and a bar graph for each categorical column.
#Graphs(data,directory = "C:\\Users\\anjit\\Documents\\graphs")