# Exploratory data analysis (EDA)
Exploratory data analysis is an approach to analyzing data sets in order to summarize their main characteristics, often using statistical graphics and other data visualization methods.

### Importing scripts folder


In [1]:
import os
import sys
# Put the sibling scripts folder on the import path so the local modules
# imported further down can be found. The path is resolved relative to the
# notebook's working directory.
SCRIPTS_DIR = os.path.abspath('../scripts')
# Guard the append so re-running this cell does not stack duplicate entries.
if SCRIPTS_DIR not in sys.path:
    sys.path.append(SCRIPTS_DIR)

### Loading the libraries used in the project


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
sns.set()

### Importing scripts & modules 

In [3]:
from helper import MyHelper
import utility
import plots

### Loading the data

In [6]:
# Use forward slashes only: they work on Windows, Linux and macOS, whereas a
# backslash separator resolves on Windows alone.
CSV_PATH = "../Data/test.csv"

In [4]:
# Instantiate the project helper once; its read_csv / save_csv methods are used
# below to load the raw data and to save the cleaned frame.
Helper = MyHelper()

In [7]:
# Read the CSV through the project helper.
# NOTE(review): values such as 'n/a', 'na' and 'undefined' are expected to be
# treated as missing, but that handling lives inside Helper.read_csv and is not
# visible in this notebook; confirm against the helper's implementation.

df = Helper.read_csv(CSV_PATH)

>> file read as csv


In [8]:
df.head()

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


# Exploring the data

### Getting the basic info about the dataset

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             41088 non-null  int64  
 1   Store          41088 non-null  int64  
 2   DayOfWeek      41088 non-null  int64  
 3   Date           41088 non-null  object 
 4   Open           41077 non-null  float64
 5   Promo          41088 non-null  int64  
 6   StateHoliday   41088 non-null  object 
 7   SchoolHoliday  41088 non-null  int64  
dtypes: float64(1), int64(5), object(2)
memory usage: 2.2+ MB


### Getting the basic description of the dataset

In [10]:
df.describe()

Unnamed: 0,Id,Store,DayOfWeek,Open,Promo,SchoolHoliday
count,41088.0,41088.0,41088.0,41077.0,41088.0,41088.0
mean,20544.5,555.899533,3.979167,0.854322,0.395833,0.443487
std,11861.228267,320.274496,2.015481,0.352787,0.489035,0.496802
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,10272.75,279.75,2.0,1.0,0.0,0.0
50%,20544.5,553.5,4.0,1.0,0.0,0.0
75%,30816.25,832.25,6.0,1.0,1.0,1.0
max,41088.0,1115.0,7.0,1.0,1.0,1.0


In [11]:
# Report the frame's dimensions (rows x columns).
n_rows, n_cols = df.shape
print(f" There are {n_rows} rows and {n_cols} columns")

 There are 41088 rows and 8 columns


### Checking for duplicates

In [12]:
# Boolean mask over rows: True marks a row that exactly repeats an earlier one.
# NOTE(review): df.duplicated().sum() would report the count directly instead
# of listing one flag per row.
df.duplicated()

0        False
1        False
2        False
3        False
4        False
         ...  
41083    False
41084    False
41085    False
41086    False
41087    False
Length: 41088, dtype: bool

In [13]:
# Drop exact duplicate rows, if any. drop_duplicates() returns a new frame and
# leaves the original untouched, so the result has to be assigned back.
df = df.drop_duplicates()
df

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0
...,...,...,...,...,...,...,...,...
41083,41084,1111,6,2015-08-01,1.0,0,0,0
41084,41085,1112,6,2015-08-01,1.0,0,0,0
41085,41086,1113,6,2015-08-01,1.0,0,0,0
41086,41087,1114,6,2015-08-01,1.0,0,0,0


### Checking the null counts

In [14]:
df.isnull().sum()

Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

In [17]:
def fix_missing_ffill(df, col):
    """Forward-fill missing values in one column of ``df``.

    Each missing entry takes the last non-missing value above it. Entries at
    the start of the column that have no earlier value remain missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; the column is updated in place.
    col : str
        Label of the column to fill.

    Returns
    -------
    pandas.Series
        The filled column, read back from ``df``.
    """
    # Series.ffill() is the supported spelling; fillna(method='ffill') has been
    # deprecated since pandas 2.1.
    df[col] = df[col].ffill()
    return df[col]
# Fill the 11 missing 'Open' values found above.
# NOTE(review): rows look ordered by date and then store, so a forward fill
# copies the previous row's value (likely a different store) — confirm this is
# the intended imputation.
df['Open'] = fix_missing_ffill(df, 'Open')

In [18]:
df.isnull().sum()


Id               0
Store            0
DayOfWeek        0
Date             0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

### Checking for missing values

In [19]:
# percent_missing reports the share of missing values in the dataset as a percentage.

utility.percent_missing(df)

The dataset contains 0.0 % missing values.


In [20]:
df.dtypes

Id                 int64
Store              int64
DayOfWeek          int64
Date              object
Open             float64
Promo              int64
StateHoliday      object
SchoolHoliday      int64
dtype: object

In [21]:
# Parse the text dates into datetime values and store them back in the column.
date_parsed = pd.to_datetime(df['Date'])
df['Date'] = date_parsed

In [22]:
def to_int(x):
    """Return ``x`` converted with the built-in ``int`` (floats are truncated toward zero)."""
    converted = int(x)
    return converted
# Cast the column in a single vectorized step instead of calling a Python
# function once per row. 'int64' is spelled out so the resulting dtype is the
# same on every platform.

df['Open'] = df['Open'].astype('int64')

In [23]:
df.dtypes

Id                        int64
Store                     int64
DayOfWeek                 int64
Date             datetime64[ns]
Open                      int64
Promo                     int64
StateHoliday             object
SchoolHoliday             int64
dtype: object

In [24]:
df.head(2)

Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1,1,0,0
1,2,3,4,2015-09-17,1,1,0,0


In [25]:
# Use forward slashes only: they work on Windows, Linux and macOS, whereas a
# backslash separator resolves on Windows alone.
CLEANED_CSV_PATH = "../new_data/testnew.csv"
# Write the cleaned frame to disk via the project helper.
# NOTE(review): the table rendered under this cell appears to be save_csv's
# return value — confirm against the helper's implementation.
Helper.save_csv(df, CLEANED_CSV_PATH)

>> File Successfully Saved.!!!


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1,1,0,0
1,2,3,4,2015-09-17,1,1,0,0
2,3,7,4,2015-09-17,1,1,0,0
3,4,8,4,2015-09-17,1,1,0,0
4,5,9,4,2015-09-17,1,1,0,0
...,...,...,...,...,...,...,...,...
41083,41084,1111,6,2015-08-01,1,0,0,0
41084,41085,1112,6,2015-08-01,1,0,0,0
41085,41086,1113,6,2015-08-01,1,0,0,0
41086,41087,1114,6,2015-08-01,1,0,0,0
