# Data Exploration

This part focuses on the process of initial Data Exploration and Data Cleaning

In [1]:
# Import Libraries
import pandas as pd

In [2]:
# Load the previously exported CSV file back into a DataFrame
df = pd.read_csv(filepath_or_buffer='Seblak_Tokped.csv')

In [3]:
# Print a summary of the DataFrame: number of rows, column names,
# non-null count per column and each column's dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      90 non-null     int64  
 1   Name            90 non-null     object 
 2   Price           90 non-null     object 
 3   Seller          90 non-null     object 
 4   Cities          90 non-null     object 
 5   Product sold    84 non-null     object 
 6   Product rating  82 non-null     float64
dtypes: float64(1), int64(1), object(5)
memory usage: 5.0+ KB


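Based on the summary above, the `Price` and `Product sold` columns are stored as `object` (text), so they must be converted to numeric types before they can be analysed. `Product rating` is already `float64`, but both `Product sold` and `Product rating` contain missing values that need to be handled.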

In [4]:
# Show the column labels of the DataFrame
df.columns

Index(['Unnamed: 0', 'Name', 'Price', 'Seller', 'Cities', 'Product sold',
       'Product rating'],
      dtype='object')

In [5]:
# Count the missing values in each column
df.isna().sum()

Unnamed: 0        0
Name              0
Price             0
Seller            0
Cities            0
Product sold      6
Product rating    8
dtype: int64

In [6]:
# Count rows that are exact duplicates of an earlier row (all columns compared)
# NOTE(review): 'Unnamed: 0' takes part in the comparison; if it is a unique row
# id, no row can ever be flagged as a duplicate — confirm this is intended
df.duplicated().sum()

0

# Data Cleaning

In [7]:
# Display the DataFrame as loaded, before any cleaning is applied
df

Unnamed: 0.1,Unnamed: 0,Name,Price,Seller,Cities,Product sold,Product rating
0,0,Gelifood Combo 3pcs Seblak Instan Kerupuk Mawa...,Rp39.000,Lidigeli,Kab. Garut,30+ terjual,4.6
1,1,Kylafood Seblak Cup,Rp11.000,Kylafood Jakarta,Jakarta Selatan,60+ terjual,4.9
2,2,Seblak Instan Ceu Nthien Khas Bandung Rasana N...,Rp15.000,Central Seblak Nusantara,Tangerang Selatan,1rb+ terjual,4.9
3,3,SEBLAK INSTAN MOMMY,Rp13.500,Rav Jaya Display,Kab. Tangerang,1rb+ terjual,4.8
4,4,Termurah Kerupuk Mentah-Kerupuk Seblak-Kerupuk...,Rp6.000,Djuragan Kerupuk,Surabaya,80+ terjual,5.0
...,...,...,...,...,...,...,...
85,254,"termurah cuanki lidah khas garut, toping baso ...",Rp20.500,sansay327,Tangerang Selatan,30+ terjual,5.0
86,285,kerupuk seblak pedas 1 kg / kerupuk seblak,Rp35.625,Adila_Putri_Snack,Kab.Ciamis,100+ terjual,4.8
87,287,cuanki lidah 50 pcs enak toping seblak baso aci,Rp12.999,CuankiCulametan,Kab. Garut,250+ terjual,4.9
88,288,PANGSIT SEBLAK BUMBU BASAH,Rp8.500,RIKI STORE 97,Kab. Bogor,9 terjual,5.0


In [8]:
# Strip the 'Rp' currency prefix from the Price strings
price_text = df['Price']
df['Price'] = price_text.str.replace('Rp', '')

# Strip the word 'terjual' from the Product sold strings
sold_text = df['Product sold']
df['Product sold'] = sold_text.str.replace('terjual', '')

In [9]:
# Normalise the abbreviated sales counts, e.g. '1rb+' -> '1000':
#   - 'rb' (thousand) is expanded to three zeros
#   - the '+' sign is dropped
# regex=False forces both patterns to be matched as literal text. This matters
# for '+', which is a regex quantifier: the default of `regex` differs between
# pandas versions, so leaving it implicit makes this cell version-dependent.
# NOTE(review): a fractional value such as '1,5rb' would not be expanded
# correctly by this rule — confirm no such values occur in the data.
df['Product sold'] = (
    df['Product sold']
    .str.replace('rb', '000', regex=False)
    .str.replace('+', '', regex=False)
)

In [10]:
# Turn literal 'None' strings in Product sold into proper missing values (pd.NA)
sold_column = df['Product sold']
df['Product sold'] = sold_column.replace(to_replace='None', value=pd.NA)

In [11]:
# Remove the space characters remaining in the Product sold strings
sold_column = df['Product sold']
df['Product sold'] = sold_column.str.replace(pat=' ', repl='')

In [12]:
# Treat missing sales counts as 0, then cast the cleaned strings to integers
sold_filled = df['Product sold'].fillna(0)
df['Product sold'] = sold_filled.astype(int)

In [13]:
# Replace missing ratings with 0 and make sure the column is float
# NOTE(review): a rating of 0 is indistinguishable from "not rated" and will
# lower any average computed later — confirm this is the intended treatment
rating_filled = df['Product rating'].fillna(0)
df['Product rating'] = rating_filled.astype(float)

In [14]:
# Display the DataFrame to check the result of the cleaning steps so far
df

Unnamed: 0.1,Unnamed: 0,Name,Price,Seller,Cities,Product sold,Product rating
0,0,Gelifood Combo 3pcs Seblak Instan Kerupuk Mawa...,39.000,Lidigeli,Kab. Garut,30,4.6
1,1,Kylafood Seblak Cup,11.000,Kylafood Jakarta,Jakarta Selatan,60,4.9
2,2,Seblak Instan Ceu Nthien Khas Bandung Rasana N...,15.000,Central Seblak Nusantara,Tangerang Selatan,1000,4.9
3,3,SEBLAK INSTAN MOMMY,13.500,Rav Jaya Display,Kab. Tangerang,1000,4.8
4,4,Termurah Kerupuk Mentah-Kerupuk Seblak-Kerupuk...,6.000,Djuragan Kerupuk,Surabaya,80,5.0
...,...,...,...,...,...,...,...
85,254,"termurah cuanki lidah khas garut, toping baso ...",20.500,sansay327,Tangerang Selatan,30,5.0
86,285,kerupuk seblak pedas 1 kg / kerupuk seblak,35.625,Adila_Putri_Snack,Kab.Ciamis,100,4.8
87,287,cuanki lidah 50 pcs enak toping seblak baso aci,12.999,CuankiCulametan,Kab. Garut,250,4.9
88,288,PANGSIT SEBLAK BUMBU BASAH,8.500,RIKI STORE 97,Kab. Bogor,9,5.0


In [15]:
# Convert Price from text to float.
# The listing prices use '.' as a thousands separator (e.g. 'Rp39.000' means
# 39000 rupiah), so the dots are removed before casting. Casting the raw text
# directly would read '39.000' as 39.0 and would raise on a value containing
# two separators such as '1.250.000'.
df['Price'] = df['Price'].str.replace('.', '', regex=False).astype(float)

In [16]:
# Re-check column dtypes and non-null counts after the type conversions
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Unnamed: 0      90 non-null     int64  
 1   Name            90 non-null     object 
 2   Price           90 non-null     float64
 3   Seller          90 non-null     object 
 4   Cities          90 non-null     object 
 5   Product sold    90 non-null     int32  
 6   Product rating  90 non-null     float64
dtypes: float64(2), int32(1), int64(1), object(3)
memory usage: 4.7+ KB


In [17]:
# Remove the leftover 'Unnamed: 0' column; the result is reassigned to df
# instead of mutating the frame in place
df = df.drop(columns=['Unnamed: 0'])

In [18]:
# Confirm the final set of columns and dtypes after dropping 'Unnamed: 0'
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            90 non-null     object 
 1   Price           90 non-null     float64
 2   Seller          90 non-null     object 
 3   Cities          90 non-null     object 
 4   Product sold    90 non-null     int32  
 5   Product rating  90 non-null     float64
dtypes: float64(2), int32(1), object(3)
memory usage: 4.0+ KB


In [19]:
# Export the cleaned DataFrame to CSV.
# index=False keeps the RangeIndex out of the file; otherwise it is written as
# an unnamed first column and comes back as 'Unnamed: 0' when the file is read
# again, which is the column this notebook had to drop from its own input.
df.to_csv('Seblak_tokped_clean.csv', index=False)