<a href="https://colab.research.google.com/github/ChitraChaudhari/GC_DataEngineering_Bootcamp/blob/main/EDA/More_on_DataFrames_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#More on DataFrames

#Problem
This is an open dataset from the City of Detroit.  The data is already imported into a DataFrame using Pandas.
Please complete the below exercises



In [1]:
import pandas as pd

detroit_demolitions = pd.read_csv('https://storage.googleapis.com/mbcc/datasets/Detroit_Demolitions_withColumns.csv',
                        parse_dates=[4])
detroit_demolitions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12227 entries, 0 to 12226
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   Address              12227 non-null  object        
 1   Parcel ID            12227 non-null  object        
 2   Contractor Name      12227 non-null  object        
 3   Price                12227 non-null  object        
 4   Demolition Date      12227 non-null  datetime64[ns]
 5   Commercial Building  12227 non-null  object        
 6   Council_District     12227 non-null  int64         
 7   Latitude             12227 non-null  float64       
 8   Longitude            12227 non-null  float64       
 9   Location             12227 non-null  object        
 10  Neighborhood         12227 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(7)
memory usage: 1.0+ MB


#Exercise 1
This dataset has column names that have spaces. Clean the column names by **replacing** all **spaces** with an **underscore** '_'  also convert them to **lower case**

In [2]:
#Your code goes here
detroit_demolitions.columns = detroit_demolitions.columns.str.replace(" ", "_").str.lower()

detroit_demolitions.columns

Index(['address', 'parcel_id', 'contractor_name', 'price', 'demolition_date',
       'commercial_building', 'council_district', 'latitude', 'longitude',
       'location', 'neighborhood'],
      dtype='object')

In [3]:
#TestCase
clean_columns = pd.Index(['address', 'parcel_id', 'contractor_name', 'price', 'demolition_date',
       'commercial_building', 'council_district', 'latitude', 'longitude',
       'location', 'neighborhood'])
pd.testing.assert_index_equal(detroit_demolitions.columns, clean_columns)
print("Test passed!")

Test passed!


##Exercise 2:
By running the head function you notice that the 'price' column has \$ symbol and hence is assigned an Object type instead of an integer. 

Strip the $ symbol and convert the datatype to numeric.

In [4]:
detroit_demolitions.head()

Unnamed: 0,address,parcel_id,contractor_name,price,demolition_date,commercial_building,council_district,latitude,longitude,location,neighborhood
0,19493 Concord,15011348.0,MCM,$12530.82,2014-08-07,No,3,42.438562,-83.033366,"19493 Concord\n(42.438562, -83.033366)",Nortown
1,10 W Arizona,1004588.0,Rickman Enterprise Group,$13630.00,2016-03-08,No,2,42.419302,-83.10232,"10 W Arizona\n(42.419302, -83.10232)",Grixdale Farms
2,100 Marston,1002314.0,ABC Demolition,$14300.00,2014-07-25,No,5,42.375327,-83.074438,"100 Marston\n(42.375327, -83.074438)",North End
3,1000 Baldwin,17011063.0,DMC Consultants,$18687.40,2014-04-16,No,5,42.352273,-82.999264,"1000 Baldwin\n(42.352273, -82.999264)",Islandview
4,10005 Mansfield,22057965.0,"Adamo Group, Inc.",$8950.00,2016-07-12,No,7,42.369223,-83.203769,"10005 Mansfield\n(42.369223, -83.203769)",Joy Community


In [5]:
#Your code goes here
detroit_demolitions['price']=detroit_demolitions['price'].str.lstrip('$').astype(float)

In [6]:
detroit_demolitions['price'][1]

13630.0

In [7]:
#TestCase
assert(detroit_demolitions['price'].dtype == float)
print("Test passed!")

Test passed!


## Exercise 3

Drop column 'location'; it is redundant as the same information is available in address, latitude, longitude columns.



In [8]:
#Your code goes here
detroit_demolitions.drop(axis=1,columns='location',inplace=True)
detroit_demolitions.iloc[1]

address                            10 W Arizona
parcel_id                             01004588.
contractor_name        Rickman Enterprise Group
price                                   13630.0
demolition_date             2016-03-08 00:00:00
commercial_building                          No
council_district                              2
latitude                              42.419302
longitude                             -83.10232
neighborhood                     Grixdale Farms
Name: 1, dtype: object

In [9]:
#TestCase
try:
  detroit_demolitions['location']
except KeyError:
  print("Test passed!")

Test passed!


# Exercise 4
Check if there are any duplicate records

In [10]:
# Your code goes here
if(detroit_demolitions.address.nunique() == detroit_demolitions.shape[0]):
  print("No duplicate records")
else:
  print("there are some duplicate records")

No duplicate records


In [11]:
detroit_demolitions.duplicated().value_counts()
 

False    12227
dtype: int64

In [12]:
detroit_demolitions.duplicated().unique()

array([False])

In [13]:
detroit_demolitions.duplicated().nunique()

1

##Exercise 5

Get all demolitions that happened on '2014-08-07' that were carried out by contractors whose name contain 'CM'

In [14]:
result = detroit_demolitions[(detroit_demolitions['demolition_date'] == '2014-08-07') & (detroit_demolitions['contractor_name'].str.contains('CM'))]


In [15]:
result['demolition_date'].value_counts()

2014-08-07    15
Name: demolition_date, dtype: int64

In [16]:
#TestCase
assert(result['demolition_date'].value_counts()[0] == 15)
print("Test passed!")

Test passed!


#Exercise 6
Segment the price column by creating another column called 'cost_group' in that you label all demolitions below 5,000 as 'below 5k' , and all demolitions between 5,000 to 10,000 as '5k to 10k', and demolitions between 10,000 to 25,000 as '10k to 25k' and the rest as 'high-cost'

In [17]:
#Your code goes here
import numpy as np
segments = [-1,4999,9999,24999,np.inf]
labels = ['below 5k',' 5k to 10k','10k o 25k','high-cost']
cost_group = pd.cut(detroit_demolitions['price'],segments,labels=labels)
detroit_demolitions['cost_group'] = cost_group


In [18]:
detroit_demolitions.head()

Unnamed: 0,address,parcel_id,contractor_name,price,demolition_date,commercial_building,council_district,latitude,longitude,neighborhood,cost_group
0,19493 Concord,15011348.0,MCM,12530.82,2014-08-07,No,3,42.438562,-83.033366,Nortown,10k o 25k
1,10 W Arizona,1004588.0,Rickman Enterprise Group,13630.0,2016-03-08,No,2,42.419302,-83.10232,Grixdale Farms,10k o 25k
2,100 Marston,1002314.0,ABC Demolition,14300.0,2014-07-25,No,5,42.375327,-83.074438,North End,10k o 25k
3,1000 Baldwin,17011063.0,DMC Consultants,18687.4,2014-04-16,No,5,42.352273,-82.999264,Islandview,10k o 25k
4,10005 Mansfield,22057965.0,"Adamo Group, Inc.",8950.0,2016-07-12,No,7,42.369223,-83.203769,Joy Community,5k to 10k


In [19]:
#TestCase
assert(detroit_demolitions['cost_group'].value_counts()['below 5k'] == 138)
print("Test passed!")

Test passed!


In [20]:
#Your code goes here
result = detroit_demolitions['demolition_date'].groupby(detroit_demolitions['contractor_name'])
result.size()

contractor_name
1 Way Service                             83
313 Construction, LLC                     40
ABC Demolition                           446
Able Demolition                         1260
Adamo Group, Inc.                       2866
BBEK Environmental                         1
Berkshire Development, Inc.               13
Blackstar                                  1
Blue Star                                379
Brown & Glo Wrecking                      23
Brown Environmental Construction         265
DMC Consultants                         1388
Den-Man Contractors                       72
Direct Construction Services, LLC         93
Dore & Associates                         62
Esso Wrecking Co.                        217
Farrow Group                             236
Futurenet Group                           88
GLO WRECKING CO                           18
Homrich                                 2342
Jenkins Construction                      46
Joy Construction                       

In [21]:
result = detroit_demolitions.groupby(['demolition_date','contractor_name'])
result.size()

demolition_date  contractor_name         
2014-01-02       Den-Man Contractors         1
2014-01-06       Den-Man Contractors         2
                 Futurenet Group             2
2014-01-07       Futurenet Group             1
2014-01-13       Futurenet Group             1
                                            ..
2017-09-07       DMC Consultants             2
                 Den-Man Contractors         2
                 Rickman Enterprise Group    2
2017-09-08       Able Demolition             3
                 DMC Consultants             2
Length: 3474, dtype: int64

In [22]:
result = detroit_demolitions.groupby(['demolition_date'])
result.groups.keys()

dict_keys([Timestamp('2014-01-02 00:00:00'), Timestamp('2014-01-06 00:00:00'), Timestamp('2014-01-07 00:00:00'), Timestamp('2014-01-13 00:00:00'), Timestamp('2014-01-14 00:00:00'), Timestamp('2014-01-15 00:00:00'), Timestamp('2014-01-16 00:00:00'), Timestamp('2014-01-17 00:00:00'), Timestamp('2014-01-21 00:00:00'), Timestamp('2014-01-22 00:00:00'), Timestamp('2014-01-23 00:00:00'), Timestamp('2014-01-24 00:00:00'), Timestamp('2014-01-31 00:00:00'), Timestamp('2014-02-03 00:00:00'), Timestamp('2014-02-04 00:00:00'), Timestamp('2014-02-07 00:00:00'), Timestamp('2014-02-12 00:00:00'), Timestamp('2014-02-14 00:00:00'), Timestamp('2014-02-17 00:00:00'), Timestamp('2014-02-18 00:00:00'), Timestamp('2014-02-19 00:00:00'), Timestamp('2014-02-20 00:00:00'), Timestamp('2014-02-21 00:00:00'), Timestamp('2014-02-24 00:00:00'), Timestamp('2014-02-25 00:00:00'), Timestamp('2014-02-26 00:00:00'), Timestamp('2014-02-27 00:00:00'), Timestamp('2014-03-04 00:00:00'), Timestamp('2014-03-07 00:00:00'), Tim

In [23]:
result.groups

{2014-01-02 00:00:00: [3807], 2014-01-06 00:00:00: [1081, 2430, 8191, 9353], 2014-01-07 00:00:00: [3958], 2014-01-13 00:00:00: [7496], 2014-01-14 00:00:00: [3459], 2014-01-15 00:00:00: [754, 11828], 2014-01-16 00:00:00: [461, 751, 759, 4972], 2014-01-17 00:00:00: [217, 7677, 11274], 2014-01-21 00:00:00: [645, 1713, 2361, 10347], 2014-01-22 00:00:00: [1214, 10356, 10767, 11550], 2014-01-23 00:00:00: [10348], 2014-01-24 00:00:00: [2550, 11946], 2014-01-31 00:00:00: [8558, 9598], 2014-02-03 00:00:00: [220, 859, 9740], 2014-02-04 00:00:00: [2333, 11870], 2014-02-07 00:00:00: [568], 2014-02-12 00:00:00: [6060], 2014-02-14 00:00:00: [612, 7771, 8835], 2014-02-17 00:00:00: [988], 2014-02-18 00:00:00: [479, 603], 2014-02-19 00:00:00: [1584, 4186, 11993], 2014-02-20 00:00:00: [249, 894, 2310], 2014-02-21 00:00:00: [1992], 2014-02-24 00:00:00: [2323, 9884], 2014-02-25 00:00:00: [2925, 3050], 2014-02-26 00:00:00: [2156, 10349], 2014-02-27 00:00:00: [106, 9675], 2014-03-04 00:00:00: [6192, 10345, 

In [24]:
type(result.groups)

pandas.io.formats.printing.PrettyDict

In [25]:
result = detroit_demolitions.value_counts('demolition_date')
result

demolition_date
2014-07-30    60
2014-08-06    57
2016-06-20    55
2014-09-03    50
2014-09-08    50
              ..
2017-05-26     1
2014-07-12     1
2016-01-09     1
2017-01-30     1
2014-01-02     1
Length: 954, dtype: int64

In [26]:
result = detroit_demolitions.groupby('contractor_name')
result.groups['MCM']

Int64Index([    0,    51,    64,    73,    79,    81,   190,   210,   212,
              295,
            ...
            11759, 11784, 11785, 11790, 11801, 11806, 11823, 11824, 11826,
            11876],
           dtype='int64', length=334)

In [27]:
result=detroit_demolitions[detroit_demolitions['demolition_date']=='2014-08-07'].groupby('contractor_name')
result.size()

contractor_name
Able Demolition                      1
Adamo Group, Inc.                   12
Blue Star                            3
Brown Environmental Construction     2
DMC Consultants                      5
Homrich                             10
MCM                                 15
dtype: int64

In [28]:
result.size()['MCM']


15

In [29]:
result = detroit_demolitions[detroit_demolitions['demolition_date'] == '2014-08-07'].groupby(detroit_demolitions['contractor_name'] =='MCM')
print(result.size())
result['demolition_date'].value_counts()[1]

contractor_name
False    33
True     15
dtype: int64


demolition_date
2014-08-07    15
Name: demolition_date, dtype: int64

In [30]:
result = detroit_demolitions[detroit_demolitions['contractor_name'] =='MCM'].groupby(detroit_demolitions['demolition_date'] == '2014-08-07')
print(result.size())
result['demolition_date'].value_counts()[1]

demolition_date
False    319
True      15
dtype: int64


demolition_date
2014-08-07    15
Name: demolition_date, dtype: int64