# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [1]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [6]:
# Location of the raw stock data CSV, relative to this notebook
csvpath = Path("../../Resources/stock_data.csv")

# Read the CSV into a DataFrame that the following cells inspect and clean
data_frame = pd.read_csv(filepath_or_buffer=csvpath)

### Identify the number of rows and columns (shape) in the DataFrame.

In [9]:
# shape is a (number of rows, number of columns) tuple
data_frame.shape

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [21]:
# Pull five randomly chosen rows as a quick visual sanity check of the load
data_frame.sample(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
127,STZ,Constellation Brands,Consumer Staples,208.73,30.92,0.971282,8.71,229.5,152.01,41697450000.0,3033300000.0,5.145596,5.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
463,URI,"United Rentals, Inc.",Industrials,161.99,16.33,0.0,15.72,189.0,100.621,14654950000.0,2760000000.0,2.178262,4.53,http://www.sec.gov/cgi-bin/browse-edgar?action...
309,MU,Micron Technology,Information Technology,40.0,9.01,0.0,4.36,49.89,22.64,48576790000.0,12541000000.0,2.191281,2.02,http://www.sec.gov/cgi-bin/browse-edgar?action...
294,MMC,Marsh & McLennan,Financials,79.31,20.23,1.825706,2.87,86.54,69.33,41819440000.0,3236000000.0,2.984894,5.85,http://www.sec.gov/cgi-bin/browse-edgar?action...
490,WHR,Whirlpool Corp.,Consumer Discretionary,164.95,11.77,2.565299,4.51,202.99,158.8,12177920000.0,1842000000.0,0.580648,2.57,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [12]:
# count() reports the number of non-null values in each column, so a column
# with missing data shows fewer entries than the DataFrame has rows.
data_frame.count()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [13]:
# Element-wise boolean mask: True marks a missing (null) value
data_frame.isnull()

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,True,True,True,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,False,False,False,False,False,False,False,False,False,False,False,False,False,False
500,False,False,False,False,False,False,False,False,False,False,False,False,False,False
501,False,False,False,False,False,False,False,False,False,False,False,False,False,False
502,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Drop Null Records

In [14]:
# dropna() returns a new DataFrame without the rows that contain null values,
# so the result is assigned back to data_frame.
data_frame = data_frame.dropna()

### Validate nulls have been dropped

In [17]:
# Total the null mask per column; each column should report 0 once nulls are dropped
data_frame.isnull().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

### Drop Duplicates

In [23]:
# drop_duplicates() returns a new DataFrame and leaves the original untouched,
# so the result must be assigned back for the duplicate rows to be removed.
data_frame = data_frame.drop_duplicates()

# Per-column record counts of the de-duplicated DataFrame
data_frame.count()

symbol                478
name                  478
sector                478
price                 478
price_per_earnings    478
dividend_yield        478
earnings_per_share    478
52_week_low           478
52_week_high          478
market_cap            478
ebitda                478
price_per_sales       478
price_per_book        478
sec_filings           478
dtype: int64