# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [1]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [2]:
# Location of the raw stock CSV.
# NOTE(review): this is an absolute path on one user's machine; the notebook will not
# run elsewhere without editing it. Consider a path relative to the notebook.
csvpath = Path(
    "/Users/ddevii/Rutgers/Rutgers_Personal_Repo/01-Lesson-Plans/03-Python-Pandas/2/Activities/05-Stu_Data_Cleaning/Resources/stock_data.csv"
)
# Read the file into a DataFrame with read_csv's default parsing options.
stock_csvdata = pd.read_csv(filepath_or_buffer=csvpath)

### Identify the number of rows and columns (shape) in the DataFrame.

In [4]:
# `shape` is a (rows, columns) tuple describing the loaded DataFrame.
stock_csvdata.shape

(504, 14)

### Generate a sample of the data to visually ensure data has been loaded in correctly.

In [5]:
# Show 10 randomly chosen rows as a visual sanity check of the load.
# No random_state is passed, so a different set of rows appears on every run.
stock_csvdata.sample(10)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
49,AIV,Apartment Investment & Management,Real Estate,38.21,15.6,3.876562,1.95,46.855,38.85,6156884000.0,874871000.0,6.187621,4.69,http://www.sec.gov/cgi-bin/browse-edgar?action...
427,SYY,Sysco Corp.,Consumer Staples,57.0,22.98,2.464487,2.08,64.27,48.85,30445320000.0,2988725000.0,0.729553,13.4,http://www.sec.gov/cgi-bin/browse-edgar?action...
243,ILMN,Illumina Inc,Health Care,209.54,52.25,0.0,4.92,248.97,158.0203,32295200000.0,1192000000.0,11.713953,10.89,http://www.sec.gov/cgi-bin/browse-edgar?action...
496,XRX,Xerox Corp.,Information Technology,29.8,8.87,3.207184,0.59,37.42,26.64,7938833000.0,1191000000.0,0.787766,1.49,http://www.sec.gov/cgi-bin/browse-edgar?action...
206,IT,Gartner Inc,Information Technology,114.26,36.86,0.0,2.31,142.16,97.96,10828310000.0,234935000.0,4.894499,13.47,http://www.sec.gov/cgi-bin/browse-edgar?action...
281,LLY,Lilly (Eli) & Co.,Health Care,74.21,17.34,2.932742,-0.2,89.09,75.71,84475990000.0,3459800000.0,3.754678,5.73,http://www.sec.gov/cgi-bin/browse-edgar?action...
346,OMC,Omnicom Group,Consumer Discretionary,75.91,15.27,3.183868,4.79,86.71,65.32,17377550000.0,2366000000.0,1.532716,6.66,http://www.sec.gov/cgi-bin/browse-edgar?action...
55,AJG,Arthur J. Gallagher & Co.,Financials,64.4,21.05,2.480714,2.54,70.55,53.63,11968490000.0,888000000.0,1.940022,2.92,http://www.sec.gov/cgi-bin/browse-edgar?action...
396,ROK,Rockwell Automation Inc.,Industrials,178.73,26.48,1.769255,6.35,210.72,147.67,24123220000.0,1323200000.0,3.760594,10.94,http://www.sec.gov/cgi-bin/browse-edgar?action...
134,CMI,Cummins Inc.,Industrials,165.73,16.83,2.500868,8.23,194.18,143.8301,28669230000.0,2924000000.0,1.940617,3.89,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [6]:
# count() reports the number of non-null values per column; any column whose count
# is below the total row count contains missing values.
stock_csvdata.count()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

### Identify null records

In [8]:
# isnull() flags each missing cell; summing the booleans gives the null count per column.
stock_csvdata.isnull().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                12
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Null Records

In [9]:
# Remove every row that has a missing value in any column (the dropna defaults,
# spelled out explicitly) and rebind the name to the filtered DataFrame.
stock_csvdata = stock_csvdata.dropna(axis=0, how="any")

### Validate nulls have been dropped

In [10]:
# Re-run the per-column null count; after dropping null rows every column should report 0.
stock_csvdata.isnull().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

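### Default null `ebitda` values to 0. Then, validate no records are null for `ebitda`.

_Note: this step has no code cell. The earlier `dropna()` call already removed every row containing a null, so no null `ebitda` values remain to be filled._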

### Drop Duplicates

In [12]:
# Discard rows that repeat an earlier row in every column, keeping the first occurrence.
stock_csvdata = stock_csvdata.drop_duplicates(keep="first")

### Sample `price` field

In [20]:
# Inspect 10 random `price` values (unseeded, so the sample varies between runs).
# The dtype shown in the output footer indicates whether the column was parsed as numeric.
stock_csvdata["price"].sample(10)

286     95.01
380    142.68
257    114.81
137     92.16
337    136.89
290        24
445     74.36
353     61.86
126     74.73
158     91.92
Name: price, dtype: object

### Clean `price` Series by replacing `$`

In [21]:
# Strip the dollar sign from every price string so the column can later be cast to a number.
# regex=False makes pandas treat "$" as a literal character instead of the regular-expression
# end-of-string anchor, so the result does not depend on the installed pandas version's
# default for `regex`.
stock_csvdata["price"] = stock_csvdata["price"].str.replace("$", "", regex=False)

  """Entry point for launching an IPython kernel.


### Confirm data type of `price`

In [24]:
# Display the cleaned column; the `dtype` in the footer of the output shows its current type.
stock_csvdata["price"]

0      222.89
2       56.27
3      108.48
5      108.48
6      185.16
        ...  
499     70.24
500      76.3
501    115.53
502     50.71
503     71.51
Name: price, Length: 478, dtype: object

### Cast `price` Series as float and then validate using `dtype`

In [25]:
# Convert the cleaned price strings to floating-point numbers.
stock_csvdata["price"] = stock_csvdata["price"].astype(float)

In [28]:
# List the dtype of every column to confirm `price` is now float64.
# NOTE(review): the recorded output shows `earnings_per_share` is still object —
# it may need the same cleaning as `price`; confirm.
stock_csvdata.dtypes

symbol                 object
name                   object
sector                 object
price                 float64
price_per_earnings    float64
dividend_yield        float64
earnings_per_share     object
52_week_low           float64
52_week_high          float64
market_cap            float64
ebitda                float64
price_per_sales       float64
price_per_book        float64
sec_filings            object
dtype: object