## Introduction to Pandas

In [1]:
import pandas as pd

In [2]:
# pandas has two core containers:
#   Series    -> one-dimensional labelled array
#   DataFrame -> two-dimensional labelled table

# Two Series with the default 0..2 integer index; used as columns further down.
series = pd.Series(data=["BMW", "Toyota", "Honda"])
colours = pd.Series(data=["Red", "Blue", "White"])

In [3]:
# Combine the two Series side by side into a DataFrame; `keys` supplies the
# column labels and rows are aligned on the Series' shared index.
car_data = pd.concat([series, colours], axis=1, keys=["Car make", "Colour"])
car_data

Unnamed: 0,Car make,Colour
0,BMW,Red
1,Toyota,Blue
2,Honda,White


In [None]:
# Import data: read the car sales CSV into a DataFrame.
# The path is relative, so the notebook must be run from the folder that
# contains the `data/` directory.
car_sales = pd.read_csv("./data/car-sales.csv")

In [None]:
# Display the freshly loaded DataFrame.
car_sales

## Describe Data

In [None]:
# Attributes (accessed without parentheses).
# `dtypes` lists the data type pandas assigned to each column.
car_sales.dtypes

In [None]:
# `columns` holds the column labels of the DataFrame.
car_sales.columns

In [None]:
# Keep the column labels in a variable and display them.
car_colums = car_sales.columns
car_colums

In [None]:
# `index` holds the row labels of the DataFrame.
car_sales.index

In [None]:
# Functions (called with parentheses).
# describe() returns summary statistics (count, mean, std, min, quartiles, max);
# by default only numeric columns are included.
car_sales.describe()

In [None]:
# info() prints a concise summary: index range, column names, non-null counts,
# dtypes and memory usage.
car_sales.info()

In [None]:
# Mean of every numeric column.
# numeric_only=True is needed because the frame still contains text columns at
# this point (e.g. "Make", and "Price" before it is cleaned further down);
# pandas >= 2.0 raises a TypeError when asked to average them.
car_sales.mean(numeric_only=True)

In [None]:
# A small stand-alone Series to show that mean() works on a Series as well.
car_prices = pd.Series(data=[3_000, 1_500, 112_045])
car_prices.mean()

In [None]:
# Select the "Doors" column and add up all of its values.
car_sales["Doors"].sum()

In [None]:
# len() of a DataFrame is its number of rows.
len(car_sales)

## Viewing and selecting data

In [None]:
# First rows of the DataFrame (head() shows 5 rows by default).
car_sales.head()

In [None]:
# Last rows of the DataFrame (tail() shows 5 rows by default).
car_sales.tail()

In [None]:
# .loc versus .iloc
# The index is deliberately unordered and repeats the label 3, so that
# label-based and position-based lookups give visibly different answers.
animals = pd.Series(
    data=["cat", "dog", "Bird", "panda", "snake"],
    index=[0, 3, 9, 8, 3],
)

In [None]:
# Display the Series together with its custom index.
animals

In [None]:
# .loc selects by index *label*.
# The label 3 occurs twice in `animals`, so this returns a Series with both
# matching entries ("dog" and "snake") rather than a single value.
animals.loc[3]

In [None]:
# .iloc selects by integer *position* (0-based), ignoring the index labels.
# Position 3 is the fourth element, "panda".
animals.iloc[3]

### Boolean Indexing

In [None]:
# Keep only the rows whose odometer reading exceeds 100000: the comparison
# produces a boolean Series that .loc uses as a row mask.
car_sales.loc[car_sales["Odometer (KM)"].gt(100000)]

In [None]:
# Frequency table: number of rows for each combination of "Make" (rows)
# and "Doors" (columns).
pd.crosstab(car_sales["Make"], car_sales["Doors"])

In [None]:
# Groupby: average of the numeric columns for each car make.
# numeric_only=True skips the text columns still present in the frame;
# without it pandas >= 2.0 raises a TypeError when trying to average them.
car_sales.groupby(["Make"]).mean(numeric_only=True)

In [None]:
# Line plot of the odometer readings against the row index.
car_sales["Odometer (KM)"].plot()

In [None]:
# Histogram of the odometer readings.
# Author's note: the values around 150000 and 200000 are considered outliers.
car_sales["Odometer (KM)"].hist()

In [None]:
# Convert the "Price" strings to integers.
# - The pattern is a raw string, so the regex escape is not parsed as an
#   (invalid) Python string escape.
# - Only the currency symbol and thousands separators are stripped. The decimal
#   point is kept so that any cents are not folded into the integer, which
#   would inflate the price by a factor of 100.
# - Going through float first lets values with a decimal part be parsed;
#   the final astype(int) truncates any fractional part.
# NOTE(review): assumes prices look like "$4,000.00" - confirm against the CSV.
car_sales['Price'] = (
    car_sales['Price']
    .replace(r'[\$,]', "", regex=True)
    .astype(float)
    .astype(int)
)

In [None]:
# Plot the DataFrame: by default pandas draws one line per numeric column
# against the row index.
car_sales.plot()

In [None]:
# Lower-case every value in "Make" via the .str accessor and store the result
# back in the same column.
car_sales["Make"] = car_sales["Make"].str.lower()

In [None]:
# Load a second CSV that is used below to practise handling missing values,
# then display it.
car_sales_missing = pd.read_csv("./data/car-sales-missing-data.csv")
car_sales_missing

In [None]:
# Replace missing odometer readings with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on a temporary object, emits a FutureWarning in pandas 2.x and leaves
# the DataFrame unchanged under Copy-on-Write.
car_sales_missing["Odometer"] = car_sales_missing["Odometer"].fillna(
    car_sales_missing["Odometer"].mean()
)
car_sales_missing

In [None]:
# Remove, in place, every row that still has at least one missing value,
# then display what is left.
car_sales_missing.dropna(axis=0, how="any", inplace=True)
car_sales_missing

## Create new columns in a pandas DataFrame

In [None]:
# New column from a Series

# Five values, carrying the default index labels 0..4.
seat_columns = pd.Series([5] * 5)

# Assignment aligns on index labels: rows of car_sales whose label is not
# present in seat_columns receive NaN in the new column.
car_sales["Seats"] = seat_columns
car_sales

In [None]:
# Fill the rows that received no seat count with 5.
# The result is assigned back to the column rather than using
# fillna(..., inplace=True) on the selected column, which operates on a
# temporary object (FutureWarning in pandas 2.x, no effect under Copy-on-Write).
car_sales["Seats"] = car_sales["Seats"].fillna(5)

In [None]:
# Display the DataFrame to check the "Seats" column.
car_sales

In [None]:
# New column from a plain Python list. Unlike a Series there is no index
# alignment: the list (10 values here) must be exactly as long as the
# DataFrame has rows, otherwise pandas raises a ValueError.
fuel_economy = [
    7.5, 9.2, 5.0, 9.6, 8.7,
    4.7, 7.6, 8.7, 3.0, 4.5,
]
car_sales["Fuel per 100KM"] = fuel_economy
car_sales

In [None]:
# Derived column: (odometer / 100) * fuel used per 100 km, computed element-wise.
car_sales["Total fuel used (L)"] = (
    car_sales["Odometer (KM)"].div(100).mul(car_sales["Fuel per 100KM"])
)

In [None]:
# Display the DataFrame to check the "Total fuel used (L)" column.
car_sales

In [None]:
# New columns from a single value: the scalar is broadcast to every row.
car_sales["Number of wheels"] = 4
car_sales["Passed road safety"] = True
car_sales

## Sampling Data

In [None]:
# Shuffle all the rows: sampling 100% of the rows (frac=1) without replacement
# returns every row in random order, keeping the original index labels.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same order; change or remove it to get a different shuffle.
car_sales_shuffled = car_sales.sample(frac=1, random_state=42)
car_sales_shuffled

In [None]:
# Take a sample of the data to practise on.

# Select a random 20% of the rows. random_state makes the selection
# reproducible across notebook runs.
car_sales_shuffled.sample(frac=0.2, random_state=42)

In [None]:
# Replace the shuffled index labels with a fresh 0..n-1 index, in place.
# drop=True discards the old labels instead of keeping them as a column.
# Note: this renumbers the rows in their shuffled order; it does not restore
# the original row order.
car_sales_shuffled.reset_index(drop=True, inplace=True)

In [None]:
# Display the shuffled DataFrame with its renumbered index.
car_sales_shuffled

In [None]:
# Divide every odometer reading by 1.6 and write the result back to the column.
# NOTE(review): presumably a km -> miles conversion, yet the column keeps its
# "(KM)" label; re-running this cell divides the values again.
car_sales["Odometer (KM)"] = car_sales["Odometer (KM)"] / 1.6

In [None]:
# Display the DataFrame to check the converted odometer values.
car_sales