In [5]:
import pandas as pd

In [None]:
# pd.Series is the pandas class for a one-dimensional array whose values
# are labelled by an index (0, 1, 2, ... when none is given)
numbers = pd.Series(range(1, 6))

In [10]:
# Printing a Series shows the index labels on the left, the values on the
# right, and the dtype of the values on the last line
print(numbers)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [13]:
print(numbers.to_numpy())  # only the underlying values as a NumPy array, without the index

[1 2 3 4 5]


In [11]:
print(numbers.index)  # returns the index (row labels) of the series

RangeIndex(start=0, stop=5, step=1)


In [23]:
# Replace the default 0..n-1 index with meaningful city labels
house_price = [1.2, 0.8, 1.1, 0.6]
house_price_df = pd.Series(
    data=house_price,
    index=["Auck", "Chch", "Welly", "Ham"],
)
print(house_price_df)

Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
dtype: float64


In [24]:
# Look up the Auckland house price by its string index label.
# The value is interpolated directly into the f-string (previously the
# f-prefix was applied to a string with no placeholders).
print(f"Price of house in Auckland is {house_price_df['Auck']} million")

Price of house in Auckland is 1.2 million


In [25]:
# A Series can also be sliced by position; .iloc makes it explicit that
# 1:3 are positions (end-exclusive), not index labels
print(house_price_df.iloc[1:3])

Chch     0.8
Welly    1.1
dtype: float64


In [39]:
# Add new data to the end of the series.
# Series.append() was deprecated in pandas 1.4 and removed in pandas 2.0,
# so pd.concat() is used instead; it returns a new Series containing the
# entries of both inputs in order.
new_series = pd.Series([0.9], index=["Gisb"])
house_price_df = pd.concat([house_price_df, new_series])
print(house_price_df)

Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
Gisb     0.9
dtype: float64


In [40]:
# Remove the Gisborne entry; drop() returns a new Series, so the result is
# assigned back to the same name
house_price_df = house_price_df.drop(index="Gisb")
print(house_price_df)

Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
dtype: float64


In [41]:
# Convert a plain Python list to a pandas Series, attaching the city
# labels as the index when the Series is constructed
array0 = [100, 101, 102, 103]
pandas_series = pd.Series(array0, index=["Auck", "Chch", "Welly", "Ham"])
print(pandas_series)

Auck     100
Chch     101
Welly    102
Ham      103
dtype: int64


In [46]:
# Convert a NumPy array to a pandas Series
import numpy as np

In [47]:
# Draw 4 samples from the standard normal distribution.
# A seeded Generator is used so that the same values are produced on every
# run (Restart Kernel -> Run All gives reproducible output).
n0 = np.random.default_rng(seed=42).standard_normal(4)
print(n0)

[ 0.46649404 -1.24084379  0.25895684  2.05275899]


In [87]:
# Wrap the NumPy array in a Series, attaching the city labels as the index
# at construction time
numpy_2_pandas = pd.Series(n0, index=["Auck", "Chch", "Welly", "Ham"])
print(numpy_2_pandas)

Auck     0.466494
Chch    -1.240844
Welly    0.258957
Ham      2.052759
dtype: float64


In [51]:
# Give the Series itself a name and give its index a name; both labels
# appear in the printed output.
# Note: these assignments modify house_price_df (and its index) in place.
house_price_df.name = "House Prices"
house_price_df.index.name = "Cities"
print(house_price_df)

Cities
Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
Name: House Prices, dtype: float64


In [61]:
# Alternatively, start from a dictionary: its keys become the index labels
# and its values become the data of the Series
house_price_dic = dict(Auck=1.2, Chch=0.8, Welly=1.1, Ham=0.6)
house_price_dic2series = pd.Series(house_price_dic).rename_axis("cities")
print(house_price_dic2series)

cities
Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
dtype: float64


In [85]:
# Filter the data with boolean masks.
# NOTE(review): the population figures appear to be in thousands (the
# filter "> 350" is reported as "greater than 350,000") -- confirm the units.
population = [123, 2312, 442, 3322]
pop_series = pd.Series(
    population,
    index=["Auck", "Chch", "Welly", "Ham"],
    name="population",
)

# Indexing a Series with a boolean Series keeps only the rows that are True
above_350k = pop_series[pop_series > 350]
below_1m = pop_series[pop_series < 1000]

print(f'cities with population greater than 350,000\n{above_350k}')
print(f'cities with population less than 1,000,000\n{below_1m}')

cities with population greater than 350,000
Chch     2312
Welly     442
Ham      3322
Name: population, dtype: int64
cities with population less than 1,000,000
Auck     123
Welly    442
Name: population, dtype: int64


In [138]:
# Create a DataFrame by concatenating several Series side by side: each
# Series becomes one column and the rows are matched on the index labels.
# Alternatively, a DataFrame can be built by passing a dictionary to
# pd.DataFrame().
df = pd.concat(
    [house_price_df, pop_series, numpy_2_pandas],
    axis="columns",
)
print(df)


SyntaxError: invalid syntax (<ipython-input-138-6428379cccd9>, line 5)

In [141]:
# Give the DataFrame columns meaningful names; the new labels are assigned
# in order and replace the existing column labels of df in place
df.columns = ['Price', 'Population', 'Score']
print(df)

       Price  Population     Score
Auck     1.2         123  0.466494
Chch     0.8        2312 -1.240844
Welly    1.1         442  0.258957
Ham      0.6        3322  2.052759
1.1


In [98]:
# Reorder the columns by selecting them in the desired order;
# the result is a new DataFrame and df itself is left unchanged
df_rearranged = df.loc[:, ['Population', 'Price', 'Score']]
print(df_rearranged)

       Population  Price     Score
Auck         1600    1.2  0.466494
Chch         2312    0.8 -1.240844
Welly         442    1.1  0.258957
Ham          3322    0.6  2.052759


In [91]:
# df.columns holds the column labels of the DataFrame as a pandas Index object
print(df.columns)

Index(['Price', 'Population', 'Score'], dtype='object')


In [93]:
# Access each column by its label; every column comes back as a Series
# named after the column
for column_label in ('Price', 'Population', 'Score'):
    print(df[column_label])

Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
Name: Price, dtype: float64
Auck      123
Chch     2312
Welly     442
Ham      3322
Name: Population, dtype: int64
Auck     0.466494
Chch    -1.240844
Welly    0.258957
Ham      2.052759
Name: Score, dtype: float64


In [106]:
# .loc selects rows by index label:
#   a single label returns that row as a Series,
#   a list of labels (note the double square brackets) returns a DataFrame
print(
    df.loc['Chch'],
    df.loc[['Auck', 'Ham']],
    sep="\n",
)

House Prices       0.800000
population      2312.000000
0                 -1.240844
Name: Chch, dtype: float64
      House Prices  population         0
Auck           1.2         123  0.466494
Ham            0.6        3322  2.052759


In [96]:
# We can change a value in the data frame using .loc[row_label, column_label];
# the assignment modifies df in place
df.loc['Auck', 'Population'] = 1600
print(df)

       Price  Population     Score
Auck     1.2        1600  0.466494
Chch     0.8        2312 -1.240844
Welly    1.1         442  0.258957
Ham      0.6        3322  2.052759


In [112]:
# Alternatively, we can use .iloc with integer positions (row, column) to
# locate values in df; [0, 0] is the first row of the first column
print(df.iloc[0, 0])

1.2


In [149]:
# Three ways to take a subset of the DataFrame:
# 1. a label slice of rows -- unlike positional slices, both end labels are included
print(df.loc['Auck':'Welly'])
# 2. the first column selected by position, kept as a one-column DataFrame
print(df.iloc[:, [0]])
# 3. specific rows of a single column, selected by label
print(df.loc[["Auck", "Welly"], "Price"])

       Price  Population     Score
Auck     1.2         123  0.466494
Chch     0.8        2312 -1.240844
Welly    1.1         442  0.258957
       Price
Auck     1.2
Chch     0.8
Welly    1.1
Ham      0.6
Auck     1.2
Welly    1.1
Name: Price, dtype: float64
Price  Population  Score    
0.6    3322         2.052759    1
0.8    2312        -1.240844    1
1.1    442          0.258957    1
1.2    123          0.466494    1
dtype: int64


In [None]:
# We can add a new column to the dataframe, but it either needs to be the
# same length as the dataframe or be a single value that is repeated for the
# whole column.
# Similar to series, we can use the .drop command to remove columns or rows in df

In [118]:
# Remove the "Chch" row; drop() returns a new DataFrame and leaves df unchanged
df_Chch_removed = df.drop(index="Chch")
print(df_Chch_removed)

       House Prices  population         0
Auck            1.2         123  0.466494
Welly           1.1         442  0.258957
Ham             0.6        3322  2.052759


In [119]:
# Import data into pandas from a CSV file located next to the notebook.
# By default the first row of the file is used as the header (column names).
# Note: this rebinds df, replacing the DataFrame built in the earlier cells.
df = pd.read_csv(filepath_or_buffer="nz_cars.csv")

FileNotFoundError: [Errno 2] No such file or directory: 'nz_cars.csv'

In [None]:
# print(df.to_string()) prints the entire DataFrame
# print(df) shows only the first and last 5 rows of a long DataFrame
# df.head() returns the first 5 rows
# We can pass an argument to change the number of rows returned:
# df.head(2) returns the first 2 rows
# df.tail() returns the last 5 rows

In [None]:
# We can count the number of unique values in a column with
# df["column_name"].nunique()
# We can return the unique values in a column with
# df["column_name"].unique()

In [None]:
# We can find the most common value in a column with .value_counts(),
# which lists each value with its count, most frequent first