# Introduction to Numpy/Pandas


[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/UCL-DSPP-2425/ECON0128-lectures/blob/main/09_python/intro_pandas.ipynb)

## numpy

In [2]:
# load numpy into the namespace
import numpy as np

In [3]:
# 1D array: like an R vector, every element shares a single data type
a = np.array(
    [1, 2, 3, 4, 5]
)

# 2D array: a list of rows
b = np.array(
    [
        [1, 2, 8],
        [4, 5, 6],
        [7, 8, 10],
    ]
)

# 3D array: a list of 2D blocks
c = np.array(
    [
        [[1, 2], [3, 4]],
        [[5, 6], [7, 8]],
    ]
)

In [4]:
# get the shape of the array: a tuple with the length of each dimension
# (a is 1D, so the tuple has a single entry)

print(a.shape)

(5,)


In [5]:
# get the data type shared by all elements of the array

print(a.dtype)

int64


In [None]:
# convert the type of the array
# astype returns a new array with the requested dtype; the result is only printed
# here, not assigned, so `a` keeps its original dtype

print(a.astype(float))

In [6]:
# element-wise arithmetic: the scalar is applied to every element of the array
# (one result per output line)

print(a + 1, a - 1, a * 2, a / 2, sep="\n")

[2 3 4 5 6]
[0 1 2 3 4]
[ 2  4  6  8 10]
[0.5 1.  1.5 2.  2.5]


In [None]:
# matrix operations

# matrix product of b with itself; `@` is the matrix-multiplication operator,
# which numpy recommends over np.dot for 2-D arrays
print(b @ b)

# inverse of b
b_inv = np.linalg.inv(b)

print(b_inv)
# b times its inverse gives the identity matrix (up to floating-point rounding)
print(b @ b_inv)

# determinant of b
print(np.linalg.det(b))

In [None]:
# numpy's math functions work element-wise on whole arrays:
# sine, exponential and natural logarithm of every element of a

print(np.sin(a), np.exp(a), np.log(a), sep="\n")

In [None]:
# to explore what you can do with an object:
# dir() lists the names of all its attributes and methods
dir(a)

In [None]:
# mean and standard deviation of the 1D array
print(a.mean(), a.std(), sep="\n")

# 2D array: mean of all elements, then one mean per column and one mean per row
print(b.mean())
print(b.mean(axis=0))  # axis=0 averages over the rows -> one value per column
print(b.mean(axis=1))  # axis=1 averages over the columns -> one value per row

## pandas

In [9]:
# load pandas into the namespace
# pandas is the standard way of working with dataframes in Python (comparable to data frames in R)
# Polars is a newer, alternative dataframe library
import pandas as pd

In [None]:
!pip install pandas

In [None]:
# what are we importing?
# Tip: type `pd.` and press Tab to browse the available names interactively.
# A bare `pd.` is a syntax error when the cell is actually run, so list the public
# names (those not starting with an underscore) programmatically instead.
pandas_public_names = [name for name in dir(pd) if not name.startswith("_")]
pandas_public_names

In [10]:
# data for a small dataframe: each key is a column name and each list holds
# that column's values (countries and their currencies)
my_dictionary = {
    "country_name": ["Spain", "Colombia", "Turkey"],
    "currency": ["Euro", "Peso", "Lira"],
}
my_dictionary

{'country_name': ['Spain', 'Colombia', 'Turkey'],
 'currency': ['Euro', 'Peso', 'Lira']}

In [11]:
# build the dataframe: each dictionary key becomes a column,
# and rows get an automatic integer index starting at 0
df = pd.DataFrame(my_dictionary)
df

Unnamed: 0,country_name,currency
0,Spain,Euro
1,Colombia,Peso
2,Turkey,Lira


In [12]:
# read data from a csv file (Colab sample files: sample_data/california_housing_train.csv)
# NOTE: this rebinds `df`, replacing the small countries dataframe built above
df = pd.read_csv("sample_data/california_housing_train.csv")
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250,65500.0
...,...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571,111400.0
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179,79000.0
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313,103600.0
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797,85800.0


In [None]:
# reading data from other file types (JSON)
# the result is stored under its own name, so `df` is left untouched
df_json = pd.read_json("sample_data/anscombe.json")
df_json

In [None]:
# reading data from other file types (Excel, Stata). Data files are not available
#df_excel = pd.read_excel("data/iris.xlsx", sheet_name="Sheet 1")
#df_stata = pd.read_stata("data/iris.dta")

In [None]:
# read data from file with a different separator. Data file not available
#df = pd.read_csv("data/iris.txt", sep="\t")

### Working with DataFrames

In [None]:
# display the dataframe (long frames are shown truncated: first and last rows only)
df

In [None]:
# describe dataframe:
# summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

In [None]:
# get all the column names of a dataframe (returned as a pandas Index)
df.columns

In [None]:
type(df) # returns the class of the object, here a pandas DataFrame

In [None]:
# returns a tuple: (number of rows, number of columns)
df.shape

In [None]:
# Similar to str() (the structure function) in R:
# prints each column's name, non-null count and dtype
df.info()

In [None]:
# select a column using its name (the result is a pandas Series)
df["population"]

In [14]:
# Another way: attribute access
# (only works when the column name is a valid Python identifier that does not
# clash with an existing DataFrame attribute or method)
df.population

Unnamed: 0,population
0,1015.0
1,1129.0
2,333.0
3,515.0
4,624.0
...,...
16995,907.0
16996,1194.0
16997,1244.0
16998,1298.0


In [None]:
# select a row using its index

In [None]:
# exploring the data row-wise
# .loc selects by index label: this returns the row labelled 2, which is the
# third row here because the automatic index starts at 0
df.loc[2]

In [None]:
# select a particular cell: row label 2, column "population"
df.loc[2, "population"]

In [None]:
# filter data: the comparison builds a True/False value per row,
# and .loc keeps only the rows where it is True
df.loc[df["population"] > 1000]

In [None]:
# same kind of filter on another column: rows with more than 1000 households
df.loc[df["households"] > 1000]

In [None]:
# combine two conditions with & (element-wise AND); each condition needs its own
# parentheses because & binds more tightly than >
df.loc[(df["population"] > 1000) & (df["households"] > 1000)]

In [None]:
# sum all elements of a column (what is the total population in the dataset?)
df["population"].sum()

In [None]:
# average of a column
df["population"].mean()

In [None]:
# apply a custom function to all elements of a column
# (the lambda is called once per value and the results form a new Series;
# for simple arithmetic like this, df["population"]**2 + 10 does the same without apply)
df["population"].apply(lambda x: x**2 + 10)

In [None]:
# create a new column by assigning to a column name that does not exist yet
# NOTE: this modifies `df` in place
df["population_new"] = df["population"].apply(lambda x: x**2 + 10)
df

In [None]:
# new column from element-wise arithmetic on two existing columns (no apply needed)
# NOTE: this modifies `df` in place
df["rooms_per_person"] = df["total_rooms"] / df["population"]
df

### Repeat the first data science program

In [None]:
# read data from a public URL (read_csv accepts a URL as well as a local path)
# NOTE: this rebinds `df`, replacing the housing data used in the previous section
# (plain string: the original f-string had no placeholders, so the f prefix did nothing)
url = "https://www.dropbox.com/s/xk3vm6l8jaw2k9o/income_by_year_age.csv?dl=1"
df = pd.read_csv(url)
df

In [None]:
# pandas assigns an automatic integer index to every dataset, but any column can be
# used as the index instead.
# set_index returns a NEW dataframe indexed by "Year"; the result is only displayed
# here, so `df` itself is unchanged (assign the result to a new name to keep it).
# NOTE(review): assumes the CSV has a column named "Year" — confirm against the data.
df.set_index("Year")

In [None]:
# first rows of the dataframe (5 by default)
df.head()

In [None]:
# last rows of the dataframe (5 by default)
df.tail()

In [None]:
# first values of the "16-24" column; bracket access is required here because
# "16-24" is not a valid Python identifier
df["16-24"].head()

In [None]:
# mean and standard deviation of the "16-24" column, one per line
print(df["16-24"].mean(), df["16-24"].std(), sep="\n")

In [None]:
# plot the variable "16-24" (a line chart by default, with the dataframe index on the x-axis)

df["16-24"].plot()

In [None]:
# plot the variable "16-24" and the variable "65+" in the same plot
# selecting both columns and plotting them together draws one line per column
# and adds a legend, so the two series can be told apart

df[["16-24", "65+"]].plot()

In [None]:
# compute the mean of every column except the first one
# .iloc[:, 1:] selects all rows and the columns from position 1 onwards
# NOTE(review): the skipped first column is presumably "Year" — confirm against the data

age_means = df.iloc[:, 1:].mean()
age_means

In [None]:
# bar chart of age_means: one bar per column mean
# (kind="bar" draws a bar chart, not a histogram)

age_means.plot(kind="bar")