In [1]:
import numpy as np                                      # Importing numpy library module
import pandas as pd

# np.random.seed(0) == used when you want to produce same result when using random arrays

rand_array = np.random.randint(0,9, (3,4))              # (3X4) rectangle made of random digits from 0-9 
print(rand_array)
print('Indexing (2,3):', rand_array[2,3], "\n")         # Indexing works like regular python: [array part, item in list within the array]

rand_array[2,3] = 0                                     # We can also change items at an index 
print("Array after changin an index:", "\n", rand_array, "\n")

# Slicing arrays
print(rand_array[:,-1], "\n")                           # Prints last element of each index in array
print(rand_array[1:], "\n")                             # Prints all 3rd elements and beyond from each index in array
print(rand_array[:, 1:2], "\n")                         # Prints only the 2nd elements of each index in array
print(rand_array[::-1], "\n")                           # Easy way to reverse an array

rand_array[:, 0] = 999                                  # We can also assign a value to an array using slices
print(rand_array, "\n")

sorted_array = np.sort(rand_array, axis = 0)            # We can also sort an array. It doesnt change the original array.  
print("Column sorted:", sorted_array, sep= "\n")       # Axis 0 == sort each column, axis 1 == sort each row

[[2 4 2 3]
 [1 0 7 6]
 [6 0 5 1]]
Indexing (2,3): 1 

Array after changin an index: 
 [[2 4 2 3]
 [1 0 7 6]
 [6 0 5 0]] 

[3 6 0] 

[[1 0 7 6]
 [6 0 5 0]] 

[[4]
 [0]
 [0]] 

[[6 0 5 0]
 [1 0 7 6]
 [2 4 2 3]] 

[[999   4   2   3]
 [999   0   7   6]
 [999   0   5   0]] 

Column sorted:
[[999   0   2   0]
 [999   0   5   3]
 [999   4   7   6]]


In [0]:
array = np.arange(0,12)                             
print("1D:", "\n", array, "\n")

reshape_array = array.reshape(3,4)                      # Converts the 1D array to a more visually appealing 2D array
print("2D:", "\n",reshape_array)

1D: 
 [ 0  1  2  3  4  5  6  7  8  9 10 11] 

2D: 
 [[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]


Matrices multiplication in NumPy:

https://www.mathsisfun.com/algebra/matrix-introduction.html

**Notes:**

(m x n) x (n x p) == (m x p) --> Shape of resultant matrix

column of first matrix should be the same as row of second matrix 



In [0]:
a = np.random.randint(0,9, (3,2))
b = np.random.randint(0,9, (2,3))
matrix = a@b                                                  # np.matmul(a,b) also will work
print("Matrix multiplication:", matrix, sep = "\n")

scalar = np.array([2,4])*a                                    # Simple matrix multiplication with a scalar(1D array)
print("a:", a, sep = "\n")
print("Scalar multiplication:", scalar, sep = "\n")

Matrix multiplication:
[[ 1  0  0]
 [ 9  4  2]
 [14 12  6]]
a:
[[0 1]
 [1 7]
 [3 8]]
Scalar multiplication:
[[ 0  4]
 [ 2 28]
 [ 6 32]]


# **Pandas:**

In [4]:
orig_data = pd.read_csv("https://www.cs.helsinki.fi/u/jttoivon/dap/data/fmi/kumpula-weather-2017.csv")

weather = orig_data.rename(columns = {"m": "month", "d": "day", "Precipitation amount (mm)": "precipitation"})               
# Properties of dataframes can not be changed, like renaming columns for example. Above I created a copy of the original dataframe and renames some columns

weather.head()

Unnamed: 0,Year,month,day,Time,Time zone,precipitation,Snow depth (cm),Air temperature (degC)
0,2017,1,1,00:00,UTC,-1.0,-1.0,0.6
1,2017,1,2,00:00,UTC,4.4,-1.0,-3.9
2,2017,1,3,00:00,UTC,6.6,7.0,-6.5
3,2017,1,4,00:00,UTC,-1.0,13.0,-12.8
4,2017,1,5,00:00,UTC,-1.0,10.0,-17.8


In [5]:
print("The hottest temp in Kumpala, Finland:", weather["Air temperature (degC)"].max(), "C")
print("The coldest temp:", weather["Air temperature (degC)"].min(), "C")

The hottest temp in Kumpala, Finland: 19.6 C
The coldest temp: -17.8 C


In [6]:
weather["Cold?"] = weather["Air temperature (degC)"] < -5           # Creates new entry in dataframe "Cold" 
print(weather.head(), "\n")
print("Number of cold days in the year:", weather["Cold?"].sum())   # Adds the number of "True" in "Cold" entry

   Year  month  day  ... Snow depth (cm) Air temperature (degC)  Cold?
0  2017      1    1  ...            -1.0                    0.6  False
1  2017      1    2  ...            -1.0                   -3.9  False
2  2017      1    3  ...             7.0                   -6.5   True
3  2017      1    4  ...            13.0                  -12.8   True
4  2017      1    5  ...            10.0                  -17.8   True

[5 rows x 9 columns] 

Number of cold days in the year: 12


In [7]:
new_weather = weather[["precipitation", "Snow depth (cm)", "month"]].replace(-1, 0)
# The -1 in precipitation and snow depth corrupts the data in these columns, so we must replace it with something less harmful == 0

new_weather.head()

Unnamed: 0,precipitation,Snow depth (cm),month
0,0.0,0.0,1
1,4.4,0.0,1
2,6.6,7.0,1
3,0.0,13.0,1
4,0.0,10.0,1


In [8]:
group_months = new_weather.groupby("month")
# groupby splits the dataframe into subgroups. Above we split the dataframe based on the 12 months
# It is very useful when comparing data between subgroups within a data frame. For example we can compare the total rain of all months of the year

sum_rain = group_months["precipitation"].sum()   
avg_snow = group_months["Snow depth (cm)"].mean()
# We compare subgroups (months) with the sum/mean of all values of a column

group_months.get_group(5).head()
# get_group allows us to fetch all data of a subgroup. In this case we get all the data from the subgroup month of may 
avg_snow

month
1     6.903226
2     8.178571
3     3.400000
4     0.066667
5     0.000000
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.419355
11    0.120000
12    1.483871
Name: Snow depth (cm), dtype: float64

In [0]:
print("Total rain (mm) by each month", sum_rain, sep = "\n")

Total rain (mm) by each month
month
1      38.9
2      35.0
3      41.7
4      39.9
5      16.1
6      76.3
7      31.2
8      86.1
9      65.2
10    184.5
11    120.2
12    140.6
Name: precipitation, dtype: float64


In [0]:
print("Average snow (cm) by month", avg_snow, sep = "\n")

Average snow (cm) by month
month
1     6.903226
2     8.178571
3     3.400000
4     0.066667
5     0.000000
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.419355
11    0.120000
12    1.483871
Name: Snow depth (cm), dtype: float64


In [0]:
def myfilter(df):                                     # The filter function must return a boolean value
    return df["precipitation"].sum() >= 150

new_weather.groupby("month").filter(myfilter).head()         
# filter function goes through all rows of the dataframe and only returns the rows that returned true from the above function
# or in other words it returns the rows that had more than 150mm total rain per month

Unnamed: 0,precipitation,Snow depth (cm),month
273,0.0,0.0,10
274,6.4,0.0,10
275,21.5,0.0,10
276,12.7,0.0,10
277,0.6,0.0,10


# Creating DataFrames:

I skipped missing values section of pandas but it is easy to learn so search on stackexchange when needed

In [0]:
data = pd.DataFrame([["2500k", 700], ["1800k", 400], ["3600k", 600], ["2500k", 450]], columns = ["Pop", "Area"], index = ["Delhi", "Shangai", "San Francisco", "Berlin"])

print(data, "\n")                      # index = rows
print(data.loc["Berlin"], "\n")        # loc == select using label, iloc == select using index position
print(data.iloc[2])

                 Pop  Area
Delhi          2500k   700
Shangai        1800k   400
San Francisco  3600k   600
Berlin         2500k   450 

Pop     2500k
Area      450
Name: Berlin, dtype: object 

Pop     3600k
Area      600
Name: San Francisco, dtype: object


In [0]:
df = pd.DataFrame({"a": [1, 2, 3], "b": [2.0,4.0,6.0], "c": ["3","6","9"]}, index = ["One", "Two", "Three"]) 

# We can also create new dataframes using a dictionary like above

print("Original dataframe:")
print(df.dtypes)
df

Original dataframe:
a      int64
b    float64
c     object
dtype: object


Unnamed: 0,a,b,c
One,1,2.0,3
Two,2,4.0,6
Three,3,6.0,9


In [0]:
new_df = df.astype({"a": float, "b": int, "c": int})      

# We can change the data type of entire columns using astype like above 

print("Changed data type:")
print(new_df.dtypes)
new_df

Changed data type:
a    float64
b      int64
c      int64
dtype: object


Unnamed: 0,a,b,c
One,1.0,2,3
Two,2.0,4,6
Three,3.0,6,9
