# ***PART 1***

### Data Exploration with *`Pandas`*

In this part we load the California housing dataset with `pandas` and try out some of the library's built-in functions on it.

## **Load** the Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the California housing training data from a CSV in the current working directory.
df=pd.read_csv("california_housing_train.csv")

In [3]:
# Quick sanity check: show the first 5 and the last 10 rows of the frame.
print("First 5:\n",df.head(5),"\n\n\nLast 10:\n",df.tail(10))

First 5:
    longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -114.31     34.19                15.0       5612.0          1283.0   
1    -114.47     34.40                19.0       7650.0          1901.0   
2    -114.56     33.69                17.0        720.0           174.0   
3    -114.57     33.64                14.0       1501.0           337.0   
4    -114.57     33.57                20.0       1454.0           326.0   

   population  households  median_income  median_house_value  
0      1015.0       472.0         1.4936             66900.0  
1      1129.0       463.0         1.8200             80100.0  
2       333.0       117.0         1.6509             85700.0  
3       515.0       226.0         3.1917             73400.0  
4       624.0       262.0         1.9250             65500.0   


Last 10:
        longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
16990    -124.22     41.73                28.0       3003.0          

## Data **Summary**

In [4]:
df.describe()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
count,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0,17000.0
mean,-119.562108,35.625225,28.589353,2643.664412,539.410824,1429.573941,501.221941,3.883578,207300.912353
std,2.005166,2.13734,12.586937,2179.947071,421.499452,1147.852959,384.520841,1.908157,115983.764387
min,-124.35,32.54,1.0,2.0,1.0,3.0,1.0,0.4999,14999.0
25%,-121.79,33.93,18.0,1462.0,297.0,790.0,282.0,2.566375,119400.0
50%,-118.49,34.25,29.0,2127.0,434.0,1167.0,409.0,3.5446,180400.0
75%,-118.0,37.72,37.0,3151.25,648.25,1721.0,605.25,4.767,265000.0
max,-114.31,41.95,52.0,37937.0,6445.0,35682.0,6082.0,15.0001,500001.0


In [5]:
# Frequency of each distinct total_bedrooms value, most common first; dropna=True excludes NaN.
df["total_bedrooms"].value_counts(dropna=True)

total_bedrooms
280.0     48
309.0     44
345.0     43
331.0     43
343.0     43
          ..
1821.0     1
4171.0     1
1508.0     1
1647.0     1
4335.0     1
Name: count, Length: 1848, dtype: int64

In [6]:
# Number of distinct non-null total_bedrooms values. nunique() skips NaN by default,
# so it matches len(value_counts(dropna=True)) without building the counts Series.
df["total_bedrooms"].nunique()

1848

## Data **Transformation**

In [7]:
# Derived feature: bedrooms as a fraction of all rooms, computed row by row.
df["total_bedrooms_per_total_rooms"] = df["total_bedrooms"].div(df["total_rooms"])

In [8]:
df.loc[:,"total_bedrooms_per_total_rooms"].mean()

np.float64(0.21275998821034803)

In [9]:
df.loc[:,"total_bedrooms_per_total_rooms"].std()

np.float64(0.05782300890316167)

## Data **Filtering**

In [10]:
# Keep only the rows whose median_income is greater than 5 (all columns retained).
df[df["median_income"] > 5]

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,total_bedrooms_per_total_rooms
32,-115.39,32.76,16.0,1136.0,196.0,481.0,185.0,6.2558,146300.0,0.172535
69,-115.55,32.98,33.0,2266.0,365.0,952.0,360.0,5.4349,143000.0,0.161077
92,-115.57,32.78,20.0,1534.0,235.0,871.0,222.0,6.2715,97200.0,0.153194
98,-115.58,32.78,5.0,2494.0,414.0,1416.0,421.0,5.7843,110100.0,0.165998
100,-115.59,32.79,8.0,2183.0,307.0,1000.0,287.0,6.3814,159900.0,0.140632
...,...,...,...,...,...,...,...,...,...,...
16737,-122.89,38.38,16.0,2017.0,369.0,931.0,336.0,5.7664,267500.0,0.182945
16762,-122.94,38.57,33.0,1530.0,266.0,728.0,250.0,5.1005,266700.0,0.173856
16776,-123.00,38.33,8.0,3223.0,637.0,851.0,418.0,5.6445,364800.0,0.197642
16802,-123.17,39.18,14.0,2240.0,327.0,1030.0,308.0,5.9585,214900.0,0.145982


In [11]:
# Rows with more than 10000 total rooms AND a median house value below 150000.
many_rooms = df["total_rooms"] > 10000
low_value = df["median_house_value"] < 150000
subset = df[many_rooms & low_value]
print(subset)

       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
133      -116.06     34.15                15.0      10377.0          2331.0   
191      -116.36     33.88                11.0      12557.0          3098.0   
198      -116.38     33.73                10.0      11836.0          2405.0   
201      -116.39     33.82                15.0      11115.0          2257.0   
229      -116.47     33.81                 7.0      10105.0          2481.0   
...          ...       ...                 ...          ...             ...   
12654    -121.75     38.67                 9.0      12139.0          2640.0   
12772    -121.79     36.64                11.0      32627.0          6445.0   
13388    -121.94     38.27                35.0      10869.0          2226.0   
13707    -122.00     38.25                 7.0      11768.0          1893.0   
15436    -122.31     40.55                11.0      13714.0          2302.0   

       population  households  median_income  media

## **Export** Modified Data

In [12]:
# Save the frame (including the derived ratio column) to a new CSV; index=False omits the row index.
df.to_csv("california_housing_train_modified.csv",index=False)

# ***PART 2***

### Numerical Analysis with *`NumPy`*
In this part we try out various functions on `numpy` arrays.

## Array Creation and Indexing

In [13]:
# 1-D array of the integers 1 through 20 (the stop value 21 is exclusive).
integers = np.arange(1,21)

In [14]:
integers

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

In [15]:
# Start at index 1 and take every second element, which selects the even values 2..20.
even_no=integers[1: : 2]

In [16]:
even_no

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18, 20])

In [17]:
# 5x4 array of random integers in [10, 100] (randint's upper bound 101 is exclusive).
# NOTE: no seed is set in the cells shown, so the values change on every run.
rand_array=np.random.randint(10,101,(5,4))

In [18]:
rand_array

array([[100,  98,  12, 100],
       [ 42,  56,  43,  43],
       [ 73,  60,  76,  71],
       [ 82,  66,  92,  62],
       [ 49,  32,  81,  50]], dtype=int32)

In [19]:
rand_array[:3,:2]

array([[100,  98],
       [ 42,  56],
       [ 73,  60]], dtype=int32)

## Array Manipulation

In [20]:
# Rebind to a 4x5 reshaped array instead of calling ndarray.resize, which mutates the
# array in place and can raise when other arrays (e.g. slices taken earlier) reference
# its buffer and the size changes. reshape also fails loudly on a size mismatch,
# whereas resize would silently truncate or zero-pad the data.
integers = integers.reshape(4, 5)

In [21]:
integers

array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20]])

In [22]:
integers=integers.flatten()

In [23]:
integers

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20])

In [24]:
# 4x5 random integer matrix in [10, 100], shaped so it can be multiplied with the 5x4 rand_array.
rand_array2=np.random.randint(10,101,(4,5))

In [25]:
# Matrix product of the 5x4 and 4x5 arrays (5x5 result); @ is the operator form of np.matmul.
rand_array @ rand_array2

array([[23334, 15382, 23776, 17412, 13404],
       [12081,  8021, 12328, 11307,  8654],
       [18162, 13211, 18132, 16056, 13126],
       [19090, 13680, 18808, 17884, 14832],
       [12737,  9960, 12564, 12053, 10180]], dtype=int32)

## Boolean and Fancy Indexing

In [26]:
# 50 random integers in [0, 100]; the bare name on the last line displays the array.
array50=np.random.randint(0,101,(50,))
array50

array([ 53,  36,   4,   6,  40,  72,  88,  31,  22,  88,  60,  43,  80,
        91,  27,  64,  39,  62,  94,  61,  32,  94,  83,  32, 100,  16,
        58,  10,  83,  52,  32,  43,  75,  20,  68,  15,  36,   7,   8,
        14,  56,  81,  77,  41,  82,  68,  46,  83,  30,  48], dtype=int32)

In [27]:
array50[array50>50]

array([ 53,  72,  88,  88,  60,  80,  91,  64,  62,  94,  61,  94,  83,
       100,  58,  83,  52,  75,  68,  56,  81,  77,  82,  68,  83],
      dtype=int32)

In [28]:
# Fancy indexing with an explicit index array: positions 2, 8, 14, ... (step 6) below 50.
array50[np.arange(2, 50, 6)]

array([ 4, 22, 27, 32, 58, 75,  8, 82], dtype=int32)

In [29]:
array50[[1,5,8,27,39,46]]

array([36, 72, 22, 10, 14, 46], dtype=int32)

## Statistical Operations

In [30]:
rand_array

array([[100,  98,  12, 100],
       [ 42,  56,  43,  43],
       [ 73,  60,  76,  71],
       [ 82,  66,  92,  62],
       [ 49,  32,  81,  50]], dtype=int32)

In [31]:
# Column-wise means: axis=0 collapses the rows, leaving one mean per column.
mean_values = np.mean(rand_array, axis=0)
print(mean_values)

[69.2 62.4 60.8 65.2]


In [32]:
rand_array.max(axis=0)

array([100,  98,  92, 100], dtype=int32)

In [33]:
rand_array.sum(axis=0)

array([346, 312, 304, 326])

In [34]:
# Broadcasting compares every element with the mean of its own column, giving a boolean mask
# with the same shape as rand_array.
mask = rand_array > mean_values
print(mask)

[[ True  True False  True]
 [False False False False]
 [ True False  True  True]
 [ True  True  True False]
 [False False  True False]]


In [35]:
# Boolean-mask indexing returns the selected elements as a flat 1-D array in row-major order.
filtered_data=rand_array[mask]

In [36]:
filtered_data

array([100,  98, 100,  73,  76,  71,  82,  66,  92,  81], dtype=int32)

# ***PART 3***

### Problem Solving with `Python` Basics
This part uses only basic `Python` features: loops, conditionals, variable-length arguments, and error handling.

## Loops and Conditional Statements

In [37]:
# Print the Fibonacci numbers below 100 (1, 1, 2, 3, 5, ...).
a, b = 1, 0
while a < 100:
    print(a)
    # Advance the pair in one step: the next term is the sum of the last two.
    a, b = a + b, a

1
1
2
3
5
8
13
21
34
55
89


In [38]:
# Walk 1..50 and print a label only for multiples of 3 and/or 5; other numbers print nothing.
for i in range(1, 51):
    label = ("Fizz" if i % 3 == 0 else "") + ("Buzz" if i % 5 == 0 else "")
    if label:
        print(label)

Fizz
Buzz
Fizz
Fizz
Buzz
Fizz
FizzBuzz
Fizz
Buzz
Fizz
Fizz
Buzz
Fizz
FizzBuzz
Fizz
Buzz
Fizz
Fizz
Buzz
Fizz
FizzBuzz
Fizz
Buzz


## Variable-Length Arguments

In [39]:
def med(*values):
    """Return the median of the values passed as positional arguments.

    With an odd number of values the middle element of the sorted inputs is
    returned unchanged. With an even number, the two middle elements are
    averaged using true division (so int/float inputs yield a float).
    """
    ordered = sorted(values)
    mid, is_odd = divmod(len(ordered), 2)
    if is_odd:
        return ordered[mid]
    return (ordered[mid - 1] + ordered[mid]) / 2

In [40]:
med(1,9,8,2,3,4,5)

4

In [41]:
med(1,9,8,2,3,4,5,0)

3.5

In [42]:
def mean(*values):
    """Return the arithmetic mean of the values passed as positional arguments.

    Args:
        *values: One or more numeric values.

    Returns:
        The sum of the values divided by how many were passed (true division).

    Raises:
        ZeroDivisionError: If called with no arguments.
    """
    # Use the built-in sum() rather than a local accumulator named ``sum``,
    # which shadowed the built-in inside this function.
    return sum(values) / len(values)

In [43]:
mean(1,9,8,2,3,4,5)

4.571428571428571

## Error Handling

In [44]:
def divide_numbers(num1, num2):
    """Divide ``num1`` by ``num2``.

    Returns the quotient. If ``num2`` is zero, an error message is printed
    and ``None`` is returned instead of raising.
    """
    try:
        return num1 / num2
    except ZeroDivisionError:
        print("Error: Division by zero is not allowed.")
        return None

In [45]:
divide_numbers(15,3)

5.0

In [46]:
divide_numbers(4,0)

Error: Division by zero is not allowed.


# ***PART 4***

### Bonus Challenge
Working with 3D `numpy` arrays: swapping axes, summing along each axis, and broadcasting.

## Advanced *`NumPy`* Manipulation

#### Create a 3D NumPy array

In [47]:
# 3x4x5 array of random integers in [1, 10] (randint's upper bound 11 is exclusive).
array3d=np.random.randint(1,11,(3,4,5))

In [48]:
array3d

array([[[ 7,  1,  4,  1, 10],
        [ 7,  3,  8,  1,  4],
        [10,  8,  9, 10,  2],
        [ 1,  4,  6,  8,  9]],

       [[ 6,  9,  6,  8,  4],
        [ 7, 10,  3,  6,  6],
        [ 7,  6,  8,  8,  6],
        [ 9,  8,  4,  7,  5]],

       [[ 6,  6,  5,  1, 10],
        [10,  7,  3,  5,  3],
        [ 7,  6,  9,  8,  3],
        [ 1,  6,  4,  6,  3]]], dtype=int32)

#### Swap axes 1 and 2

In [49]:
# Exchange axes 1 and 2 (axis 0 stays first), turning the 3x4x5 array into 3x5x4.
array3d = array3d.transpose(0, 2, 1)

In [50]:
array3d

array([[[ 7,  7, 10,  1],
        [ 1,  3,  8,  4],
        [ 4,  8,  9,  6],
        [ 1,  1, 10,  8],
        [10,  4,  2,  9]],

       [[ 6,  7,  7,  9],
        [ 9, 10,  6,  8],
        [ 6,  3,  8,  4],
        [ 8,  6,  8,  7],
        [ 4,  6,  6,  5]],

       [[ 6, 10,  7,  1],
        [ 6,  7,  6,  6],
        [ 5,  3,  9,  4],
        [ 1,  5,  8,  6],
        [10,  3,  3,  3]]], dtype=int32)

#### Sum along different axes

In [51]:
array3d.sum(axis=0)

array([[19, 24, 24, 11],
       [16, 20, 20, 18],
       [15, 14, 26, 14],
       [10, 12, 26, 21],
       [24, 13, 11, 17]])

In [52]:
array3d.sum(axis=1)

array([[23, 23, 39, 28],
       [33, 32, 35, 33],
       [28, 28, 33, 20]])

In [53]:
array3d.sum(axis=2)

array([[25, 16, 27, 20, 25],
       [29, 33, 21, 29, 21],
       [24, 25, 21, 20, 19]])

#### Generate a random vector and use broadcasting to add it to each row of every 2D slice of the 3D array

In [54]:
# Length-4 vector of random integers in [1, 10]; its length matches the last axis of
# array3d (shape 3x5x4 after the axis swap), which is what broadcasting requires.
vector=np.random.randint(1,11,(4,))

In [55]:
vector

array([6, 9, 2, 6], dtype=int32)

In [56]:
# Broadcasting adds vector to every length-4 row along the last axis of array3d.
# NOTE: += modifies array3d in place, so re-running this cell adds the vector again.
array3d+=vector

In [57]:
array3d

array([[[13, 16, 12,  7],
        [ 7, 12, 10, 10],
        [10, 17, 11, 12],
        [ 7, 10, 12, 14],
        [16, 13,  4, 15]],

       [[12, 16,  9, 15],
        [15, 19,  8, 14],
        [12, 12, 10, 10],
        [14, 15, 10, 13],
        [10, 15,  8, 11]],

       [[12, 19,  9,  7],
        [12, 16,  8, 12],
        [11, 12, 11, 10],
        [ 7, 14, 10, 12],
        [16, 12,  5,  9]]], dtype=int32)