In [17]:
import numpy as np
#vectorization increases  memory performance
#ufuncs: enhnances computational efficiency and support memory_saving operations

import numpy as np
import pandas as pd
from sklearn import datasets
import time

#Define a larger array
large_array = np.random.rand(10**6)

#numpy way
start = time.time()
print("Numpy Sum", np.sum(large_array)) #calculate the sum of the array using numpy vectorization
print("Time taken is ", time.time() - start)

#python way
start1 = time.time()
print("Built-in-list sum", sum(large_array)) #calculate the sum of the array using numpy vectorization
print("Time taken is to calculate the sum in a Python List", time.time() - start1)
california = datasets.fetch_california_housing()
#combine the feature data with target variable
data_with_feature = np.c_[california['data'], california['target']]
df = pd.DataFrame(data=data_with_feature,
                  columns=california['feature_names']+['target'])
print(df.head())
df['AveRooms'] = pd.Categorical(df['AveRooms']) #convert a column of Average rooms to categorical for memory efficient
df['MedInc'] = df['MedInc'].astype('category')
print(df.head())

#using downcasting for memory efficient or changing the type
#Downcast data type of avebedrooms to float
df['AveBedrms'] = pd.to_numeric(df['AveBedrms'], downcast='float')
df['Population'] = df['Population'].astype('int32')
print(df.head())
#using method chaining
df_copy = df[df['Population'] > 1000].copy()
df_copy.dropna(inplace=True)
print(df_copy.head())
#great way with method chaining
df =df[df['Population']> 1000].dropna()
print(df.head())
#calculating for memory usage
def memory_usage_pandas(df1):
    bytes = df1.memory_usage(deep=True).sum()
    return bytes/1024**2 #convert bytes to megabytes
original_memory = memory_usage_pandas(df)

#optimized usage of pandas using categorical
#we will use downcasting for optimization
df['AveBedrms'] = pd.to_numeric(df['AveBedrms'], downcast='float')
df['AveRooms'] = pd.to_numeric(df['AveRooms'], downcast='float')
optimiziedMemory = memory_usage_pandas(df)
print(f'Original memory {original_memory} MB')
print(f'Optimized memory {optimiziedMemory} MB')
print(f'Memory saved {original_memory - optimiziedMemory}')

start_time = time.time()
df = df.astype({'AveBedrooms': 'float32',
                'AveRooms': 'float32',
                'Population': 'int32'
               },  copy=False)
end_time = time.time()

Numpy Sum 500235.46671807347
Time taken is  0.0032274723052978516
Built-in-list sum 500235.46671806514
Time taken is to calculate the sum in a Python List 0.0831460952758789
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3  5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4  3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   

   Longitude  target  
0    -122.23   4.526  
1    -122.22   3.585  
2    -122.24   3.521  
3    -122.25   3.413  
4    -122.25   3.422  
   MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0  8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1  8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2  