In [1]:
import glob
import os

import joblib
import pandas as pd
from sklearn.cluster import MiniBatchKMeans

#mlenv python version 3.9.7

In [None]:
# function which reads all csv files in a sector folder and returns one data frame
def getCSVData(sector):
    """Load every CSV file of one sector into a single coordinate data frame.

    Files are matched with the glob pattern ``data/<sector>/*.csv`` (relative
    to the current working directory). Only the ``Latitude`` and ``Longitude``
    columns are read, and rows missing either value are dropped.

    Parameters
    ----------
    sector : str
        Name of the folder below ``data/`` that holds the sector's CSV files.

    Returns
    -------
    pandas.DataFrame
        Columns ``latitude`` and ``longitude`` (in that order), indexed
        0..n-1 across all files.

    Raises
    ------
    ValueError
        If no CSV file matches the pattern for ``sector``.
    """
    pathname = "data/" + sector + "/*.csv"

    # NOTE: recursive=True only has an effect when the pattern contains '**';
    # with this pattern only files directly inside the sector folder match.
    # The matches are sorted because glob returns them in an OS dependent
    # order, which would make the concatenated row order differ between runs.
    allFiles = sorted(glob.glob(pathname, recursive=True))

    # fail early with a readable message instead of pandas'
    # "No objects to concatenate" further down
    if not allFiles:
        raise ValueError(
            "No CSV files found for sector '" + str(sector)
            + "' (pattern: " + pathname + ")"
        )

    # read only the coordinate columns and drop rows missing either of them
    cleanedData = [
        pd.read_csv(aFile, usecols=['Latitude', 'Longitude'])
          .dropna(subset=['Latitude', 'Longitude'])
        for aFile in allFiles
    ]

    # ignore_index avoids duplicated row labels (every file starts at 0)
    df = pd.concat(cleanedData, ignore_index=True)

    # fix the column order, then rename without mutating a sliced frame in place
    df = df[['Latitude', 'Longitude']].rename(
        columns={'Latitude': 'latitude', 'Longitude': 'longitude'}
    )

    return df

# one record per sector: the folder name below data/ and the number of
# clusters to fit for it (the trailing comment lists the areas covered)
sectors = [
    dict(sector=sector_name, clusters=cluster_count)
    for sector_name, cluster_count in (
        ('Sector1', 500),   # Northumbria, Durham, Cleveland
        ('Sector2', 500),   # Cumbria, Lancashire
        ('Sector3', 500),   # North Yorkshire, West Yorkshire
        ('Sector4', 500),   # Humberside, South Yorkshire
        ('Sector5', 500),   # Merseyside, Cheshire
        ('Sector6', 500),   # Greater Manchester
        ('Sector7', 500),   # Derbyshire, Nottinghamshire, Lincolnshire, Leicestershire, Northamptonshire
        ('Sector8', 500),   # West Mercia, Staffordshire, West Midlands, Warwickshire
        ('Sector9', 500),   # Gloucestershire, Avon & Somerset, Devon & Cornwall
        ('Sector10', 500),  # Wiltshire, Dorset, Hampshire (includes Isle of Wight)
        ('Sector11', 500),  # Thames Valley, Hertfordshire, Bedfordshire
        ('Sector12', 500),  # Cambridgeshire, Norfolk, Suffolk, Essex
        ('Sector13', 500),  # Surrey, Sussex, Kent
        ('Sector14', 500),  # London (including metropolitan)
    )
]

# load the cleaned coordinates of every sector and keep them on its record
for record in sectors:
    record.update(df=getCSVData(record['sector']))


In [3]:
# set parameters for KMeans mini
def setMiniBatchParams(number_clusters, batch_size=3072, random_state=49):
    """Create an unfitted MiniBatchKMeans model.

    Parameters
    ----------
    number_clusters : int
        Passed to MiniBatchKMeans as ``n_clusters``.
    batch_size : int, optional
        Passed to MiniBatchKMeans as ``batch_size``. Defaults to 3072, the
        value this notebook has always used.
    random_state : int, optional
        Passed to MiniBatchKMeans as ``random_state``. Defaults to 49, the
        value this notebook has always used.

    Returns
    -------
    MiniBatchKMeans
        A configured estimator on which ``fit`` has not been called.
    """
    kMini = MiniBatchKMeans(
        batch_size=batch_size,
        n_clusters=number_clusters,
        random_state=random_state,
    )

    # return miniBatchKMeans model
    return kMini
 
# give every sector record its own unfitted model, sized by its 'clusters' value
for record in sectors:
    record.update(kMini_params=setMiniBatchParams(record['clusters']))

In [4]:
# store the latitude/longitude columns of each sector under 'locations'
for record in sectors:
    currentDF = record['df']
    # select the two coordinate columns, latitude first
    record['locations'] = locations = currentDF[['latitude', 'longitude']]

In [5]:
# fit each sector's model on that sector's location data
for record in sectors:
    record.update(kMini_Model=record['kMini_params'].fit(record['locations']))

In [6]:
# save the model to disk
# create the output folder first: joblib.dump does not create missing
# directories, and failing here would throw away all the fitting done above
os.makedirs('kmini_models', exist_ok=True)

for record in sectors:
    # one file per sector, e.g. kmini_models/KMini_Sector1.sav
    filename = 'kmini_models/KMini_' + record['sector'] + '.sav'
    joblib.dump(record['kMini_Model'], filename)