# Metasub data mGPS algorithm - 31/03/2025

Here I am just trying to get the data pre-processing steps right. The idea for doing this is to get the dataset in the right format for easier analysis using neural networks.

## Pre-processing the data

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Import datasets
os.chdir("/home/inf-21-2024/binp37/")
# Read the metadata for the metasub data.
complete_meta = pd.read_csv("./data/metasub/complete_metadata.csv")
taxa_abund = pd.read_csv("./data/metasub/metasub_taxa_abundance.csv")
taxa_abund = taxa_abund.drop_duplicates(subset=['uuid'])

In [3]:
# Merge the bacterial and metadata
metasub_data = pd.merge(complete_meta,taxa_abund,on='uuid')
metasub_data.shape

(4288, 3711)

In [4]:
# Remove control samples
control_cities = {'control','other_control','neg_control','other','pos_control'}
control_types = {'ctrl cities','negative_control','positive_control'}

mask = metasub_data['city'].isin(control_cities) | metasub_data['control_type'].isin(control_types)
metasub_data = metasub_data[~mask].copy()
metasub_data.shape

(4157, 3711)

In [5]:
#Re-label london boroughs
metasub_data.loc[metasub_data['city'].isin(['kensington','islington']),'city'] = 'london'
metasub_data.shape

(4157, 3711)

In [6]:
# Remove sparse sample locations and doubtful samples
city_counts = metasub_data['city'].value_counts()
small_cities = city_counts[city_counts<8].index.tolist()
remove_samples = metasub_data['city'].isin(['antarctica']+small_cities)
metasub_data = metasub_data[~remove_samples]
metasub_data.shape

(4070, 3711)

In [7]:
# Correct the identified mislabeling of data
kyiv_filter = metasub_data['city'] == 'kyiv'
metasub_data.loc[kyiv_filter,'latitude'] = metasub_data.loc[kyiv_filter,'city_latitude'] # Set all the latitude to the city_latitude
metasub_data.loc[kyiv_filter,'longitude'] = metasub_data.loc[kyiv_filter,'city_longitude'] # Set all the latitude to the city_longitutde

porto_filter = metasub_data['city'] == 'porto'
metasub_data.loc[porto_filter,'city'] = "europe"
metasub_data.shape

(4070, 3711)

In [8]:
# Fill missing latitude and longitude values with city-level data
missing_lat = metasub_data["latitude"].isna()
missing_lon = metasub_data["longitude"].isna()
metasub_data.loc[missing_lat, "latitude"] = metasub_data.loc[missing_lat, "city_latitude"]
metasub_data.loc[missing_lon, "longitude"] = metasub_data.loc[missing_lon, "city_longitude"]
metasub_data.shape

(4070, 3711)

In [9]:
# Correction for incorrect London co-ordinates
london_filter = metasub_data['city'] == 'london'
metasub_data.loc[london_filter,'city_latitude'] = 51.50853
metasub_data.loc[london_filter,'city_longitude'] = -0.12574
metasub_data.shape

(4070, 3711)

In [10]:
metasub_data.iloc[:,:42].columns

Index(['uuid', 'metasub_name', 'core_project', 'project', 'city', 'city_code',
       'latitude', 'longitude', 'surface_material', 'control_type',
       'elevation', 'line', 'station', 'surface', 'temperature', 'traffic',
       'setting', 'num_reads', 'library_post_PCR_Qubit',
       'library_QC_concentration', 'city_latitude', 'city_longitude',
       'coastal_city', 'city_total_population', 'city_population_density',
       'city_land_area_km2', 'city_ave_june_temp_c', 'city_elevation_meters',
       'continent', 'city_koppen_climate', 'barcode', 'ha_id',
       'hudson_alpha_flowcell', 'hudson_alpha_project', 'index_sequence',
       'location_type', 'hudson_alpha_uid', 'other_project_uid',
       'plate_number', 'plate_pos', 'sample_type', 'sl_name'],
      dtype='object')

## Recursive Feature Elimination - 02/04/2025

Here we use RFE as a part of a pipeline to get the best set of parameters for fitting the deep learning model. We need a suitable set of parameters for that are infromative enough to apply a deep learning model. These are called as geographically informative taxa (GITs).

In [11]:
# Importing libraries
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from xgboost import XGBClassifier
from joblib import Parallel, delayed
from sklearn.pipeline import make_pipeline
import multiprocessing
import time
import ray
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold

  from .autonotebook import tqdm as notebook_tqdm
2025-04-05 10:59:45,640	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


*Explanation of the Code*

This function is designed to perform feature selection by:

1. **Removing Highly Correlated Features:**
   - Computes a correlation matrix of the features.
   - Identifies features with correlation greater than 0.98 and removes them.

2. **Recursive Feature Elimination (RFE) for Feature Selection:**
   - Uses a Random Forest Classifier to determine feature importance.
   - Applies RFE (Recursive Feature Elimination) to iteratively remove the least important features.
   - The number of features to keep is determined based on predefined subset sizes.

3. **Parallel Processing Support:**  
   Uses multiple CPU cores when specified for efficiency.


In [None]:
# Feature selection algorithm
def species_select(X,y,remove_correlated=True,subsets=None,cores=1):
    """
    Feature selection algorithm for species classification.

    Parameters:
    X (pd.DataFrame): Feature matrix
    y (pd.Series): Target variable
    remove_correlated (bool): Whether we need to remoce highly correlated variables. (default is set to True)
    subsets (list): List of feature subset sizes to evaluate. If None, it is determined automatically.
    cores (int): Number of CPU cores to use for parallel computation. (default is set to 1)
    
    Returns:
    RFE object: Trained Recursive Feature Elimination (RFE) model.
    """

    start_time = time.time()  # Track execution time

    # Set parallel processing
    num_cores = multiprocessing.cpu_count() if cores > 1 else 1
    print(f"Using {num_cores} CPU cores for computation.")

    if remove_correlated:
        # Compute correlation matrix
        print("Calculating correlation matrix...")
        corr_matrix = X.corr()
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Identify correlated features (above 0.98)
        correlated_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.98)]

        # Drop correlated features
        X = X.drop(columns=correlated_features)
        print(f"Correlated feature removed: {len(correlated_features)}")

    # Determine default subset sizes if not provided
    num_features = X.shape[1]
    if subsets is None:
        subsets = [num_features // 2, num_features // 4, num_features // 8, num_features // 16, num_features // 32, num_features // 64]
        subsets = [s for s in subsets  if s > 0] # Remove non-positive values

    print(f"Feature selection subsets: {subsets}")

    # Define model (Random Forrest for fearure ranking)
    model = RandomForestClassifier(n_jobs=num_cores, random_state=123)
    print("Initialized RandomForestClassifier.")

    # Recursive Feature Elimination (RFE)
    for subset in subsets:
        print(f"\nStarting RFE with {subset} features....")
        start_rfe = time.time()
        rfe = RFE(estimator=model, n_features_to_select=min(subsets),step=20)
        # Fit RFE to the data
        rfe.fit(X,y)
        print(f"Completed RFE with {subset} features in {time.time() - start_rfe:.2f} seconds.")

    total_time = time.time() - start_time
    print(f"\nFeature selection completed in {total_time:.2f} seconds.")
    return rfe

In [None]:
featureElim = species_select(X=metasub_data.iloc[:,42:500],
                             y=metasub_data['city'],
                             remove_correlated=False,
                             subsets=[50,100,200,500],
                             cores=12)

### What You Can Do If It's Still Too Slow

1. **Reduce Feature Set Before Running RFE:**
    - If remove_correlated=False, you still have 3711 features.
    - Try using only top 500-1000 features based on variance or importance.

2. **Increase Step Size in RFE:**
    - Default step=1 removes one feature per iteration, which is slow.
    - Try step=5 or step=10 to remove multiple features at once:
    - rfe = RFE(estimatoe=model, n_features_to_select=subset, step=5)
3. **Use XGBoost Instead of Random Forest:**
    - model = XGBClassifier(n_jobs=num_cores, random_state=123)
4. **Sequential Feature Selector:**
    - from sklearn.feature_selection import SequentialFeatureSelector
    sfs = SequentialFeatureSelector(model, n_features_to_select=100, direction='backward', n_jobs=num_cores)
    sfs.fit(X, y)


In [None]:
# Initialize Ray
ray.shutdown()
ray.init(ignore_reinit_error=True,num_cpus=100)

model = RandomForestClassifier(n_jobs=1,random_state=123)
parameters = {"rfe__n_features_to_select":[50,100,200,300,500,1500]}
cv = 10
n_features_options = parameters['rfe__n_features_to_select']
total_iterations = len(n_features_options) * cv

print(f"\nStarting RFE with subsets of features...")

# Define remote function for parallel excecution
@ray.remote
def evaluate_rfe(n_features, fold, X, y):
    """Perfroms RFE feature selection and evaluates performance for a given fold."""
    pipe = make_pipeline(RFE(estimator=model, n_features_to_select=n_features,step=10))

    # We use the stratified K fold to split the data into training and validation sets
    skf = StratifiedKFold(n_splits=cv,shuffle=True, random_state=fold)
    train_index, test_index = list(skf.split(X,y))[fold]

    # train_index and test_index contain the index values for extracting training and testing data from X and y variables.
    X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fit the model using the training data and the evaluate the score based on the testing data
    pipe.fit(X_train, y_train)
    score = pipe.score(X_test, y_test)

    return n_features, fold, score

start_time = time.time()
tasks = [evaluate_rfe.remote(n_features,fold, X,y) for n_features in n_features_options for fold in range(cv)]

results = []
with tqdm(total=total_iterations, desc='Parallel RFE + Cross-validation') as pbar:
    while tasks:
        done, tasks = ray.wait(tasks, num_returns=1)
        result = ray.get(done[0])
        results.append((result[0],result[2])) # (n_features, score)
        pbar.update(1)

# Aggregate mean accuracy for each feature subset
results_df = pd.DataFrame(results, columns=["n_features", "accuracy"])
results_df = results_df.groupby("n_features").mean().reset_index()

# Find best feature subset
best_n_features, best_score = results_df.loc[results_df["accuracy"].idxmax()].values

end_time = time.time()
elapsed_time = end_time - start_time

print(f'Best params: {{"rfe__n_features_to_select": {int(best_n_features)}}}')
print(f'Best accuracy: {best_score:.6f}')
print(f'Total time taken: {elapsed_time:.2f} seconds')

ray.shutdown()



2025-04-03 09:17:52,877	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m



Starting RFE with subsets of features...


Parallel RFE + Cross-validation: 100%|██████████| 60/60 [1:15:27<00:00, 75.46s/it]   


Best params: {"rfe__n_features_to_select": 200}
Best accuracy: 0.891155
Total time taken: 4531.09 seconds


In [12]:
def parallel_rfe_feature_selection(X: pd.DataFrame, y: pd.Series, n_jobs: int = 1, random_state: int = 123,
                                   cv: int = 10, subsets: list = None, remove_correlated: bool = True,
                                   correlation_threshold: float = 0.98, num_cpus: int = None):
    """
    Performs parallel Recursive Feature Elimination (RFE) with cross-validation to select the best feature subset.

    Args:
        X (pd.DataFrame): DataFrame of features.
        y (pd.Series): Series of the target variable.
        n_jobs (int): Number of jobs for the base estimator (RandomForestClassifier).
        random_state (int): Random state for reproducibility.
        cv (int): Number of cross-validation folds.
        subsets (list, optional): List of feature subset sizes to evaluate. If None, default subsets are used. Defaults to None.
        remove_correlated (bool, optional): Whether to remove highly correlated features before RFE. Defaults to True.
        correlation_threshold (float, optional): Threshold for identifying highly correlated features. Defaults to 0.98.
        num_cpus (int, optional): Number of CPUs to use for Ray. If None, Ray will auto-detect. Defaults to None.

    Returns:
        tuple: A tuple containing:
            - best_params (dict): Dictionary with the best RFE parameters.
            - best_accuracy (float): The best mean cross-validation accuracy achieved.
            - results_df (pd.DataFrame): DataFrame containing the mean accuracy for each feature subset size.
            - elapsed_time (float): Total time taken for the feature selection process.
    """
    if ray.is_initialized():
        ray.shutdown()
    ray.init(ignore_reinit_error=True, num_cpus=num_cpus)

    model = RandomForestClassifier(n_jobs=n_jobs, random_state=random_state)

    if remove_correlated:
        # Compute correlation matrix
        print("Calculating correlation matrix...")
        corr_matrix = X.corr()
        upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

        # Identify correlated features (above threshold)
        correlated_features = [column for column in upper_tri.columns if any(upper_tri[column] > correlation_threshold)]

        # Drop correlated features
        X = X.drop(columns=correlated_features)
        print(f"Correlated features removed: {len(correlated_features)}")

    # Determine default subset sizes if not provided
    num_features = X.shape[1]
    if subsets is None:
        subsets = [num_features // 2, num_features // 4, num_features // 8, num_features // 16, num_features // 32, num_features // 64]
        subsets = [s for s in subsets if s > 0]  # Remove non-positive values

    n_features_options = sorted(list(set(subsets))) # Ensure unique and sorted subset sizes
    total_iterations = len(n_features_options) * cv

    print(f"\nStarting RFE with subsets of features: {n_features_options}")

    # Define remote function for parallel execution
    @ray.remote
    def evaluate_rfe_remote(n_features, fold, X_remote, y_remote):
        """Performs RFE feature selection and evaluates performance for a given fold."""
        pipe = make_pipeline(RFE(estimator=model, n_features_to_select=n_features, step=10))

        # We use the stratified K fold to split the data into training and validation sets
        skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=fold)
        train_index, test_index = list(skf.split(X_remote, y_remote))[fold]

        # train_index and test_index contain the index values for extracting training and testing data
        X_train = X_remote.iloc[train_index, :]
        X_test = X_remote.iloc[test_index, :]
        y_train = y_remote.iloc[train_index]
        y_test = y_remote.iloc[test_index]

        # Fit the model using the training data and then evaluate the score based on the testing data
        pipe.fit(X_train, y_train)
        score = pipe.score(X_test, y_test)

        return n_features, fold, score

    start_time = time.time()
    X_ray = ray.put(X)
    y_ray = ray.put(y)
    tasks = [evaluate_rfe_remote.remote(n_features, fold, X_ray, y_ray)
             for n_features in n_features_options for fold in range(cv)]

    results = []
    with tqdm(total=total_iterations, desc='Parallel RFE + Cross-validation') as pbar:
        while tasks:
            done, tasks = ray.wait(tasks, num_returns=1)
            result = ray.get(done[0])
            results.append((result[0], result[2]))  # (n_features, score)
            pbar.update(1)

    # Aggregate mean accuracy for each feature subset
    results_df = pd.DataFrame(results, columns=["n_features", "accuracy"])
    results_df = results_df.groupby("n_features").mean().reset_index()

    # Find best feature subset
    best_row = results_df.loc[results_df["accuracy"].idxmax()]
    best_n_features = int(best_row["n_features"])
    best_accuracy = best_row["accuracy"]

    end_time = time.time()
    elapsed_time = end_time - start_time

    best_params = {"rfe__n_features_to_select": best_n_features}

    ray.shutdown()

    return best_params, best_accuracy, results_df, elapsed_time

X = metasub_data.iloc[:,42:400]
y= metasub_data['city']

best_parameters, best_score, all_results, time_taken = parallel_rfe_feature_selection(
        X=X,
        y=y,
        n_jobs=-1,  # Use all available cores for RandomForest within each Ray task
        random_state=123,
        cv=5,
        subsets=[50, 100, 200, 300, 500],
        remove_correlated=True,
        correlation_threshold=0.95,
        num_cpus=50  # Limit Ray to 4 CPUs for this example
    )

print(f'\nBest params: {best_parameters}')
print(f'Best accuracy: {best_score:.6f}')
print(f'Mean accuracy for all tested feature subsets:\n{all_results}')
print(f'Total time taken: {time_taken:.2f} seconds')

2025-04-05 11:00:26,661	INFO worker.py:1777 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


Calculating correlation matrix...
Correlated features removed: 34

Starting RFE with subsets of features: [50, 100, 200, 300, 500]


Parallel RFE + Cross-validation: 100%|██████████| 25/25 [00:35<00:00,  1.43s/it]



Best params: {'rfe__n_features_to_select': 50}
Best accuracy: 0.749631
Mean accuracy for all tested feature subsets:
   n_features  accuracy
0          50  0.749631
1         100  0.742015
2         200  0.733661
3         300  0.732187
4         500  0.735135
Total time taken: 35.92 seconds


In [13]:
# Since I have lost the model, but I know that 200 features gives the best accuracy of 0.89. 
X = metasub_data.iloc[:,42:]
y = metasub_data['city']
model_200 = RandomForestClassifier(n_jobs=24,random_state=123)
rfe = RFE(estimator=model_200,n_features_to_select=200,step=20)
rfe.fit(X,y)

selected_features = X.columns[rfe.support_]
print(selected_features)

print(len(rfe.support_))
# All the accuracy results from the previous runs
#results_df = pd.DataFrame(results,columns=['n_vars','accuracy'])
#results_df.to_csv('mgps_git_taxa.csv')

Index(['Acidovorax ebreus', 'Acidovorax sp. JS42', 'Acidovorax sp. KKS102',
       'Acinetobacter baumannii', 'Acinetobacter haemolyticus',
       'Acinetobacter johnsonii', 'Acinetobacter junii',
       'Acinetobacter pittii', 'Acinetobacter schindleri',
       'Acinetobacter sp. LoGeW2-3',
       ...
       'Thermothelomyces thermophila', 'Thielavia terrestris',
       'Truepera radiovictrix', 'Tsukamurella sp. MH1',
       'Variovorax boronicumulans', 'Variovorax paradoxus',
       'Variovorax sp. PAMC 28711', 'Veillonella parvula', 'Weissella cibaria',
       'Xanthomonas campestris'],
      dtype='object', length=200)
3669


## Initial Neural Networks Tests - 03/04/2025

In [14]:
import torch
import torch.nn as nn
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

### Data Preparation

In [15]:
nn_data = pd.concat([metasub_data[selected_features],metasub_data[['continent','city','latitude','longitude']]],axis=1)
nn_data.head()

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Variovorax boronicumulans,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris,continent,city,latitude,longitude
0,0.0,0.0,0.00023,0.00015,0.0,6e-05,1e-05,7e-05,0.0001,5e-05,...,0.00031,0.00075,0.00021,0.0,0.0,0.0048,oceania,hamilton,-37.78333,175.28333
1,0.0,1e-05,3e-05,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.0014,...,0.00013,0.00024,3e-05,0.0,0.0,0.00091,oceania,hamilton,-37.78333,175.28333
2,3e-05,0.0,0.00011,0.00181,0.0006,0.00274,0.0003,0.0011,0.00191,0.00132,...,0.0001,0.00025,1e-05,0.0,0.0,0.00208,oceania,hamilton,-37.78333,175.28333
3,0.0,0.0,0.0,2e-05,1e-05,3e-05,0.0,0.0,3e-05,1e-05,...,3e-05,2e-05,0.0,0.0,0.0,0.00137,oceania,hamilton,-37.78333,175.28333
4,0.0,0.0,0.0,3e-05,0.0,0.0,0.0,2e-05,9e-05,1e-05,...,4e-05,8e-05,3e-05,0.0,0.0,0.00397,oceania,hamilton,-37.78333,175.28333


#### Convert the city and continent names into vectors

In [16]:
print(f"The unique cities in the metasub dataset are {len(list(nn_data['city'].unique()))}.")
print(f"The unique continents in the metasub dataset are {list(nn_data['continent'].unique())}")

The unique cities in the metasub dataset are 40.
The unique continents in the metasub dataset are ['oceania', 'south_america', 'east_asia', 'sub_saharan_africa', 'middle_east', 'north_america', 'europe']


In [17]:
# Initialize label and scalers
le = LabelEncoder()
stdscaler = StandardScaler() # I can try MinMaxScaler as well
# Convert all the categorical variables into numbers
nn_data['city_encoding'] = nn_data[['city']].apply(le.fit_transform)
nn_data['continent_encoding'] = nn_data[['continent']].apply(le.fit_transform)
nn_data['lat_scaled'] = stdscaler.fit_transform(nn_data[['latitude']])
nn_data['long_scaled'] = stdscaler.fit_transform(nn_data[['longitude']])
# Store all the new scaled and encoded data in a new dataframe
encoded_nn_data = nn_data.drop(columns=['city','continent','latitude','longitude'],axis=1)
encoded_nn_data.head()

Unnamed: 0,Acidovorax ebreus,Acidovorax sp. JS42,Acidovorax sp. KKS102,Acinetobacter baumannii,Acinetobacter haemolyticus,Acinetobacter johnsonii,Acinetobacter junii,Acinetobacter pittii,Acinetobacter schindleri,Acinetobacter sp. LoGeW2-3,...,Variovorax boronicumulans,Variovorax paradoxus,Variovorax sp. PAMC 28711,Veillonella parvula,Weissella cibaria,Xanthomonas campestris,city_encoding,continent_encoding,lat_scaled,long_scaled
0,0.0,0.0,0.00023,0.00015,0.0,6e-05,1e-05,7e-05,0.0001,5e-05,...,0.00031,0.00075,0.00021,0.0,0.0,0.0048,10,4,-3.548641,1.899948
1,0.0,1e-05,3e-05,0.00028,0.00016,0.00142,0.00017,0.00013,0.00262,0.0014,...,0.00013,0.00024,3e-05,0.0,0.0,0.00091,10,4,-3.548641,1.899948
2,3e-05,0.0,0.00011,0.00181,0.0006,0.00274,0.0003,0.0011,0.00191,0.00132,...,0.0001,0.00025,1e-05,0.0,0.0,0.00208,10,4,-3.548641,1.899948
3,0.0,0.0,0.0,2e-05,1e-05,3e-05,0.0,0.0,3e-05,1e-05,...,3e-05,2e-05,0.0,0.0,0.0,0.00137,10,4,-3.548641,1.899948
4,0.0,0.0,0.0,3e-05,0.0,0.0,0.0,2e-05,9e-05,1e-05,...,4e-05,8e-05,3e-05,0.0,0.0,0.00397,10,4,-3.548641,1.899948


### Splitting the dataset - Cross Validation

In [18]:
# KFold - Shuffle=True, I will do this later. For now, I will just use the train_test_split.
kf = KFold(n_splits=5,shuffle=True, random_state=123)

X = encoded_nn_data.iloc[:,:200].values
y = encoded_nn_data[['continent_encoding','city_encoding','lat_scaled','long_scaled']].values 

for train_idx, val_idx in kf.split(X,y[:,1]): # We will use only the city column to create the split. Based on the ordering of the columns in the previous cell.
    X_train = pd.DataFrame(X[train_idx])
    y_train = pd.DataFrame(y[train_idx])

    X_test = pd.DataFrame(X[val_idx])
    y_test = pd.DataFrame(y[val_idx])

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(3256, 200)
(3256, 4)
(814, 200)
(814, 4)


### Neural Network 

In [19]:
import torch
import torch.nn as nn

In [20]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [21]:
# train_test_split
X_train, X_test, y_train, y_test = train_test_split(encoded_nn_data.iloc[:,:200].values,
                                                    encoded_nn_data[['continent_encoding','city_encoding','lat_scaled','long_scaled']].values,
                                                    test_size=0.2,
                                                    random_state=123)

In [22]:
class CustDat(torch.utils.data.Dataset):
    def __init__(self,df,target):
        self.df = df.astype(np.float32)
        self.target = target
    def __len__(self):
        return self.df.shape[0]
    def __getitem__(self,idx):
        dp = self.df[idx]
        targ = self.target[idx]
        dp = torch.from_numpy(dp)
        targ = torch.tensor(targ,dtype=torch.long)[0] # I am getting only the continent here.
        return dp,targ

In [23]:
CustDat(X_train,y_train).__getitem__(200)

(tensor([1.5000e-04, 1.5000e-04, 1.2300e-03, 2.8000e-04, 1.8000e-04, 1.3000e-03,
         1.4000e-04, 1.4000e-04, 1.1100e-03, 2.2000e-04, 8.5000e-04, 8.0000e-05,
         0.0000e+00, 2.2300e-03, 0.0000e+00, 1.0000e-05, 2.0200e-03, 2.1000e-03,
         8.8000e-04, 5.3630e-02, 8.8000e-04, 1.7500e-03, 6.4800e-03, 9.6000e-04,
         6.6000e-04, 8.1000e-04, 4.8000e-04, 2.7000e-04, 1.1000e-03, 4.0000e-05,
         0.0000e+00, 9.3000e-04, 3.0000e-05, 0.0000e+00, 4.0000e-05, 9.0000e-05,
         1.4000e-04, 2.4000e-04, 7.6000e-04, 9.0300e-03, 4.5000e-04, 2.2600e-03,
         9.9800e-03, 1.3200e-03, 1.0720e-02, 3.3245e-01, 1.7700e-03, 9.4000e-04,
         7.3000e-04, 3.7100e-03, 6.9400e-03, 1.4000e-04, 3.8000e-04, 3.0000e-04,
         1.0000e-05, 1.0000e-04, 9.0000e-05, 1.0000e-04, 4.6000e-04, 1.4000e-04,
         1.7900e-03, 2.6000e-03, 0.0000e+00, 1.0300e-03, 9.1000e-04, 0.0000e+00,
         8.5000e-04, 1.5000e-03, 6.6000e-04, 6.5000e-04, 3.1000e-03, 0.0000e+00,
         3.8500e-03, 2.0000e

In [24]:
train_dl = torch.utils.data.DataLoader(CustDat(X_train,y_train),
                                       batch_size=64,shuffle=True,num_workers=4,pin_memory=False)

test_dl = torch.utils.data.DataLoader(CustDat(X_test,y_test),
                                       batch_size=64,shuffle=True,num_workers=4,pin_memory=False)

In [29]:
# Writing the class for the netural network continent, citites and latitude and longitutude

class NeuralNetContinent(nn.Module):
    def __init__(self,input_size_continents,num_continents):
        super(NeuralNetContinent,self).__init__()
        self.layer1 = nn.Linear(input_size_continents,400) # 200 GITs
        self.layer2 = nn.Linear(400,400)
        self.layer3 = nn.Linear(400,200)
        self.layer4 = nn.Linear(200,num_continents) # 7 continents
        self.relu = nn.ReLU()

    def forward(self,x):
        
        out = self.relu(self.layer1(x))
        out = self.relu(self.layer2(out))
        out = self.relu(self.layer3(out))
        out = self.layer4(out)

        return out
    
class NeuralNetCities(nn.Module):
    def __init__(self, input_size_cities, num_cities):
        super(NeuralNetCities,self).__init__()
        self.layer1 = nn.Linear(input_size_cities,400) # 207 GITs
        self.layer2 = nn.Linear(400,400)
        self.layer3 = nn.Linear(400,200)
        self.layer4 = nn.Linear(200,num_cities) # 40 continents
        self.relu = nn.ReLU()

    def forward(self,x):
        
        out = self.relu(self.layer1(x))
        out = self.relu(self.layer2(out))
        out = self.relu(self.layer3(out))
        out = self.layer4(out)

        return out
        
    
# Hyperparameters
input_size_continents = 200
input_size_cities = 207
num_continents = 7
num_cities = 40
learning_rate = 0.001
batch_size = 64
num_epochs = 10

# Initialize thr network
nn_continent_model = NeuralNetContinent(input_size_continents=input_size_continents,num_continents=num_continents).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(nn_continent_model.parameters(),lr=learning_rate)

In [30]:
start_time = time.time()  # Start time
for epoch in range(num_epochs):
    epoch_start_time = time.time() # start time of each epoch
    for batch_idx, (data, target) in enumerate(train_dl):
        data = data.to(device=device)
        target = target.to(device=device)
                
        # scores
        scores = nn_continent_model(data)

        loss = criterion(scores, target)

        # backward
        optimizer.zero_grad()
        loss.backward()

        # gradient descent or adam step
        optimizer.step()
    epoch_end_time = time.time() # end time of each epoch
    epoch_duration = epoch_end_time - epoch_start_time
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}, Epoch Time: {epoch_duration:.2f} seconds")

end_time = time.time()  # End time
total_duration = end_time - start_time
print(f"Total Training Time: {total_duration:.2f} seconds")

Epoch 1/10, Loss: 1.4372, Epoch Time: 1.37 seconds
Epoch 2/10, Loss: 1.0299, Epoch Time: 1.43 seconds
Epoch 3/10, Loss: 1.0685, Epoch Time: 1.39 seconds
Epoch 4/10, Loss: 0.9691, Epoch Time: 1.32 seconds
Epoch 5/10, Loss: 0.9231, Epoch Time: 1.30 seconds
Epoch 6/10, Loss: 0.3848, Epoch Time: 1.30 seconds
Epoch 7/10, Loss: 0.8575, Epoch Time: 1.38 seconds
Epoch 8/10, Loss: 0.4016, Epoch Time: 1.38 seconds
Epoch 9/10, Loss: 0.7561, Epoch Time: 1.24 seconds
Epoch 10/10, Loss: 0.4587, Epoch Time: 1.30 seconds
Total Training Time: 13.41 seconds


In [31]:
# Check accuracy on training and test to see how good model is
def check_accuracy(loader,model):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device=device)
            y = y.to(device=device)

            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)

        print(f'Got {num_correct}/{num_samples} with accuracy {float(num_correct)/float(num_samples)*100}')

    model.train() 


check_accuracy(train_dl,nn_continent_model)
check_accuracy(test_dl,nn_continent_model)

Got 2651/3256 with accuracy 81.41891891891892
Got 639/814 with accuracy 78.50122850122851
