## dataset without duplicate rows

In [115]:
import pandas as pd
import numpy as np

# Step 1: Load the data files
# Replace these file paths with your actual file paths
health_authorities_csv = 'final/health/gp_practices_finalv2.csv'
gm_oa_lookup_excel = 'data/gm_oa_lookup2021.xlsx'

# Load the health authorities data (must contain an 'LSOA_ID' column, which is
# the merge key used below).
health_data = pd.read_csv(health_authorities_csv)
print(f"Health data shape: {health_data.shape}")
print("Health data columns:", list(health_data.columns))

# Load the GM OA lookup data containing all LSOA_IDs
# Note: You may need to adjust the sheet name and column name
lookup_data = pd.read_excel(gm_oa_lookup_excel)
print(f"Lookup data shape: {lookup_data.shape}")
print("Lookup data columns:", list(lookup_data.columns))

# Identify the column in lookup_data containing LSOA IDs.
# Several columns can contain "LSOA" (e.g. a code column and a name column), so
# prefer one whose name ends in "CD" and only then fall back to the first match.
lsoa_candidates = [col for col in lookup_data.columns if 'LSOA' in str(col).upper()]
lsoa_code_candidates = [col for col in lsoa_candidates if str(col).upper().endswith('CD')]
if lsoa_candidates:
    lsoa_col = (lsoa_code_candidates or lsoa_candidates)[0]
    print(f"LSOA column identified: {lsoa_col}")
    # Extract all unique LSOA IDs from the lookup file; blank cells are dropped
    # so that a missing ID never becomes a row of the output.
    all_lsoa_ids = lookup_data[lsoa_col].dropna().unique()
    print(f"Total unique LSOA IDs in lookup: {len(all_lsoa_ids)}")
else:
    lsoa_col = None
    print("No LSOA column found in lookup data. Using alternative approach.")
    # Fall back to the LSOA IDs that already appear in the health data; no
    # additional areas are added in this case.
    all_lsoa_ids = health_data['LSOA_ID'].dropna().unique()
    print(f"Using {len(all_lsoa_ids)} LSOA IDs from health data only.")

# Step 2: Create a new DataFrame with all LSOA_IDs
new_df = pd.DataFrame({'LSOA_ID': all_lsoa_ids})

# Step 3: Merge with the health data, keeping all LSOA_IDs
# Left join: every LSOA_ID from new_df is kept; areas absent from health_data
# get missing values in the health columns.
merged_df = pd.merge(new_df, health_data, on='LSOA_ID', how='left')

# Step 4: Rearrange columns to ensure LSOA_ID is first
cols = ['LSOA_ID'] + [col for col in merged_df.columns if col != 'LSOA_ID']
merged_df = merged_df[cols]

# Step 5: Report missing values, then fill them: 0 for numeric columns and the
# literal string 'NA' for text columns.
# NOTE: pandas.read_csv treats the string 'NA' as missing by default, so the
# text placeholders come back as NaN when the saved file is re-read with pandas.
missing_count = merged_df.isnull().sum().sum()
print(f"Total missing values: {missing_count}")
for col in merged_df.columns:
    column = merged_df[col]
    if pd.api.types.is_numeric_dtype(column):  # for numeric columns
        merged_df[col] = column.fillna(0)
    elif pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        # for string / categorical columns (object or dedicated string dtype)
        merged_df[col] = column.fillna('NA')

# Step 6: Display the result
print("\nFinal dataset shape:", merged_df.shape)
print("First few rows of the rearranged dataset:")
print(merged_df.head())

# Step 7: Save the result to a new CSV file
output_file = 'final/health/gp_practices_finalv3.csv'
merged_df.to_csv(output_file, index=False)
print(f"\nSaved rearranged dataset to {output_file}")

Health data shape: (418, 4)
Health data columns: ['LSOA_ID', 'n_gp_practices', 'MSOA_ID', 'TYPE']
Lookup data shape: (8966, 12)
Lookup data columns: ['OA21CD', 'LSOA21CD', 'LSOA21NM', 'MSOA21CD', 'MSOA21NM', 'LEP22CD1', 'LEP22NM1', 'LEP22CD2', 'LEP22NM2', 'LAD22CD', 'LAD22NM', 'ObjectId']
LSOA column identified: LSOA21CD
Total unique LSOA IDs in lookup: 1702
Total missing values: 3852

Final dataset shape: (1702, 4)
First few rows of the rearranged dataset:
     LSOA_ID  n_gp_practices MSOA_ID TYPE
0  E01004772             0.0      NA   NA
1  E01004799             0.0      NA   NA
2  E01004782             0.0      NA   NA
3  E01004787             0.0      NA   NA
4  E01004798             0.0      NA   NA

Saved rearranged dataset to final/health/gp_practices_finalv3.csv


In [103]:
# Display merged_df as the cell result.
# NOTE(review): the saved output of this cell lists n_metrolink /
# n_rail_stations columns, i.e. it was captured from a different run of the
# merge than the GP-practices one shown above; re-run before relying on it.
merged_df

Unnamed: 0,LSOA_ID,n_metrolink,n_rail_stations,MSOA_ID,TYPE
0,E01004772,,,,
1,E01004799,,,,
2,E01004782,,,,
3,E01004787,,,,
4,E01004798,,,,
...,...,...,...,...,...
1697,E01034134,,,,
1698,E01005185,,,,
1699,E01034000,,,,
1700,E01033992,,,,


In [88]:
# List the numeric columns of merged_df whose value is 0 in every row.
numeric_df = merged_df.select_dtypes(include='number')
is_all_zero = numeric_df.eq(0).all(axis=0)
all_zero_columns = numeric_df.columns[is_all_zero]
print(all_zero_columns.tolist())


[]


## dataset with duplicate rows


In [89]:
import pandas as pd

# Load the Metrolink stop / rail station records
df = pd.read_csv('final/transport/metrolink_stops_and_rail_stations_final.csv')

# Tally how many rows share each LSOA_ID and attach that tally to every row.
# NOTE(review): the tally is stored as 'n_gp_practices' although the input is
# the Metrolink/rail file - looks like a leftover name; confirm the intended one.
lsoa_counts = df['LSOA_ID'].value_counts().to_dict()
df = df.assign(n_gp_practices=df['LSOA_ID'].map(lsoa_counts))

# Put LSOA_ID and the tally first; remaining columns keep their order.
leading = ['LSOA_ID', 'n_gp_practices']
cols = leading + [name for name in df.columns if name not in leading]
df = df.loc[:, cols]

# Write the augmented table next to the input file
df.to_csv('final/transport/metrolink_stops_and_rail_stations_finalv2.csv', index=False)

print("Successfully saved!")

Successfully saved!


In [117]:
# Load the GMAL CSV and print its column / dtype / non-null summary.
df = pd.read_csv('final/transport/gmal_final.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 71036 entries, 0 to 71035
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   postcode                  71036 non-null  object 
 1   busscore                  71036 non-null  float64
 2   railscore                 71036 non-null  float64
 3   metroscore                71036 non-null  float64
 4   locallinkscore            71036 non-null  float64
 5   locallink                 71036 non-null  object 
 6   gmalscore                 71036 non-null  float64
 7   gmallevel                 71036 non-null  int64  
 8   ptaltheme                 71036 non-null  int64  
 9   altrincham                71036 non-null  object 
 10  ashton                    71036 non-null  object 
 11  bolton                    71036 non-null  object 
 12  bury                      71036 non-null  object 
 13  manchesterairport         71036 non-null  object 
 14  manche

In [118]:
# List the column names of the loaded frame (used to pick the columns to drop).
df.columns

Index(['postcode', 'busscore', 'railscore', 'metroscore', 'locallinkscore',
       'locallink', 'gmalscore', 'gmallevel', 'ptaltheme', 'altrincham',
       'ashton', 'bolton', 'bury', 'manchesterairport',
       'manchestermarketstreet', 'manchesteroxfordroad',
       'manchesterspinningfields', 'oldham', 'rochdale', 'stockport',
       'traffordcentre', 'wigan', 'LSOA_ID', 'MSOA_ID', 'TYPE'],
      dtype='object')

In [119]:
# Keep only the score/level columns and the area identifiers: drop the
# postcode, 'locallink' and the place-named columns (altrincham ... wigan).
# errors='ignore' keeps this cell re-runnable: a later cell writes the reduced
# frame back to 'final/transport/gmal_final.csv', so a fresh load of that file
# may already lack these columns and a strict drop would raise KeyError.
columns_to_drop = [
    'postcode',
    'locallink', 'altrincham',
    'ashton', 'bolton', 'bury', 'manchesterairport',
    'manchestermarketstreet', 'manchesteroxfordroad',
    'manchesterspinningfields', 'oldham', 'rochdale', 'stockport',
    'traffordcentre', 'wigan',
]
df = df.drop(columns=columns_to_drop, errors='ignore')
# With the postcode gone some rows are exact copies of each other; keep one of each.
df = df.drop_duplicates()

In [120]:
# Preview the first rows of the reduced, de-duplicated frame.
df.head()

Unnamed: 0,busscore,railscore,metroscore,locallinkscore,gmalscore,gmallevel,ptaltheme,LSOA_ID,MSOA_ID,TYPE
0,0.0,0.0,0.0,0.0,0.0,1,1,E01006366,E02001304,Transport
1,1.753008,0.0,0.0,0.0,1.753008,2,2,E01006366,E02001304,Transport
2,1.898256,0.0,0.0,0.0,1.898256,2,2,E01006366,E02001304,Transport
3,1.961143,0.0,0.0,0.0,1.961143,2,2,E01006366,E02001304,Transport
4,2.07201,0.0,0.0,0.0,2.07201,2,2,E01006366,E02001304,Transport


In [121]:
# Print row count, dtypes and non-null counts after the drop / de-duplication.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 68750 entries, 0 to 71031
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   busscore        68750 non-null  float64
 1   railscore       68750 non-null  float64
 2   metroscore      68750 non-null  float64
 3   locallinkscore  68750 non-null  float64
 4   gmalscore       68750 non-null  float64
 5   gmallevel       68750 non-null  int64  
 6   ptaltheme       68750 non-null  int64  
 7   LSOA_ID         68748 non-null  object 
 8   MSOA_ID         68748 non-null  object 
 9   TYPE            68750 non-null  object 
dtypes: float64(5), int64(2), object(3)
memory usage: 5.8+ MB


In [127]:
# List the columns that remain in the frame.
df.columns

Index(['busscore', 'railscore', 'metroscore', 'locallinkscore', 'gmalscore',
       'gmallevel', 'ptaltheme', 'LSOA_ID', 'MSOA_ID', 'TYPE'],
      dtype='object')

In [122]:
# Write the current frame to 'final/transport/gmal_final.csv' without the index.
# NOTE(review): this is the same path the frame was loaded from, so the original
# file contents are overwritten - keep a copy of the raw file if it is needed.
df.to_csv('final/transport/gmal_final.csv', index=False)
print("Successfully saved!")

Successfully saved!


## metrolink and rail

In [98]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('final/transport/metrolink_stops_and_rail_stations_final.csv')

# Build one row per (LSOA_ID, MSOA_ID, TYPE) with a separate count column per
# network type.
# network_type 'M' is for Metrolink and 'R' is for Rail stations

# Step 1: Count rows per LSOA_ID, MSOA_ID, TYPE and network_type.
group_sizes = df.groupby(['LSOA_ID', 'MSOA_ID', 'TYPE', 'network_type']).size()
# Long-format view of the same counts, kept for inspection.
grouped = group_sizes.reset_index(name='count')

# Step 2: Spread network_type into columns. Unstacking the size() result with
# fill_value=0 keeps the counts as integers and avoids sending them through a
# second (mean) aggregation.
# Step 3: Rename the network codes to the output column names.
pivoted = (
    group_sizes
    .unstack('network_type', fill_value=0)
    .rename(columns={'M': 'n_metrolink', 'R': 'n_rail_stations'})
    .reset_index()
)

# A network type that never occurs in the data still gets a (zero) column.
for count_col in ('n_metrolink', 'n_rail_stations'):
    if count_col not in pivoted.columns:
        pivoted[count_col] = 0

# Step 4: Reorder columns to match the requested format
result = pivoted[['LSOA_ID', 'n_metrolink', 'n_rail_stations', 'MSOA_ID', 'TYPE']]

# Step 5: Save to CSV (default comma delimiter)
result.to_csv('final/transport/metrolink_stops_and_rail_stations_finalv2.csv', index=False)

print("Transformation complete.")
print(f"Number of unique LSOA areas: {len(result)}")

Transformation complete.
Number of unique LSOA areas: 168


## Traffic signal

In [108]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('final/transport/traffic_signal_locations_final.csv')

# Clean up type names: spaces become underscores and parentheses are removed.
# regex=False makes every pattern a literal string; '(' and ')' are regex
# metacharacters, so relying on the implicit default is ambiguous across pandas
# versions and triggers a warning on the ones that default to regex matching.
df['type'] = (
    df['type']
    .str.replace(' ', '_', regex=False)
    .str.replace('(', '', regex=False)
    .str.replace(')', '', regex=False)
)

# Create pivot table to count each type by LSOA.
# aggfunc='count' counts non-null 'description' values, so a row with a missing
# description is not counted - this assumes description is always populated.
pivot_df = df.pivot_table(
    index=['LSOA_ID', 'MSOA_ID', 'TYPE'],
    columns='type',
    values='description',
    aggfunc='count',
    fill_value=0
).reset_index()

# Add an 'n_' prefix to every per-type count column; the identifier columns
# keep their names.
id_columns = ['LSOA_ID', 'MSOA_ID', 'TYPE']
new_columns = [col if col in id_columns else f'n_{col}' for col in pivot_df.columns]

# Apply the new column names
pivot_df.columns = new_columns

# Save to CSV
pivot_df.to_csv('final/transport/traffic_signal_locations_finalv2.csv', index=False)

print("Transformation complete.")
print(f"Number of unique LSOA areas: {len(pivot_df)}")

Transformation complete.
Number of unique LSOA areas: 1012


  df['type'] = df['type'].str.replace(' ', '_').str.replace('(', '').str.replace(')', '')


## GMAL

In [2]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('final/transport/gmal_final.csv')
gmal_output_file = 'final/transport/gmal_finalv2.csv'

# List of columns to calculate median and average
numeric_cols = ['busscore', 'railscore', 'metroscore', 'locallinkscore',
               'gmalscore', 'gmallevel', 'ptaltheme']
id_cols = ['LSOA_ID', 'MSOA_ID', 'TYPE']

# All aggregations share the same grouping; rows with a missing LSOA_ID are
# excluded by groupby.
by_lsoa = df.groupby('LSOA_ID')

# MSOA_ID and TYPE for each LSOA (first non-null value within the group)
lsoa_info = by_lsoa[['MSOA_ID', 'TYPE']].first()

# Per-LSOA medians and averages, prefixed so the two sets can sit side by side
medians = by_lsoa[numeric_cols].median().add_prefix('med_')
averages = by_lsoa[numeric_cols].mean().add_prefix('avg_')

# Combine results (all three frames are indexed by LSOA_ID)
result_df = pd.concat([lsoa_info, medians, averages], axis=1).reset_index()

# Round all aggregated columns to 4 decimal places
value_cols = [col for col in result_df.columns if col not in id_cols]
result_df[value_cols] = result_df[value_cols].round(4)

# Reorder columns: LSOA_ID, then med_/avg_ pairs per score, then MSOA_ID, TYPE
cols = ['LSOA_ID']
for col in numeric_cols:
    cols.extend([f'med_{col}', f'avg_{col}'])
cols.extend(['MSOA_ID', 'TYPE'])

result_df = result_df[cols]

# Save to CSV with 4 decimal places
result_df.to_csv(gmal_output_file, index=False, float_format='%.4f')

print("Done! All numeric values rounded to 4 decimal places.")
# Report the path that was actually written.
print(f"Results saved to {gmal_output_file} with {len(result_df)} rows")

Done! All numeric values rounded to 4 decimal places.
Results saved to gmal_aggregated_by_lsoa.csv with 1702 rows


In [10]:
# Load the fixed performance CSV, print its schema, and preview the first rows
# (only the last expression, df.head(), is rendered as the cell result).
df = pd.read_csv('final/communications/fixed_performance_finalv2.csv')
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1702 entries, 0 to 1701
Data columns (total 33 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   LSOA_ID                                                 1702 non-null   object 
 1   median_download_speed_(mbit/s)                          1702 non-null   float64
 2   average_download_speed_(mbit/s)                         1702 non-null   float64
 3   maximum_download_speed_(mbit/s)                         1702 non-null   float64
 4   average_download_speed_(mbit/s)_for_lines_<_10mbit/s    1702 non-null   float64
 5   average_download_speed_(mbit/s)_for_lines_10<30mbit/s   1702 non-null   float64
 6   average_download_speed_(mbit/s)_for_lines_30<300mbit/s  1702 non-null   float64
 7   average_download_speed_(mbit/s)_for_sfbb_lines          1702 non-null   float64
 8   average_download_speed_(mbit/s)_for_uf

Unnamed: 0,LSOA_ID,median_download_speed_(mbit/s),average_download_speed_(mbit/s),maximum_download_speed_(mbit/s),average_download_speed_(mbit/s)_for_lines_<_10mbit/s,average_download_speed_(mbit/s)_for_lines_10<30mbit/s,average_download_speed_(mbit/s)_for_lines_30<300mbit/s,average_download_speed_(mbit/s)_for_sfbb_lines,average_download_speed_(mbit/s)_for_ufbb_lines,median_upload_speed_(mbit/s),...,number_of_connections_>=30_mbit/s_(number_of_lines),average_data_usage_(gb),median_data_usage_(gb),average_data_usage_(gb)_for_lines_<10mbits,average_data_usage_(gb)_for_lines_10<30mbit/s,average_data_usage_(gb)_for_lines_30<300mbit/s,average_data_usage_(gb)_for_sfbb_lines,average_data_usage_(gb)_for_ufbb_lines,MSOA_ID,TYPE
0,E01004766,78.483333,152.466667,1000.0,2.95,20.616667,99.433333,176.233333,661.466667,10.966667,...,95.833333,441.0,287.345833,137.0,268.333333,463.666667,468.833333,514.5,E02000988,Communications
1,E01004767,98.15,160.183333,846.666667,3.7,20.0,123.016667,194.15,486.416667,14.216667,...,99.833333,1055.666667,311.3575,125.5,291.5,426.0,1591.5,5122.666667,E02000988,Communications
2,E01004768,58.5,128.36,880.0,4.92,20.98,83.56,144.26,643.88,13.46,...,84.2,462.0,309.05,91.2,274.0,491.0,500.2,589.4,E02000984,Communications
3,E01004769,64.9,140.78,1000.0,5.3,21.04,105.04,167.0,546.88,14.1,...,94.8,406.0,270.292,26.0,408.8,429.0,429.4,441.6,E02000986,Communications
4,E01004770,97.3,181.34,1000.0,0.6,20.34,144.24,240.82,550.3,10.06,...,83.2,400.2,269.703,21.0,319.2,398.0,440.4,604.8,E02000986,Communications


## Fixed coverage

In [14]:
import pandas as pd
import numpy as np

# Load the CSV file
# NOTE(review): the result is written to fixed_coverage_finalv2.csv, yet the
# input is the fixed *performance* file - confirm the intended input path.
df = pd.read_csv('final/communications/fixed_performance_finalv2.csv')
coverage_output_file = 'final/communications/fixed_coverage_finalv2.csv'


def _most_common(values):
    """Return the most frequent value of a Series, or NaN if it has no mode."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# Define columns that should not be averaged
non_averaged_columns = ['LSOA_ID', 'output_area', 'MSOA_ID', 'TYPE']
# Of those, the ones that are still carried into the output (via their mode)
carried_columns = ['MSOA_ID', 'TYPE']

# Identify numeric and categorical columns. LSOA_ID is the grouping key, so it
# is never aggregated, whatever its dtype.
numeric_columns = df.select_dtypes(include=['number']).columns.drop('LSOA_ID', errors='ignore')
categorical_columns = [col for col in df.columns if col not in numeric_columns and col != 'LSOA_ID']

# Build the per-column aggregation spec used for the groupby on LSOA_ID
aggregations = {}

# For numeric columns, use mean (average)
for col in numeric_columns:
    aggregations[col] = 'mean'

# For categorical columns, use mode (most common value); non-averaged columns
# other than MSOA_ID / TYPE (i.e. output_area) are left out of the result.
for col in categorical_columns:
    if col not in non_averaged_columns or col in carried_columns:
        aggregations[col] = _most_common

# Apply the aggregation
lsoa_averages = df.groupby('LSOA_ID').agg(aggregations).reset_index()

# Round all numeric columns to 2 decimal places
rounded_columns = [col for col in numeric_columns if col in lsoa_averages.columns]
lsoa_averages[rounded_columns] = lsoa_averages[rounded_columns].round(2)

# Save the result to a CSV file
lsoa_averages.to_csv(coverage_output_file, index=False)

# Report the path that was actually written.
print(f"Aggregation complete. File saved as '{coverage_output_file}'")
print(f"Number of rows in output file: {len(lsoa_averages)}")
print("All numeric values rounded to 2 decimal places")

Aggregation complete. File saved as 'lsoa_averages.csv'
Number of rows in output file: 1702
All numeric values rounded to 2 decimal places
