In [4]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import gc

# Define a function to load data in chunks
def load_data_in_chunks(file_path, chunksize=10000):
    chunks = []
    for chunk in pd.read_csv(file_path, chunksize=chunksize, encoding='unicode_escape'):
        chunks.append(chunk)
    return pd.concat(chunks, ignore_index=True)

# Load datasets in chunks
tree_inventory = load_data_in_chunks('tree_inventory.csv')
roads = load_data_in_chunks('roads.csv')
census_income = load_data_in_chunks('census_2021_income.csv')
air_quality = load_data_in_chunks('reduced_air_quality.csv')


# Display the shape of sampled datasets
print(f'Tree Inventory Sampled Shape: {tree_inventory.shape}')
print(f'Roads Sampled Shape: {roads.shape}')
print(f'Census Income Sampled Shape: {census_income.shape}')
print(f'Air Quality Sampled Shape: {air_quality.shape}')



# Function to create GeoDataFrame
def create_geodataframe(df, lon_col, lat_col):
    df['geometry'] = df.apply(lambda row: Point(row[lon_col], row[lat_col]), axis=1)
    return gpd.GeoDataFrame(df, geometry='geometry')

# Create GeoDataFrames
gdf_tree_inventory = create_geodataframe(tree_inventory, 'x', 'y')
gdf_air_quality = create_geodataframe(air_quality, 'POD_LONGITUDE', 'POD_LATITUDE')



# Function to perform spatial join in chunks
def spatial_join_in_chunks(left_gdf, right_gdf, batch_size=1000):
    result_list = []
    for i in range(0, len(left_gdf), batch_size):
        batch = left_gdf.iloc[i:i+batch_size]
        merged_batch = gpd.sjoin_nearest(batch, right_gdf, how='left')
        result_list.append(merged_batch)
        # Free up memory
        del batch, merged_batch
        gc.collect()
    return pd.concat(result_list, ignore_index=True)

# Perform spatial join in chunks
tree_air_quality_merged = spatial_join_in_chunks(gdf_tree_inventory, gdf_air_quality)

# Drop geometry columns
tree_air_quality_merged = tree_air_quality_merged.drop(columns=['geometry', 'index_right'])

# Merge with roads based on 'OBJECTID_left'
tree_air_roads_merged = tree_air_quality_merged.merge(roads, left_on='OBJECTID_left', right_on='OBJECTID', how='left')

# Merge with census_income based on a relevant geographical or demographical identifier
# Ensure both DataFrames have a common column for merging
final_merged_df = tree_air_roads_merged.merge(census_income, left_on='GeoID', right_on='GEO_CODE', how='left')

# Display the final merged DataFrame info
print(final_merged_df.info())
print(final_merged_df.head())

# Assuming 'IN_NEED_REGION' is a binary target variable (0 - not in need, 1 - in need)
final_merged_df['IN_NEED_REGION'] = (final_merged_df['POP_2021'] < final_merged_df['POP_2021'].median()).astype(int)  # Example criteria

# Select features and target variable
features = ['POP_2021', 'TOT_INC_STAT_2020_15PLUS']
target = 'IN_NEED_REGION'

# Handle missing values
X = final_merged_df[features].fillna(0)
y = final_merged_df[target].fillna(0)

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Initialize and train the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Assuming 'IN_NEED_REGION' is a binary target variable (0 - not in need, 1 - in need)
final_merged_df['IN_NEED_REGION'] = (final_merged_df['POP_2021'] < final_merged_df['POP_2021'].median()).astype(int)  # Example criteria

# Select features and target variable
features = ['POP_2021', 'TOT_INC_STAT_2020_15PLUS']
target = 'IN_NEED_REGION'

# Handle missing values
X = final_merged_df[features].fillna(0)
y = final_merged_df[target].fillna(0)

# Split the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Initialize and train the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression(random_state=1)
model.fit(X_train, y_train)

Tree Inventory Sampled Shape: (91182, 51)
Roads Sampled Shape: (19791, 52)
Census Income Sampled Shape: (380, 281)
Air Quality Sampled Shape: (368, 32)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6929832 entries, 0 to 6929831
Columns: 416 entries, OBJECTID_left to Shape__Length
dtypes: float64(334), int64(9), object(73)
memory usage: 21.5+ GB
None
   OBJECTID_left    TREEID  Closest Civic Number      STREET LOCATION_left  \
0              1  10007057                  27.0  ACTIVA AVE     BOULEVARD   
1              1  10007057                  27.0  ACTIVA AVE     BOULEVARD   
2              1  10007057                  27.0  ACTIVA AVE     BOULEVARD   
3              1  10007057                  27.0  ACTIVA AVE     BOULEVARD   
4              1  10007057                  27.0  ACTIVA AVE     BOULEVARD   

                     SPECIES_NAME  \
0  Autumn Brilliance Serviceberry   
1  Autumn Brilliance Serviceberry   
2  Autumn Brilliance Serviceberry   
3  Autumn Brilliance Servic

ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0