The main focus of this script is to create clean datasets for easy use 

In [35]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder


In [36]:
df = pd.read_csv("/home/elsherif/Desktop/Thesis/ViewPython/data/Train Data/rooftop.csv")
df.info()
# List of numeric columns that may have commas
numeric_cols = ["Unemployment_Rate", "Average_Age"]

# Replace commas and convert to float
for col in numeric_cols:
    df[col] = df[col].str.replace(",", ".").astype(float)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14059 entries, 0 to 14058
Data columns (total 14 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   tile                         14059 non-null  object 
 1   total_rooftops               14059 non-null  int64  
 2   rooftops_without_solar       14059 non-null  int64  
 3   square_meters_with_solar_m2  14059 non-null  float64
 4   panel_area_m2                14059 non-null  float64
 5   district_number              14059 non-null  int64  
 6   year                         14059 non-null  int64  
 7   Unemployment_Rate            14059 non-null  object 
 8   Average_Age                  14059 non-null  object 
 9   Elderly_Population           14059 non-null  float64
 10  Young_Population             14059 non-null  float64
 11  Total_Population             14059 non-null  int64  
 12  Number_of_Houses             7796 non-null   float64
 13  employed        

In [37]:
tiles_with_2020 = df[df["year"] == 2020]["tile"].unique()
all_tiles = df["tile"].unique()
tiles_missing_2020 = [t for t in all_tiles if t not in tiles_with_2020]

print("Tiles missing 2020:", tiles_missing_2020)


Tiles missing 2020: ['tile_r54_c10', 'tile_r54_c22']


In [38]:
df_clean = df[~df["tile"].isin(tiles_missing_2020)].copy()
df = df_clean
df = df.drop(columns=["Number_of_Houses"])


In [39]:
le = LabelEncoder()
df['tile_encoded'] = le.fit_transform(df['tile'])
num_tiles = df['tile_encoded'].nunique()


In [40]:
df.head()

Unnamed: 0,tile,total_rooftops,rooftops_without_solar,square_meters_with_solar_m2,panel_area_m2,district_number,year,Unemployment_Rate,Average_Age,Elderly_Population,Young_Population,Total_Population,employed,tile_encoded
0,tile_r0_c0,7,5,529.887,101.361,20,2012,3.8,42.9,10627.0,7361.0,47949,649,0
1,tile_r0_c1,9,9,0.0,0.0,20,2012,3.8,42.9,10627.0,7361.0,47949,649,1
2,tile_r0_c10,90,80,3339.949,201.642,21,2012,3.2,42.2,13608.0,11579.0,69295,662,2
3,tile_r0_c11,97,90,2715.933,343.005,21,2012,3.2,42.2,13608.0,11579.0,69295,662,3
4,tile_r0_c12,42,39,5222.084,75.094,21,2012,3.2,42.2,13608.0,11579.0,69295,662,4


In [41]:
# lag_features = [1]  # lag 1 and 2 years

# for lag in lag_features:
#     df[f'panel_area_lag{lag}'] = df.groupby('tile_encoded')['panel_area_m2'].shift(lag)

df = df.sort_values(["tile_encoded", "year"])

df["panel_area_lag1"] = (
    df.groupby("tile_encoded")["panel_area_m2"].shift(1)
)

# df["years_since_prev"] = (
#     df.groupby("tile_encoded")["year"].diff()
# )


In [42]:
df

Unnamed: 0,tile,total_rooftops,rooftops_without_solar,square_meters_with_solar_m2,panel_area_m2,district_number,year,Unemployment_Rate,Average_Age,Elderly_Population,Young_Population,Total_Population,employed,tile_encoded,panel_area_lag1
9362,tile_r0_c0,7,6,181.557,14.890,20,2003,4.6,42.6,8329.0,6922.0,42823,673,0,
12494,tile_r0_c0,7,7,0.000,0.000,20,2006,5.7,42.6,9505.0,7243.0,44993,656,0,14.890
3130,tile_r0_c0,7,6,158.590,1.170,20,2009,4.6,42.7,10286.0,7240.0,46490,649,0,0.000
0,tile_r0_c0,7,5,529.887,101.361,20,2012,3.8,42.9,10627.0,7361.0,47949,649,0,1.170
4696,tile_r0_c0,6,5,156.775,4.786,20,2015,3.5,42.9,10953.0,7588.0,50257,655,0,101.361
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,tile_r9_c9,125,113,4051.710,189.448,21,2012,3.2,42.2,13608.0,11579.0,69295,662,1563,341.494
9361,tile_r9_c9,127,117,4095.074,300.374,21,2018,2.7,42.4,14366.0,12638.0,74625,664,1563,189.448
3129,tile_r9_c9,132,125,2251.183,173.403,21,2020,3.8,42.3,14631.0,13257.0,77301,665,1563,300.374
7795,tile_r9_c9,124,110,4882.512,301.224,21,2022,3.2,41.9,14852.0,14062.0,80309,666,1563,173.403


In [43]:
df_model = df.dropna(subset=['panel_area_lag1'])


In [44]:
df_model

Unnamed: 0,tile,total_rooftops,rooftops_without_solar,square_meters_with_solar_m2,panel_area_m2,district_number,year,Unemployment_Rate,Average_Age,Elderly_Population,Young_Population,Total_Population,employed,tile_encoded,panel_area_lag1
12494,tile_r0_c0,7,7,0.000,0.000,20,2006,5.7,42.6,9505.0,7243.0,44993,656,0,14.890
3130,tile_r0_c0,7,6,158.590,1.170,20,2009,4.6,42.7,10286.0,7240.0,46490,649,0,0.000
0,tile_r0_c0,7,5,529.887,101.361,20,2012,3.8,42.9,10627.0,7361.0,47949,649,0,1.170
4696,tile_r0_c0,6,5,156.775,4.786,20,2015,3.5,42.9,10953.0,7588.0,50257,655,0,101.361
7796,tile_r0_c0,7,6,157.732,102.425,20,2018,2.6,43.2,10913.0,7429.0,49898,657,0,4.786
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1565,tile_r9_c9,125,113,4051.710,189.448,21,2012,3.2,42.2,13608.0,11579.0,69295,662,1563,341.494
9361,tile_r9_c9,127,117,4095.074,300.374,21,2018,2.7,42.4,14366.0,12638.0,74625,664,1563,189.448
3129,tile_r9_c9,132,125,2251.183,173.403,21,2020,3.8,42.3,14631.0,13257.0,77301,665,1563,300.374
7795,tile_r9_c9,124,110,4882.512,301.224,21,2022,3.2,41.9,14852.0,14062.0,80309,666,1563,173.403


In [45]:
df_model.to_csv("final_model.csv", index=False)


In [46]:
df_model.info

<bound method DataFrame.info of              tile  total_rooftops  rooftops_without_solar  \
12494  tile_r0_c0               7                       7   
3130   tile_r0_c0               7                       6   
0      tile_r0_c0               7                       5   
4696   tile_r0_c0               6                       5   
7796   tile_r0_c0               7                       6   
...           ...             ...                     ...   
1565   tile_r9_c9             125                     113   
9361   tile_r9_c9             127                     117   
3129   tile_r9_c9             132                     125   
7795   tile_r9_c9             124                     110   
12493  tile_r9_c9             124                     112   

       square_meters_with_solar_m2  panel_area_m2  district_number  year  \
12494                        0.000          0.000               20  2006   
3130                       158.590          1.170               20  2009   
0      