In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import export_text
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Loading the dataset from CSV file
dataset=pd.read_csv('C:/Users/Lahja/Desktop/combined_dataset_DrashtiSHA.txt')

# removing rows with Nan
dataset=dataset[dataset['Data_Zone'].notna()]
dataset


Unnamed: 0,Data_Zone,Intermediate_Zone,Council_area,Total_population,Working_age_population_revised,Income_rate,Income_count,Employment_rate,Employment_count,CIF,...,PT_GP,PT_Post,PT_retail,crime_count,crime_rate,overcrowded_count,nocentralheat_count,overcrowded_rate,nocentralheat_rate,year
0,S01006506,Culter,Aberdeen City,904.0,605.0,0.07,60.0,0.07,40.0,60,...,8.437832,5.987087,5.711433,8.00642939150402,88.5666968086728,87.0,10.0,0.102113,0.011737,2016
1,S01006507,Culter,Aberdeen City,830.0,491.0,0.07,60.0,0.05,25.0,40,...,8.331833,7.262817,6.794300,4.00321469575201,48.2315023584579,85.0,4.0,0.101675,0.004785,2016
2,S01006508,Culter,Aberdeen City,694.0,519.0,0.05,30.0,0.03,15.0,45,...,7.853631,5.827924,5.251454,4.00321469575201,57.6832088725073,31.0,8.0,0.048212,0.012442,2016
3,S01006509,Culter,Aberdeen City,573.0,354.0,0.05,30.0,0.06,20.0,65,...,7.434491,8.311862,8.444698,*,*,42.0,6.0,0.072414,0.010345,2016
4,S01006510,Culter,Aberdeen City,676.0,414.0,0.10,70.0,0.07,30.0,75,...,5.141013,6.627376,6.619130,12.009644087256,177.657456912071,50.0,7.0,0.086655,0.012132,2016
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8037,S01010987,Newbattle and Dalhousie,Midlothian,653.0,451.0,0.01,5.0,0.02,9.0,55.0,...,11.889787,18.775646,9.483275,3,47,5.0,0.0,0.010000,0.000000,2020
8038,S01008512,Craiglockhart,City of Edinburgh,522.0,320.0,0.00,0.0,0.01,3.0,20.0,...,7.680317,8.135619,9.035738,9,175,6.0,2.0,0.010000,0.000000,2020
8039,S01008537,Comiston and Swanston,City of Edinburgh,659.0,366.0,0.00,0.0,0.02,6.0,20.0,...,10.419287,9.911003,19.339926,3,46,15.0,6.0,0.020000,0.010000,2020
8040,S01008622,Marchmont West,City of Edinburgh,512.0,406.0,0.00,0.0,0.00,2.0,25.0,...,4.719671,6.735116,6.548124,4,79,81.0,15.0,0.160000,0.030000,2020


In [3]:
# Splitting data into X features and y target
X = dataset.drop("Employment_rate", axis=1)  
y = dataset["Noquals"]

X = pd.get_dummies(X, columns=X.select_dtypes(include=['object']).columns)

# Setting my data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
# Creating my Decision Tree Regressor
regressor = DecisionTreeRegressor(
    criterion='squared_error', # my split criterion ('mse' for mean squared error)
    min_samples_split=2,    # Min samples to split a node
    max_features=None,      # features required for best split
    random_state=None,     
    max_depth=None,          # Max_depth of the tree 
    min_samples_leaf=1,     # Min samples required at a leaf node
)


In [5]:
#Encoding
label_encoder = LabelEncoder()


In [6]:
# Fitting my regressor to the training data
regressor.fit(X_train, y_train)


In [7]:
# Setting my predictions on the test
y_pred = regressor.predict(X_test)

In [8]:
# Calculating the mean squared error
mse = mean_squared_error(y_test, y_pred)

# Print the MSE
print(f"Mean Squared Error: {mse:.2f}")

Mean Squared Error: 76.23


In [9]:
# Display the decision tree
tree_rules = export_text(regressor, feature_names=list(X.columns))
print("Decision Tree Rules:")
print(tree_rules)

Decision Tree Rules:
|--- Noquals <= 109.74
|   |--- Noquals <= 62.52
|   |   |--- Noquals <= 38.32
|   |   |   |--- crime_rate_83.6421443647711 <= 0.50
|   |   |   |   |--- Attendance_0.7575757576 <= 0.50
|   |   |   |   |   |--- HESA_0.19047619 <= 0.50
|   |   |   |   |   |   |--- Data_Zone_S01008447 <= 0.50
|   |   |   |   |   |   |   |--- crime_rate_86 <= 0.50
|   |   |   |   |   |   |   |   |--- DEPRESS_0.100244498777506 <= 0.50
|   |   |   |   |   |   |   |   |   |--- DEPRESS_0.0998003992015968 <= 0.50
|   |   |   |   |   |   |   |   |   |   |--- crime_count_13.0576496674058 <= 0.50
|   |   |   |   |   |   |   |   |   |   |   |--- truncated branch of depth 675
|   |   |   |   |   |   |   |   |   |   |--- crime_count_13.0576496674058 >  0.50
|   |   |   |   |   |   |   |   |   |   |   |--- value: [37.29]
|   |   |   |   |   |   |   |   |   |--- DEPRESS_0.0998003992015968 >  0.50
|   |   |   |   |   |   |   |   |   |   |--- value: [26.58]
|   |   |   |   |   |   |   |   |--- DEPRES