# Importing Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score, roc_curve, auc, f1_score
import warnings
warnings.filterwarnings('ignore')

# Load the dataset

In [2]:
df = pd.read_csv('House Prediction Data Set.csv', sep='\s+', header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
df.shape

(506, 14)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       506 non-null    float64
 1   1       506 non-null    float64
 2   2       506 non-null    float64
 3   3       506 non-null    int64  
 4   4       506 non-null    float64
 5   5       506 non-null    float64
 6   6       506 non-null    float64
 7   7       506 non-null    float64
 8   8       506 non-null    int64  
 9   9       506 non-null    float64
 10  10      506 non-null    float64
 11  11      506 non-null    float64
 12  12      506 non-null    float64
 13  13      506 non-null    float64
dtypes: float64(12), int64(2)
memory usage: 55.5 KB


# Assigning proper column names based on the Boston Housing dataset features

In [5]:
column_names = [
    'CRIM',     # per capita crime rate by town
    'ZN',       # proportion of residential land zoned for lots over 25,000 sq.ft.
    'INDUS',    # proportion of non-retail business acres per town
    'CHAS',     # Charles River dummy variable (1 if tract bounds river; 0 otherwise)
    'NOX',      # nitric oxides concentration (parts per 10 million)
    'RM',       # average number of rooms per dwelling
    'AGE',      # proportion of owner-occupied units built prior to 1940
    'DIS',      # weighted distances to five Boston employment centres
    'RAD',      # index of accessibility to radial highways
    'TAX',      # full-value property-tax rate per $10,000
    'PTRATIO',  # pupil-teacher ratio by town
    'B',        # 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
    'LSTAT',    # % lower status of the population
    'MEDV'      # Median value of owner-occupied homes in $1000's (TARGET)
]
df.columns = column_names
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [6]:
df.isnull().sum()

CRIM       0
ZN         0
INDUS      0
CHAS       0
NOX        0
RM         0
AGE        0
DIS        0
RAD        0
TAX        0
PTRATIO    0
B          0
LSTAT      0
MEDV       0
dtype: int64

In [7]:
df.dtypes

CRIM       float64
ZN         float64
INDUS      float64
CHAS         int64
NOX        float64
RM         float64
AGE        float64
DIS        float64
RAD          int64
TAX        float64
PTRATIO    float64
B          float64
LSTAT      float64
MEDV       float64
dtype: object

# Level - 1

## Task 1 - Data Preprocessing

### Encode categorical variables

In [8]:
df['CHAS'].unique()

array([0, 1])

In [9]:
sorted(df['RAD'].unique())

[np.int64(1),
 np.int64(2),
 np.int64(3),
 np.int64(4),
 np.int64(5),
 np.int64(6),
 np.int64(7),
 np.int64(8),
 np.int64(24)]

### Separate features and target

In [10]:
X = df.drop('MEDV', axis=1)
X

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296.0,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.90,9.14
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.90,5.33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273.0,21.0,391.99,9.67
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273.0,21.0,396.90,9.08
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273.0,21.0,396.90,5.64
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273.0,21.0,393.45,6.48


In [11]:
y = df['MEDV']
y

0      24.0
1      21.6
2      34.7
3      33.4
4      36.2
       ... 
501    22.4
502    20.6
503    23.9
504    22.0
505    11.9
Name: MEDV, Length: 506, dtype: float64

### Standardize features

In [12]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

In [13]:
X.describe()    # Features before standardization

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [14]:
X_scaled_df.describe()    # Features after standardization

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,-1.123388e-16,7.898820000000001e-17,2.106352e-16,-3.510587e-17,-1.965929e-16,-1.088282e-16,-1.474446e-16,-8.425408e-17,-1.123388e-16,0.0,-4.212704e-16,-7.442444e-16,-3.089316e-16
std,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099,1.00099
min,-0.4197819,-0.4877224,-1.557842,-0.2725986,-1.465882,-3.880249,-2.335437,-1.267069,-0.9828429,-1.31399,-2.707379,-3.907193,-1.531127
25%,-0.4109696,-0.4877224,-0.8676906,-0.2725986,-0.9130288,-0.5686303,-0.837448,-0.8056878,-0.6379618,-0.767576,-0.4880391,0.2050715,-0.79942
50%,-0.3906665,-0.4877224,-0.2110985,-0.2725986,-0.1442174,-0.1084655,0.3173816,-0.2793234,-0.5230014,-0.464673,0.274859,0.3811865,-0.1812536
75%,0.00739656,0.04877224,1.015999,-0.2725986,0.598679,0.4827678,0.9067981,0.6623709,1.661245,1.530926,0.8065758,0.433651,0.6030188
max,9.933931,3.804234,2.422565,3.668398,2.732346,3.555044,1.117494,3.960518,1.661245,1.798194,1.638828,0.4410519,3.548771


### Split dataset into training/testing sets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [16]:
X_train.shape

(404, 13)

In [17]:
X_test.shape

(102, 13)

## Task 2 - Linear Regression

### Train Linear Regression model

In [18]:
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


### Make predictions

In [19]:
y_train_pred = lr_model.predict(X_train)
y_test_pred = lr_model.predict(X_test)

### Evaluate using R^2 and MSE

In [20]:
train_r2 = r2_score(y_train, y_train_pred)
train_r2

0.7508856358979672

In [21]:
test_r2 = r2_score(y_test, y_test_pred)
test_r2

0.6687594935356318

In [22]:
train_mse = mean_squared_error(y_train, y_train_pred)
train_mse

21.641412753226316

In [23]:
test_mse = mean_squared_error(y_test, y_test_pred)
test_mse

24.291119474973527

### Interpret coefficients

In [24]:
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': lr_model.coef_,
    'Abs_Coefficient': np.abs(lr_model.coef_)
}).sort_values('Abs_Coefficient', ascending=False)

print(feature_importance)
print(f"\nIntercept: {lr_model.intercept_}")

    Feature  Coefficient  Abs_Coefficient
12    LSTAT    -3.628149         3.628149
5        RM     3.115718         3.115718
7       DIS    -3.045771         3.045771
8       RAD     2.282785         2.282785
4       NOX    -1.991430         1.991430
10  PTRATIO    -1.979954         1.979954
9       TAX    -1.792605         1.792605
11        B     1.126499         1.126499
0      CRIM    -0.971494         0.971494
3      CHAS     0.706532         0.706532
1        ZN     0.701556         0.701556
2     INDUS     0.276752         0.276752
6       AGE    -0.177060         0.177060

Intercept: 22.485268239316902


## Task 3 - KNN Classification

### Creating binary classification target, using median as threshold

In [25]:
median_price = y.median()
y_binary = (y > median_price).astype(int)    # 1 for high price, 0 for low price

### Split data for classification

In [26]:
X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(X_scaled, y_binary, test_size=0.2, random_state=42, stratify=y_binary)

### Compare results for different K values

In [27]:
k_values = [1, 3, 5, 7, 9, 11, 15, 20]
knn_results = []

for k in k_values:
    # Training KNN model
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_clf, y_train_clf)
    
    # Making predictions
    y_pred = knn.predict(X_test_clf)
    
    # Calculating metrics
    accuracy = accuracy_score(y_test_clf, y_pred)
    
    # Getting precision, recall, f1 for positive class (high price)
    report = classification_report(y_test_clf, y_pred, output_dict=True)
    precision = report['1']['precision']
    recall = report['1']['recall']
    f1 = report['1']['f1-score']
    
    knn_results.append({
        'K': k,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    })
    
    print(f"{k:<3} {accuracy:<10.4f} {precision:<10.4f} {recall:<10.4f} {f1:<10.4f}")

1   0.8922     0.9149     0.8600     0.8866    
3   0.8824     0.9130     0.8400     0.8750    
5   0.8824     0.8958     0.8600     0.8776    
7   0.8627     0.8750     0.8400     0.8571    
9   0.8725     0.8627     0.8800     0.8713    
11  0.8431     0.8696     0.8000     0.8333    
15  0.8627     0.8913     0.8200     0.8542    
20  0.8627     0.9286     0.7800     0.8478    


### Find best K

In [28]:
best_k_result = max(knn_results, key=lambda x: x['F1-Score'])
best_k_result

{'K': 1,
 'Accuracy': 0.8921568627450981,
 'Precision': 0.9148936170212766,
 'Recall': 0.86,
 'F1-Score': 0.8865979381443299}

In [29]:
best_k = best_k_result['K']
best_k

1

### Train final model with best K

In [30]:
knn_best = KNeighborsClassifier(n_neighbors=best_k)
knn_best.fit(X_train_clf, y_train_clf)

0,1,2
,n_neighbors,1
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [31]:
y_pred_best = knn_best.predict(X_test_clf)

In [32]:
cm = confusion_matrix(y_test_clf, y_pred_best)
cm

array([[48,  4],
       [ 7, 43]])

In [33]:
print(classification_report(y_test_clf, y_pred_best))

              precision    recall  f1-score   support

           0       0.87      0.92      0.90        52
           1       0.91      0.86      0.89        50

    accuracy                           0.89       102
   macro avg       0.89      0.89      0.89       102
weighted avg       0.89      0.89      0.89       102



# Level - 2

## Task 2 - Logistic Regression

### Train Logistic Regression model

In [34]:
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X_train_clf, y_train_clf)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,1000


### Make predictions

In [35]:
y_pred_log = log_reg.predict(X_test_clf)
y_pred_proba_log = log_reg.predict_proba(X_test_clf)[:, 1]

### Evaluate using accuracy, precision, recall

In [36]:
accuracy_log = accuracy_score(y_test_clf, y_pred_log)
accuracy_log

0.9215686274509803

In [37]:
report_log = classification_report(y_test_clf, y_pred_log, output_dict=True)
report_log

{'0': {'precision': 0.9230769230769231,
  'recall': 0.9230769230769231,
  'f1-score': 0.9230769230769231,
  'support': 52.0},
 '1': {'precision': 0.92, 'recall': 0.92, 'f1-score': 0.92, 'support': 50.0},
 'accuracy': 0.9215686274509803,
 'macro avg': {'precision': 0.9215384615384616,
  'recall': 0.9215384615384616,
  'f1-score': 0.9215384615384616,
  'support': 102.0},
 'weighted avg': {'precision': 0.9215686274509803,
  'recall': 0.9215686274509803,
  'f1-score': 0.9215686274509803,
  'support': 102.0}}

### Interpret coefficients and odds ratios

In [38]:
coef_df = pd.DataFrame({
    'Feature': X.columns,
    'Coefficient': log_reg.coef_[0],
    'Odds_Ratio': np.exp(log_reg.coef_[0]),
    'Abs_Coefficient': np.abs(log_reg.coef_[0])
}).sort_values('Abs_Coefficient', ascending=False)

coef_df

Unnamed: 0,Feature,Coefficient,Odds_Ratio,Abs_Coefficient
12,LSTAT,-1.911306,0.147887,1.911306
7,DIS,-1.305965,0.270911,1.305965
8,RAD,1.24988,3.489926,1.24988
9,TAX,-0.932257,0.393664,0.932257
10,PTRATIO,-0.888759,0.411166,0.888759
5,RM,0.881534,2.414601,0.881534
4,NOX,-0.813087,0.443487,0.813087
6,AGE,-0.578278,0.560864,0.578278
3,CHAS,0.485032,1.624227,0.485032
11,B,0.320908,1.378379,0.320908


In [39]:
cm_log = confusion_matrix(y_test_clf, y_pred_log)
cm_log

array([[48,  4],
       [ 4, 46]])

## Task 2 - Decision Tree

In [40]:
dt_model = DecisionTreeClassifier(random_state=42, max_depth=None)
dt_model.fit(X_train_clf, y_train_clf)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


### Make predictions

In [41]:
y_pred_dt = dt_model.predict(X_test_clf)

### Evaluate using accuracy and F1-score

In [42]:
accuracy_dt = accuracy_score(y_test_clf, y_pred_dt)
accuracy_dt

0.8725490196078431

In [43]:
f1_dt = f1_score(y_test_clf, y_pred_dt)
f1_dt

0.8686868686868687

In [44]:
best_dt_results = []
for depth in range(1, 11):
    dt_pruned = DecisionTreeClassifier(random_state=42, max_depth=depth)
    dt_pruned.fit(X_train_clf, y_train_clf)
    y_pred_pruned = dt_pruned.predict(X_test_clf)
    
    acc = accuracy_score(y_test_clf, y_pred_pruned)
    f1 = f1_score(y_test_clf, y_pred_pruned)
    leaves = dt_pruned.get_n_leaves()
    
    best_dt_results.append({'depth': depth, 'accuracy': acc, 'f1': f1, 'leaves': leaves})
    print(f"{depth:<6} {acc:<10.4f} {f1:<10.4f} {leaves:<8}")

1      0.8431     0.8431     2       
2      0.8137     0.7912     4       
3      0.8137     0.8041     8       
4      0.8039     0.8000     14      
5      0.8235     0.8200     23      
6      0.8039     0.8039     31      
7      0.8725     0.8687     37      
8      0.8725     0.8687     38      
9      0.8725     0.8687     39      
10     0.8725     0.8687     40      


### Find best depth based on F1-score

In [45]:
best_depth_result = max(best_dt_results, key=lambda x: x['f1'])
best_depth_result

{'depth': 7,
 'accuracy': 0.8725490196078431,
 'f1': 0.8686868686868687,
 'leaves': np.int64(37)}

In [46]:
best_depth = best_depth_result['depth']
best_depth

7

### Train final pruned model

In [47]:
dt_final = DecisionTreeClassifier(random_state=42, max_depth=best_depth)
dt_final.fit(X_train_clf, y_train_clf)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,7
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [48]:
y_pred_dt_final = dt_final.predict(X_test_clf)

### Final evaluation

In [49]:
accuracy_dt_final = accuracy_score(y_test_clf, y_pred_dt_final)
accuracy_dt_final

0.8725490196078431

In [50]:
f1_dt_final = f1_score(y_test_clf, y_pred_dt_final)
f1_dt_final

0.8686868686868687

### Confusion matrix

In [51]:
cm_dt = confusion_matrix(y_test_clf, y_pred_dt_final)
cm_dt

array([[46,  6],
       [ 7, 43]])

## Task 3 - K Means Clustering

### Use elbow method to find optimal k

In [52]:
k_range = range(1, 11)
inertias = []

for k in k_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertias.append(kmeans.inertia_)

for k, inertia in zip(k_range, inertias):
    print(f"{k:<3} {inertia:<12.2f}")

1   6578.00     
2   4196.16     
3   3514.39     
4   3122.44     
5   2729.99     
6   2441.81     
7   2201.39     
8   2078.37     
9   1850.08     
10  1755.60     


### Calculate elbow point (simple method: largest drop in inertia)

In [53]:
inertia_drops = []
for i in range(1, len(inertias)):
    drop = inertias[i-1] - inertias[i]
    inertia_drops.append(drop)

optimal_k = inertia_drops.index(max(inertia_drops)) + 2    # +2 because we start from k=2
optimal_k

2

### Apply K-Means with optimal k

In [54]:
kmeans_final = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
cluster_labels = kmeans_final.fit_predict(X_scaled)

### Analyze cluster characteristics

In [55]:
cluster_df = pd.DataFrame(X_scaled, columns=X.columns)
cluster_df['Cluster'] = cluster_labels
cluster_df['MEDV'] = y.values

In [56]:
cluster_counts = pd.Series(cluster_labels).value_counts().sort_index()
for cluster_id, count in cluster_counts.items():
    print(f"Cluster {cluster_id}: {count} samples ({count/len(cluster_labels)*100:.1f}%)")

Cluster 0: 329 samples (65.0%)
Cluster 1: 177 samples (35.0%)


### Analyze cluster characteristics by mean values

In [57]:
cluster_means = cluster_df.groupby('Cluster').mean()
cluster_means

Unnamed: 0_level_0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,-0.390124,0.262392,-0.620368,0.002912,-0.584675,0.243315,-0.435108,0.457222,-0.583801,-0.63146,-0.285808,0.326451,-0.446421,25.749848
1,0.725146,-0.487722,1.153113,-0.005412,1.086769,-0.452263,0.80876,-0.849865,1.085145,1.173731,0.531248,-0.606793,0.829787,16.553107


### Show relationship between clusters and target variable

In [58]:
cluster_target_stats = cluster_df.groupby('Cluster')['MEDV'].agg(['mean', 'std', 'min', 'max'])
cluster_target_stats

Unnamed: 0_level_0,mean,std,min,max
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,25.749848,8.335178,11.9,50.0
1,16.553107,7.610751,5.0,50.0


### Show cluster centers in original feature space

In [59]:
centers_df = pd.DataFrame(kmeans_final.cluster_centers_, columns=X.columns)
centers_df.index = [f'Cluster {i}' for i in range(optimal_k)]
centers_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
Cluster 0,-0.390124,0.262392,-0.620368,0.002912,-0.584675,0.243315,-0.435108,0.457222,-0.583801,-0.63146,-0.285808,0.326451,-0.446421
Cluster 1,0.725146,-0.487722,1.153113,-0.005412,1.086769,-0.452263,0.80876,-0.849865,1.085145,1.173731,0.531248,-0.606793,0.829787


# Level - 3

## Task 1 - Random Forest Classifier

### Define parameter grid for hyperparameter tuning

In [60]:
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

print(f"Testing {np.prod([len(v) for v in param_grid_rf.values()])} parameter combinations")

Testing 108 parameter combinations


### Create Random Forest classifier

In [61]:
rf_model = RandomForestClassifier(random_state=42)

In [62]:
grid_search_rf = GridSearchCV(
    rf_model, 
    param_grid_rf, 
    cv=5, 
    scoring='f1',
    n_jobs=-1,
    verbose=0
)

grid_search_rf.fit(X_train_clf, y_train_clf)

0,1,2
,estimator,RandomForestC...ndom_state=42)
,param_grid,"{'max_depth': [3, 5, ...], 'min_samples_leaf': [1, 2, ...], 'min_samples_split': [2, 5, ...], 'n_estimators': [50, 100, ...]}"
,scoring,'f1'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,50
,criterion,'gini'
,max_depth,7
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


### Best parameters

In [63]:
for param, value in grid_search_rf.best_params_.items():
    print(f"- {param}: {value}")

- max_depth: 7
- min_samples_leaf: 1
- min_samples_split: 5
- n_estimators: 50


In [64]:
grid_search_rf.best_score_

np.float64(0.8716698787896562)

### Train final model with best parameters

In [65]:
rf_best = grid_search_rf.best_estimator_
y_pred_rf = rf_best.predict(X_test_clf)

### Evaluate the model

In [66]:
accuracy_rf = accuracy_score(y_test_clf, y_pred_rf)
accuracy_rf

0.8823529411764706

In [67]:
f1_rf = f1_score(y_test_clf, y_pred_rf)
f1_rf

0.88

In [68]:
report_rf = classification_report(y_test_clf, y_pred_rf, output_dict=True)
report_rf

{'0': {'precision': 0.8846153846153846,
  'recall': 0.8846153846153846,
  'f1-score': 0.8846153846153846,
  'support': 52.0},
 '1': {'precision': 0.88, 'recall': 0.88, 'f1-score': 0.88, 'support': 50.0},
 'accuracy': 0.8823529411764706,
 'macro avg': {'precision': 0.8823076923076922,
  'recall': 0.8823076923076922,
  'f1-score': 0.8823076923076922,
  'support': 102.0},
 'weighted avg': {'precision': 0.8823529411764706,
  'recall': 0.8823529411764706,
  'f1-score': 0.8823529411764706,
  'support': 102.0}}

### Cross-validation scores for final model

In [69]:
cv_scores = cross_val_score(rf_best, X_train_clf, y_train_clf, cv=5, scoring='f1')
cv_scores

array([0.85365854, 0.85714286, 0.84931507, 0.89156627, 0.90666667])

### Feature importance

In [70]:
feature_importance_rf = pd.DataFrame({
    'Feature': X.columns,
    'Importance': rf_best.feature_importances_
}).sort_values('Importance', ascending=False)

print(feature_importance_rf)

    Feature  Importance
12    LSTAT    0.256356
5        RM    0.204407
4       NOX    0.109023
10  PTRATIO    0.091092
6       AGE    0.072001
9       TAX    0.058913
2     INDUS    0.054033
0      CRIM    0.047198
7       DIS    0.046275
11        B    0.030564
8       RAD    0.014431
3      CHAS    0.008236
1        ZN    0.007471


### Confusion matrix

In [71]:
cm_rf = confusion_matrix(y_test_clf, y_pred_rf)
cm_rf

array([[46,  6],
       [ 6, 44]])

## Task 2 - SVM for Classification

### Trying different kernels (linear, RBF)

In [72]:
kernels = ['linear', 'rbf']
svm_results = {}

for kernel in kernels:
    # Train SVM model
    if kernel == 'linear':
        svm_model = SVC(kernel=kernel, random_state=42, probability=True)
    else:  # rbf
        svm_model = SVC(kernel=kernel, random_state=42, probability=True, gamma='scale')
    
    svm_model.fit(X_train_clf, y_train_clf)
    
    # Make predictions
    y_pred_svm = svm_model.predict(X_test_clf)
    y_pred_proba_svm = svm_model.predict_proba(X_test_clf)[:, 1]
    
    # Calculate metrics
    accuracy_svm = accuracy_score(y_test_clf, y_pred_svm)
    report_svm = classification_report(y_test_clf, y_pred_svm, output_dict=True)
    
    # ROC AUC
    fpr_svm, tpr_svm, _ = roc_curve(y_test_clf, y_pred_proba_svm)
    auc_svm = auc(fpr_svm, tpr_svm)
    
    # Store results
    svm_results[kernel] = {
        'model': svm_model,
        'accuracy': accuracy_svm,
        'precision': report_svm['1']['precision'],
        'recall': report_svm['1']['recall'],
        'f1': report_svm['1']['f1-score'],
        'auc': auc_svm,
        'predictions': y_pred_svm,
        'probabilities': y_pred_proba_svm
    }
    
    print(f"{kernel:<10} {accuracy_svm:<10.4f} {report_svm['1']['precision']:<10.4f} "
          f"{report_svm['1']['recall']:<10.4f} {report_svm['1']['f1-score']:<10.4f} {auc_svm:<10.4f}")

linear     0.9216     0.9200     0.9200     0.9200     0.9608    
rbf        0.8922     0.9149     0.8600     0.8866     0.9662    


### Find best kernel based on F1-score

In [73]:
best_kernel = max(svm_results.keys(), key=lambda k: svm_results[k]['f1'])
best_kernel

'linear'

In [74]:
best_svm = svm_results[best_kernel]
best_svm['f1']

0.92

### Confusion matrix for best SVM

In [75]:
cm_svm = confusion_matrix(y_test_clf, best_svm['predictions'])
cm_svm

array([[48,  4],
       [ 4, 46]])

### For linear SVM, show feature coefficients

In [76]:
if best_kernel == 'linear':
    svm_coefficients = pd.DataFrame({
        'Feature': X.columns,
        'Coefficient': svm_results['linear']['model'].coef_[0],
        'Abs_Coefficient': np.abs(svm_results['linear']['model'].coef_[0])
    }).sort_values('Abs_Coefficient', ascending=False)
    
    print(f"\nLinear SVM Feature Coefficients:")
    print(svm_coefficients)


Linear SVM Feature Coefficients:
    Feature  Coefficient  Abs_Coefficient
12    LSTAT    -1.555815         1.555815
8       RAD     1.063890         1.063890
7       DIS    -0.999438         0.999438
5        RM     0.830568         0.830568
9       TAX    -0.830253         0.830253
4       NOX    -0.788268         0.788268
10  PTRATIO    -0.645193         0.645193
3      CHAS     0.400174         0.400174
1        ZN     0.289645         0.289645
6       AGE    -0.190102         0.190102
11        B     0.142542         0.142542
0      CRIM    -0.096138         0.096138
2     INDUS     0.092677         0.092677


### Hyperparameter tuning for RBF kernel

In [77]:
param_grid_svm = {
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto', 0.1, 1]
}

svm_rbf = SVC(kernel='rbf', random_state=42, probability=True)
grid_search_svm = GridSearchCV(svm_rbf, param_grid_svm, cv=3, scoring='f1')
grid_search_svm.fit(X_train_clf, y_train_clf)

print(f"Best RBF SVM parameters: {grid_search_svm.best_params_}")
print(f"Best RBF SVM CV F1-score: {grid_search_svm.best_score_}")

Best RBF SVM parameters: {'C': 10, 'gamma': 0.1}
Best RBF SVM CV F1-score: 0.8962090376657929


### Evaluate tuned RBF SVM

In [78]:
svm_tuned = grid_search_svm.best_estimator_
y_pred_svm_tuned = svm_tuned.predict(X_test_clf)
y_pred_proba_svm_tuned = svm_tuned.predict_proba(X_test_clf)[:, 1]

In [79]:
accuracy_svm_tuned = accuracy_score(y_test_clf, y_pred_svm_tuned)
accuracy_svm_tuned

0.8823529411764706

In [80]:
f1_svm_tuned = f1_score(y_test_clf, y_pred_svm_tuned)
f1_svm_tuned

0.875

In [81]:
fpr_tuned, tpr_tuned, _ = roc_curve(y_test_clf, y_pred_proba_svm_tuned)
auc_svm_tuned = auc(fpr_tuned, tpr_tuned)
auc_svm_tuned

0.9734615384615384

## Task 3 - Neural Network with Tensorflow/Keras

### Import TensorFlow and Keras

In [82]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

### Build a simple feedforward neural network

In [83]:
tf.random.set_seed(42)

model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train_clf.shape[1],)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(16, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

### Compile the model

In [84]:
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', 'precision', 'recall']
)

In [85]:
model.summary()

### Train the model

In [86]:
history = model.fit(
    X_train_clf, y_train_clf,
    validation_split=0.2,
    epochs=50,
    batch_size=32,
    callbacks=[
        keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
        keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)
    ]
)

Epoch 1/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 120ms/step - accuracy: 0.5542 - loss: 0.6870 - precision: 0.5471 - recall: 0.5813 - val_accuracy: 0.8395 - val_loss: 0.6010 - val_precision: 0.8649 - val_recall: 0.8000 - learning_rate: 0.0010
Epoch 2/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.6656 - loss: 0.6100 - precision: 0.6444 - recall: 0.7250 - val_accuracy: 0.8889 - val_loss: 0.5278 - val_precision: 0.8974 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 3/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - accuracy: 0.7461 - loss: 0.5691 - precision: 0.7097 - recall: 0.8250 - val_accuracy: 0.8889 - val_loss: 0.4571 - val_precision: 0.8974 - val_recall: 0.8750 - learning_rate: 0.0010
Epoch 4/50
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.7523 - loss: 0.5333 - precision: 0.7174 - recall: 0.8250 - val_accuracy: 0.9012 - val_loss: 0.3974 

### Evaluate the model

In [87]:
train_loss, train_acc, train_prec, train_rec = model.evaluate(X_train_clf, y_train_clf)

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9109 - loss: 0.2060 - precision: 0.9505 - recall: 0.8650


In [88]:
train_loss

0.2060384303331375

In [89]:
train_acc

0.9108911156654358

In [90]:
train_prec

0.9505494236946106

In [91]:
train_rec

0.8650000095367432

In [92]:
test_loss, test_acc, test_prec, test_rec = model.evaluate(X_test_clf, y_test_clf)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 0.9020 - loss: 0.2521 - precision: 0.9167 - recall: 0.8800


In [93]:
test_loss

0.2521234154701233

In [94]:
test_acc

0.9019607901573181

In [95]:
test_prec

0.9166666865348816

In [96]:
test_rec

0.8799999952316284

### Make predictions

In [97]:
y_pred_nn_prob = model.predict(X_test_clf, verbose=0)
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int).flatten()

### Calculate F1-score and AUC

In [98]:
cm_nn = confusion_matrix(y_test_clf, y_pred_nn)
cm_nn

array([[48,  4],
       [ 6, 44]])