# Random Forest Regression 

no need for the feature scaling

In [1]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

In [3]:
#import the data set
# Import the dataset
math = pd.read_csv("./student-mat.csv", sep=';', header=0)
por = pd.read_csv("./student-por.csv", sep=';', header=0)

In [7]:
#feature selection 

from scipy.stats import pearsonr, pointbiserialr
from sklearn.preprocessing import LabelEncoder

# Specify the target column
target_column = 'G3'  # Target column name

# Separate numerical and categorical columns
numerical_features = math.select_dtypes(include=['number']).columns
categorical_features = math.select_dtypes(include=['object', 'category']).columns

# Dictionary to store correlation results
correlation_results = {}

# Handle numerical columns
for col in numerical_features:
    if col != target_column:
        correlation, p_value = pearsonr(math[col], math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Handle categorical columns
for col in categorical_features:
    if col != target_column:
        # Encode categorical values
        encoded_col = LabelEncoder().fit_transform(math[col])
        correlation, p_value = pointbiserialr(encoded_col, math[target_column])
        correlation_results[col] = {'correlation': abs(correlation), 'p_value': p_value}

# Sort features by correlation
sorted_features = sorted(correlation_results.items(), key=lambda x: x[1]['correlation'], reverse=True)

# Display top features with p-values
print("Feature Correlations and P-values with Target Variable:")
for feature, stats in sorted_features:
    print(f"{feature}: Correlation = {stats['correlation']:.2f}, P-value = {stats['p_value']:.3e}")

Feature Correlations and P-values with Target Variable:
G2: Correlation = 0.90, P-value = 7.626e-148
G1: Correlation = 0.80, P-value = 9.001e-90
failures: Correlation = 0.36, P-value = 1.466e-13
Medu: Correlation = 0.22, P-value = 1.336e-05
higher: Correlation = 0.18, P-value = 2.668e-04
age: Correlation = 0.16, P-value = 1.271e-03
Fedu: Correlation = 0.15, P-value = 2.380e-03
goout: Correlation = 0.13, P-value = 8.229e-03
romantic: Correlation = 0.13, P-value = 9.713e-03
reason: Correlation = 0.12, P-value = 1.527e-02
traveltime: Correlation = 0.12, P-value = 1.987e-02
address: Correlation = 0.11, P-value = 3.563e-02
sex: Correlation = 0.10, P-value = 3.987e-02
Mjob: Correlation = 0.10, P-value = 4.259e-02
paid: Correlation = 0.10, P-value = 4.277e-02
internet: Correlation = 0.10, P-value = 5.048e-02
studytime: Correlation = 0.10, P-value = 5.206e-02
schoolsup: Correlation = 0.08, P-value = 1.004e-01
famsize: Correlation = 0.08, P-value = 1.062e-01
guardian: Correlation = 0.07, P-valu

In [17]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import pandas as pd

# 'G3' is the target column
target_column = 'G3'

# Encode categorical columns (if any)
df_encoded = por.copy()
for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Separate features and target
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fit Random Forest Regressor
regressor = RandomForestRegressor(random_state=0, n_estimators=30)
regressor.fit(X_train, y_train)

# Get feature importances
importances = regressor.feature_importances_

# Map feature names to their importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print("Feature Importance from Random Forest:")
print(feature_importance)

# Predict using the test data
y_pred = regressor.predict(X_test)

# Display predictions vs actual values
np.set_printoptions(precision=2)  # Display only 2 decimals after the column for the numerical values
comparison = np.concatenate((y_pred.reshape(len(y_pred), 1), y_test.values.reshape(len(y_test), 1)), axis=1)  # Fixed the axis argument
print(comparison)

# Evaluating the model performance

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)



Feature Importance from Random Forest:
       Feature  Importance
31          G2    0.835089
29    absences    0.036838
30          G1    0.023642
2          age    0.008134
16      famsup    0.007449
26        Dalc    0.006803
23      famrel    0.005973
8         Mjob    0.005840
27        Walc    0.005707
28      health    0.005601
0       school    0.005455
24    freetime    0.005168
12  traveltime    0.004902
10      reason    0.004838
25       goout    0.004790
7         Fedu    0.004450
6         Medu    0.003533
11    guardian    0.003445
14    failures    0.003154
15   schoolsup    0.002635
9         Fjob    0.002288
18  activities    0.001928
1          sex    0.001863
13   studytime    0.001712
21    internet    0.001666
4      famsize    0.001596
3      address    0.001123
20      higher    0.001067
22    romantic    0.001046
19     nursery    0.000890
17        paid    0.000820
5      Pstatus    0.000553
[[ 6.9   8.  ]
 [14.87 15.  ]
 [15.83 16.  ]
 [10.2  10.  ]
 [ 9.7  10

0.8355617378556673

r2 is about 87% by n_estimators = 30, 84% by n_estimators = 10

# SVR with selected features

In [21]:
# Encode categorical columns (if any)
df_encoded = math.copy()

numerical_features = por.select_dtypes(include=['number']).columns
categorical_features = por.select_dtypes(include=['object', 'category']).columns

for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
X = df_encoded[['G3', 'Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']]
y = df_encoded[target_column]

# Handle categorical features with encoding
#X_encoded = pd.get_dummies(X, drop_first=True)  # One-hot encoding for categorical features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit Random Forest Regressor
regressor = RandomForestRegressor(random_state=0, n_estimators=30)
regressor.fit(X_train, y_train)

# Get feature importances
importances = regressor.feature_importances_

# Map feature names to their importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print("Feature Importance from Random Forest:")
print(feature_importance)


# Predict using the test data
y_pred = regressor.predict(X_test)


# Evaluating the model performance

from sklearn.metrics import r2_score
r2_score(y_test, y_pred)


Feature Importance from Random Forest:
       Feature  Importance
0           G3    0.993123
5     absences    0.005914
7           G2    0.000674
2     failures    0.000094
4         Walc    0.000053
1         Medu    0.000045
9         Mjob    0.000037
12  activities    0.000027
11      famsup    0.000015
6           G1    0.000009
8          sex    0.000009
3         Dalc    0.000000
10   schoolsup    0.000000
13     nursery    0.000000
14      higher    0.000000


0.9997901103366361

K-Fold cross validation

In [23]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier  # Example model
from sklearn.metrics import make_scorer

# Assuming X (features) and y (target) are already defined
regressor = RandomForestClassifier(n_estimators=30, random_state = 0)

# Perform K-fold Cross-Validation (e.g., 5 folds)
cv_scores = cross_val_score(regressor, X, y, cv=30, scoring='accuracy')

# Print the results
print(f"Accuracy for each fold: {cv_scores}")
print(f"Average accuracy: {cv_scores.mean()}")



Accuracy for each fold: [0.71 0.79 0.71 0.79 0.64 0.77 0.69 0.92 0.85 0.69 0.77 0.92 0.92 0.85
 0.77 0.92 0.77 0.77 0.77 0.85 0.77 0.77 0.85 0.85 0.92 0.85 0.92 0.92
 0.85 0.85]
Average accuracy: 0.8137362637362638


# Hyperparameter tuning

In [29]:
# Import the necessary libraries
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score

# Encode categorical columns (if any)
df_encoded = math.copy()

# Apply Label Encoding to categorical features
categorical_features = df_encoded.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    df_encoded[col] = LabelEncoder().fit_transform(df_encoded[col])

# Define the target column and features
target_column = "G3"
X = df_encoded[['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']]
y = df_encoded[target_column]

# Split the data into training and testing sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# K-Fold Cross-Validation with RandomForest Regressor
regressor = RandomForestRegressor(random_state=0, n_estimators=10, n_jobs=1)  # Set n_jobs=1 for debugging

# Perform K-fold Cross Validation (e.g., 3 folds for debugging)
cv_scores = cross_val_score(regressor, X_train, y_train, cv=3, scoring='r2')
print(f"Cross-validation R² Scores: {cv_scores}")
print(f"Average Cross-validation R² Score: {cv_scores.mean():.4f}")

# Hyperparameter Tuning with RandomizedSearchCV
# Define the parameter grid
param_dist = {
    'n_estimators': [10, 100],  # Reduced number of estimators for debugging
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform RandomizedSearchCV with 3-fold cross-validation for debugging
random_search = RandomizedSearchCV(estimator=regressor, param_distributions=param_dist, cv=3, scoring='r2', n_iter=5, random_state=42, n_jobs=1)
random_search.fit(X_train, y_train)

# Print the best parameters and best score from the RandomizedSearchCV
print(f"Best Parameters from RandomizedSearchCV: {random_search.best_params_}")
print(f"Best R² Score from RandomizedSearchCV: {random_search.best_score_:.4f}")

# Train the model using the best parameters found by RandomizedSearchCV
best_regressor = random_search.best_estimator_

# Fit the model on the entire training set with the best parameters
best_regressor.fit(X_train, y_train)

# Predict using the test data
y_pred = best_regressor.predict(X_test)

# Evaluating the model performance
r2 = r2_score(y_test, y_pred)
print(f"R² Score on Test Set: {r2:.4f}")

# Feature Importances
importances = best_regressor.feature_importances_
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': importances
})

# Sort features by importance
feature_importance = feature_importance.sort_values(by='Importance', ascending=False)

# Display top features
print("Feature Importance from Random Forest:")
print(feature_importance)


Cross-validation R² Scores: [0.87 0.83 0.9 ]
Average Cross-validation R² Score: 0.8691
Best Parameters from RandomizedSearchCV: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 10}
Best R² Score from RandomizedSearchCV: 0.8772
R² Score on Test Set: 0.8449
Feature Importance from Random Forest:
       Feature  Importance
6           G2    0.799906
4     absences    0.121834
8         Mjob    0.011487
5           G1    0.011289
3         Walc    0.010859
9    schoolsup    0.010452
1     failures    0.010067
11  activities    0.007213
0         Medu    0.004840
12     nursery    0.003802
2         Dalc    0.003374
7          sex    0.002545
10      famsup    0.001898
13      higher    0.000432


Random forest performance (r2): 
with all features 0.8434608727973889 for math, 0.8253073565669354 for por
with feature selection 0.9997901103366361 for math, 0.9994539641702332 for por
with hyperparameter tuning  0.8449 for math, 0.8120 for por