In [2]:
# Import the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, pointbiserialr
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import LabelEncoder

ModuleNotFoundError: No module named 'numpy'

In [52]:
# Import the dataset
math = pd.read_csv("./student-mat.csv", sep=';', header=0)
por = pd.read_csv("./student-por.csv", sep=';', header=0)

In [53]:
#correlation of the dependent variables with the independent variables
#########   Correlation with the method from David Boules


# F-statistics for correlation 
#target features is the final grade G3

target = "G3"
# Separate numerical and categorical columns
numerical_features = por.select_dtypes(include=['number']).columns
categorical_features = por.select_dtypes(include=['object', 'category']).columns

X = por.drop(columns = [target])
y = por[target]

# one way ANOVA test for the correlation of means between two or more groups
# assumption the data is normally distributed


from scipy.stats import f_oneway

#correlation between the categorical variables and the target variable of G3(final grading)
from sklearn.feature_selection import f_classif 
for col in categorical_features: 
    groups = [y[X[col] == value] for value in X[col].unique()]
    f_stat, p_value = f_oneway(*groups)
    print(f"Feature: {col}, F_statistic:{f_stat}, P-value: {p_value}")

# filter numerical features
numerical_features= X.select_dtypes(include = [np.number]).columns

#perform F-test

f_scores, p_values = f_classif(X[numerical_features], y)

#display the results

for col, f_score, p_value in zip(numerical_features, f_scores, p_values):
    print(f"Feature: {col}, F_statistic:{f_stat}, P-value: {p_value}")

selected_features_num = [col for col, p_value in zip(numerical_features, p_values) if p_value < 0.05]
selected_features_cat = [col for col, p_value in zip(categorical_features, p_values) if p_value < 0.05]

print("Selected Features:", selected_features_num, selected_features_cat)

Feature: school, F_statistic:56.89067686337133, P-value: 1.5661990923002604e-13
Feature: sex, F_statistic:10.962308407124874, P-value: 0.0009815287061373317
Feature: address, F_statistic:18.707910527412754, P-value: 1.7641534609222437e-05
Feature: famsize, F_statistic:1.3137906447644496, P-value: 0.2521332216658279
Feature: Pstatus, F_statistic:0.0003677569375753126, P-value: 0.984705825951084
Feature: Mjob, F_statistic:7.370224291121831, P-value: 8.30514988494739e-06
Feature: Fjob, F_statistic:3.2726805958419667, P-value: 0.011376280623605892
Feature: reason, F_statistic:10.248465509132915, P-value: 1.3416422874904278e-06
Feature: guardian, F_statistic:2.63816697870449, P-value: 0.07226239503367116
Feature: schoolsup, F_statistic:2.865641360496058, P-value: 0.09097103846579366
Feature: famsup, F_statistic:2.2759054076786045, P-value: 0.1318865120420208
Feature: paid, F_statistic:1.955810225639281, P-value: 0.16244124863569093
Feature: activities, F_statistic:2.321335165021925, P-value

In [54]:
# Encode the categorical variables and scale the numerical variables

from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
from sklearn.metrics import r2_score

# Define the target column and features
target_column = "G3"
X = por.drop(columns=[target_column])
X = X[['Medu', 'failures', 'Dalc', 'Walc', 'absences', 'G1', 'G2', 'sex', 'Mjob', 'schoolsup', 'famsup', 'activities', 'nursery', 'higher']]
y = por[target_column]

# Apply the scaler 
numerical_cols = X.select_dtypes(include=['number']).columns
sc_X = StandardScaler()
X[numerical_cols] = sc_X.fit_transform(X[numerical_cols])

#scaling the y is not necessary
#sc_y = StandardScaler()
#y = sc_y.fit_transform(y)

# Encode categorical columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns
for col in categorical_features:
    X[col] = LabelEncoder().fit_transform(X[col])


In [55]:
#splitting the dataset into the training and the test set

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


In [56]:
# train the model
from xgboost import XGBRegressor
regressor = XGBRegressor()
regressor.fit(X_train, y_train)

In [57]:
import numpy as np
from sklearn.metrics import r2_score

# Assuming `regressor` is already trained and `X_test` and `y_test` are defined

# Make predictions using the trained model
y_pred = regressor.predict(X_test)

# Convert y_test to a NumPy array and ensure both y_pred and y_test are reshaped into 2D arrays
y_pred = y_pred.reshape(-1, 1)  # Reshape to (n_samples, 1)

# Convert y_test to NumPy array and reshape
y_test = y_test.to_numpy().reshape(-1, 1)  # Convert y_test to NumPy array and reshape

# Concatenate and print the predictions vs actual values
np.set_printoptions(precision=2, suppress=True)  # Suppress scientific notation
comparison = np.concatenate((y_pred, y_test), axis=1)  # Concatenate along columns (axis=1)
print("Predictions vs Actual:")
print(comparison)

# Evaluating the model performance on the test set
r2_test_score = r2_score(y_test, y_pred)
print(f"R² score on the test set: {r2_test_score}")

Predictions vs Actual:
[[ 7.69  8.  ]
 [15.12 15.  ]
 [15.84 16.  ]
 [10.74 10.  ]
 [ 9.22 10.  ]
 [12.87 12.  ]
 [12.35 13.  ]
 [17.38 17.  ]
 [11.88 12.  ]
 [11.49 12.  ]
 [10.51 11.  ]
 [10.51 10.  ]
 [13.42 13.  ]
 [ 8.5   8.  ]
 [18.08 18.  ]
 [13.06 12.  ]
 [13.22 13.  ]
 [12.69 13.  ]
 [11.1  10.  ]
 [10.15 10.  ]
 [12.46 12.  ]
 [11.23 10.  ]
 [17.54 17.  ]
 [13.12 15.  ]
 [12.75 14.  ]
 [ 1.72  0.  ]
 [13.15 12.  ]
 [12.77 14.  ]
 [11.65 12.  ]
 [12.85  9.  ]
 [13.43 13.  ]
 [16.29 16.  ]
 [12.48 13.  ]
 [15.99 16.  ]
 [13.62 12.  ]
 [ 9.41 10.  ]
 [ 8.83 10.  ]
 [11.4  11.  ]
 [13.39 13.  ]
 [11.56 10.  ]
 [15.36 15.  ]
 [18.07 18.  ]
 [11.71 11.  ]
 [13.77 13.  ]
 [12.51 13.  ]
 [10.56 10.  ]
 [13.62 14.  ]
 [10.4   9.  ]
 [11.68 11.  ]
 [ 8.82 10.  ]
 [ 7.18  8.  ]
 [15.08 17.  ]
 [10.72  9.  ]
 [13.05 13.  ]
 [ 8.37  8.  ]
 [ 9.93 11.  ]
 [11.19 12.  ]
 [10.4  12.  ]
 [14.17 15.  ]
 [15.41 15.  ]
 [13.9  13.  ]
 [ 6.45  7.  ]
 [11.7  12.  ]
 [10.44 10.  ]
 [13.13 12.  ]
 [

In [58]:
## Applying k-Fold Cross Validation

from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 74.68 %
Standard Deviation: 8.05 %


In [64]:
import joblib
from xgboost import XGBRegressor

# Example: Train a model
model = XGBRegressor()
# Assuming you have some training data X_train and y_train
# model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, '../web-app/cholpon-zhakshylykova/regression_model.pkl')  # Save model using joblib


FileNotFoundError: [Errno 2] No such file or directory: '../web-app/cholpon-zhakshylykova/regression_model.pkl'