In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Read the cleaned survey responses from the Excel workbook in the working directory.
df = pd.read_excel("Survey cleaned.xlsx")
# Trim leading/trailing whitespace from the header labels so the exact-label
# lookups performed later in this cell match the workbook's columns.
stripped_labels = df.columns.str.strip()
df.columns = stripped_labels

# Define the stress level formula function
def calculate_stress_level(row):
    """Return the stress score for one survey response.

    The score is the arithmetic mean of eight terms, one per trait column.
    Four columns contribute their raw value; the other four are
    reverse-keyed and contribute ``1 - value``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the eight column labels listed below, for
        example a ``pandas.Series`` handed over by
        ``DataFrame.apply(..., axis=1)`` or a plain ``dict``.

    Returns
    -------
    number
        Sum of the eight terms divided by 8.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the expected labels (raised by the lookup).
    """
    # NOTE(review): reverse-keying as ``1 - value`` presumes answers are
    # normalised to the 0..1 range -- confirm against the survey's scale.
    # Each entry is (column label, is_reverse_keyed), kept in summation order.
    scored_items = (
        ('Extraverted, enthusiastic.', True),
        ('Anxious, easily upset.', False),
        ('Open to new experiences, complex.', True),
        ('Reserved, quiet.', False),
        ('Sympathetic, warm.', True),
        ('Disorganized, careless.', False),
        ('Calm, emotionally stable', True),
        ('Conventional, uncreative.', False),
    )
    total = sum(
        (1 - row[label]) if is_reverse_keyed else row[label]
        for label, is_reverse_keyed in scored_items
    )
    return total / len(scored_items)

# Derive the regression target: score every response (one row each) with
# calculate_stress_level and store the result as a new 'Stress Level' column.
stress_scores = df.apply(calculate_stress_level, axis="columns")
df['Stress Level'] = stress_scores

# Labels of the categorical survey questions, whitespace-trimmed the same way
# as the DataFrame header so they match its columns.
categorical_columns = [
    label.strip()
    for label in (
        "The name of your institution",
        "The name of your program of study",
        "Your current class level is",
        "Your gender",
        'Living with family?',
        'Are you happy with your academic  Condition?',
        'Are you addicted to any drugs?',
        'Are you in a relationship?',
    )
]

# Replace each categorical question with one indicator column per answer.
data = pd.get_dummies(df, columns=categorical_columns)

# Whatever is still stored with object dtype after encoding is discarded,
# so only non-object columns remain in `data`.
non_numeric_columns = data.select_dtypes(include='object').columns
data = data.drop(non_numeric_columns, axis=1)

# Separate the target column from the feature matrix.
# NOTE(review): X appears to still contain the eight trait columns that
# calculate_stress_level combines into 'Stress Level', so the target is a
# deterministic function of the inputs and the evaluation scores mostly show
# how well the model re-learns that formula -- confirm this is intended,
# otherwise exclude those columns from X.
y = data['Stress Level']
X = data.drop(columns='Stress Level')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to both partitions so no test-set statistics reach training.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Fit an XGBoost regressor with its default hyper-parameters on the scaled
# training data.
model = XGBRegressor()
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report the three metrics, one per line.
print(
    f"Mean Squared Error: {mse}",
    f"Mean Absolute Error: {mae}",
    f"R-squared: {r2}",
    sep="\n",
)

# Feature labels in the order the model was trained on; stored alongside the
# model so the same column layout can be rebuilt when the artefacts are loaded.
X_columns = X.columns

# Persist the fitted model and the fitted scaler.
joblib.dump(value=model, filename='stress_prediction_model_xgboost.pkl')
joblib.dump(value=scaler, filename='scaler_xgboost.pkl')

# Persist the column labels. This stays the cell's final bare expression, so
# the notebook echoes joblib's returned list of written file names.
joblib.dump(value=X_columns, filename='X_columns.pkl')

Mean Squared Error: 0.0875632987979544
Mean Absolute Error: 0.20256586513765482
R-squared: 0.9042653210688949


['X_columns.pkl']

In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 812.7 kB/s eta 0:02:34
   ---------------------------------------- 0.1/124.9 MB 871.5 kB/s eta 0:02:24
   ---------------------------------------- 0.2/124.9 MB 952.6 kB/s eta 0:02:11
   ---------------------------------------- 0.2/124.9 MB 1.2 MB/s eta 0:01:48
   ---------------------------------------- 0.3/124.9 MB 983.0 kB/s eta 0:02:07
   ---------------------------------------- 0.3/124.9 MB 1.0 MB/s eta 0:02:03
   ---------------------------------------- 0.4/124.9 MB 1.1 MB/s eta 0:01:58
   ---------------------------------------- 0.5/124.9 MB 1.2 MB/s eta 0:01:46
   ---------------------------------------- 0.6/124.9 MB 1.2 MB/s eta 0:01:41
