# Training data: loading a public dataset for model training

In [None]:
import pandas as pd
import numpy as np

# Location of the semicolon-delimited CSV used as the training data.
# NOTE(review): confirm this URL still resolves to a plain CSV file before relying on it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student-mat.csv"
df = pd.read_csv(url, sep=';')  # the file uses ';' rather than ',' as the field separator

# Preview the shape and the first rows only, instead of printing the whole frame.
print(df.shape)
print(df.head())


In [None]:

# NOTE: every line in this cell is commented out, so running it has no effect.
# It sketches how user-supplied data could be collected and combined with `df`.

# # Example for collecting custom data: User input simulation (grades, course preferences)
# # This could be replaced with actual input collection from a web form or API
# def collect_custom_data():
#     data = {
#         "education": "BS in Computer Science",
#         "skills": ["Python", "JavaScript", "TypeScript", "Java"],
#         "interests": ["Machine Learning", "Blockchain", "Dart", "Algorithms"],
#         "timeline": 2  # expected/targeted years of education
#     }
#     return pd.DataFrame(data)


# # Collect custom data
# custom_data = collect_custom_data()


# # Merge custom data with public data (if needed)
# NOTE(review): axis=1 places the custom columns side by side with the existing ones
# (aligned on the row index); appending the custom records as extra rows would need
# axis=0 instead. Confirm which is intended before enabling this.
# df = pd.concat([df, custom_data], axis=1)


# Handling missing values

In [None]:
# Count the missing values in each column (kept for inspection).
missing_values = df.isnull().sum()


# Impute gaps in numeric columns with that column's mean.
# numeric_only=True restricts the mean to numeric columns: on pandas >= 2.0 a plain
# df.mean() raises TypeError as soon as the frame contains string/object columns.
# Non-numeric columns are left untouched, because fillna only fills the columns
# present in the Series of means.
df = df.fillna(df.mean(numeric_only=True))

# Data preprocessing (with StandardScaler and MinMaxScaler)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Columns to rescale.
# NOTE(review): these names must exist in `df`; the raw CSV header may use different
# names (e.g. `studytime`, `G1`..`G3`) - confirm and rename/map the columns upstream.
standardize_cols = ['grade_math', 'grade_portuguese', 'study_time']
normalize_cols = ['grade_math', 'grade_portuguese']

# Fail early with a readable message instead of a bare KeyError from the indexing below.
missing_cols = [col for col in standardize_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Columns to scale are missing from df: {missing_cols}")

# Alternative scaling: min-max normalization (min=0, max=1).
# It is fitted on the raw values and stored in its own frame. Writing it back into
# `df` after standardizing would overwrite the standardized grades and leave
# `min_max_scaler` fitted on already-standardized data.
min_max_scaler = MinMaxScaler()
grades_minmax = pd.DataFrame(
    min_max_scaler.fit_transform(df[normalize_cols]),
    columns=normalize_cols,
    index=df.index,
)

# Scaling applied to `df`: standardization (mean=0, std=1).
scaler = StandardScaler()
df[standardize_cols] = scaler.fit_transform(df[standardize_cols])

# Encoding (LabelEncoder and One Hot encoding)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Two encodings of the categorical 'course_interests' column are produced:
#   1. an integer label code, stored in 'course_interests_encoded';
#   2. one-hot indicator columns, which replace the original column
#      (drop_first=True omits the first category's indicator).
le = LabelEncoder()
df = pd.get_dummies(
    df.assign(course_interests_encoded=le.fit_transform(df['course_interests'])),
    columns=['course_interests'],
    drop_first=True,
)


# Feature Engineering

In [None]:
# Derive three new columns in a single step:
#   GPA               - row-wise mean of the two grade columns
#   total_study_time  - study time doubled (placeholder formula, meant to be refined)
#   is_active_student - 'Yes' where extra_activities equals 1, otherwise 'No'
df = df.assign(
    GPA=df[['grade_math', 'grade_portuguese']].mean(axis=1),
    total_study_time=df['study_time'] * 2,
    is_active_student=np.where(df['extra_activities'].eq(1), 'Yes', 'No'),
)

# Show the first rows of the updated dataframe
print(df.head())


# Continued...

# Train Models

In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler


# Hold out 30% of the rows for evaluation; the fixed seed makes both splits repeatable.
# NOTE(review): X, y_performance, X_career and y_career are not defined in the visible
# cells - confirm they are created before this cell runs.
split_options = {"test_size": 0.3, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y_performance, **split_options)
X_train_career, X_test_career, y_train_career, y_test_career = train_test_split(
    X_career, y_career, **split_options
)

# Scale the features (important for models such as Logistic Regression).
# The scaler learns its statistics from the training rows only and then applies
# them unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

- Performance Prediction (Regression)

In [None]:
# Step 2: Performance Prediction (Regression)

# Model 1: a 100-tree random forest regressor (seeded), trained on the scaled
# training features against the performance target.
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Predicted performance for the held-out rows
y_pred_performance = rf_regressor.predict(X_test_scaled)

# Score the predictions with mean squared error (lower is better)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred_performance)
print(f"Performance Prediction (MSE): {mse}")

- Career Path Prediction (Classification)

In [None]:

# Step 3: Career Path Prediction (Classification)

# Model 2: a 100-tree random forest classifier (seeded) for the career-path labels.
# NOTE(review): the features come from the (X, y_performance) split while the labels
# come from the (X_career, y_career) split; the rows line up only if both splits
# partition the same rows in the same order - confirm.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train_career
)

# Predicted career path for the held-out rows
y_pred_career = rf_classifier.predict(X_test_scaled)

# Score the predictions with accuracy (fraction of correct labels)
accuracy = accuracy_score(y_true=y_test_career, y_pred=y_pred_career)
print(f"Career Path Prediction (Accuracy): {accuracy}")

- Use XGBoost (as an alternative to Random Forest)

In [None]:

# Step 4: Use XGBoost (as an alternative to Random Forest)
# Model 3: XGBoost Classifier for Career Path Prediction

# XGBoost expects class labels to be the consecutive integers 0..n_classes-1 and
# rejects string labels or integer labels with gaps. Encoding the training labels
# first makes the fit work whatever the label dtype is.
career_label_encoder = LabelEncoder()
y_train_career_encoded = career_label_encoder.fit_transform(y_train_career)

xgb_classifier = xgb.XGBClassifier(random_state=42)
xgb_classifier.fit(X_train_scaled, y_train_career_encoded)

# Predict career path using XGBoost, then map the integer codes back to the
# original labels so they are directly comparable with y_test_career.
y_pred_career_xgb = career_label_encoder.inverse_transform(
    xgb_classifier.predict(X_test_scaled)
)

# Evaluate the XGBoost model
accuracy_xgb = accuracy_score(y_test_career, y_pred_career_xgb)
print(f"XGBoost Career Path Prediction (Accuracy): {accuracy_xgb}")

- Logistic Regression Model for Career Path Prediction

In [None]:
# Step 5: Logistic Regression Model for Career Path Prediction
# Model 4: a seeded logistic regression trained on the scaled features.
# NOTE(review): as with the other career models, the features and labels originate
# from two separate train_test_split calls - confirm the rows correspond.
log_reg = LogisticRegression(random_state=42).fit(X_train_scaled, y_train_career)

# Predicted career path for the held-out rows
y_pred_career_logreg = log_reg.predict(X_test_scaled)

# Score the predictions with accuracy
accuracy_logreg = accuracy_score(y_true=y_test_career, y_pred=y_pred_career_logreg)
print(f"Logistic Regression Career Path Prediction (Accuracy): {accuracy_logreg}")