In [14]:
import pickle

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Cost columns used by the annual-cost formula; a row is kept only when
# every one of them is present and non-zero.
cost_cols = [
    'Tuition_USD',
    'Living_Cost_Index',
    'Rent_USD',
    'Visa_Fee_USD',
    'Insurance_USD',
]

# Read the CSV, drop rows missing any cost column, then drop rows in which
# any cost column equals zero.
df = (
    pd.read_csv('International_Education_Costs.csv')
    .dropna(subset=cost_cols)
    .loc[lambda frame: frame[cost_cols].ne(0).all(axis=1)]
)

# Calculate total annual cost
def estimate_annual_cost(row, baseline_living_usd=12000):
    """Estimate one year of study costs for a single record.

    Parameters
    ----------
    row : mapping-like
        Any object supporting ``row[key]`` (a DataFrame row, a dict, ...)
        that provides 'Tuition_USD', 'Living_Cost_Index', 'Rent_USD',
        'Visa_Fee_USD' and 'Insurance_USD'.
    baseline_living_usd : float, default 12000
        Yearly living expense assigned to a Living_Cost_Index of 100. The
        index scales this amount linearly (index 50 -> half the baseline).

    Returns
    -------
    The sum of tuition, index-scaled living expenses, twelve times
    'Rent_USD', the visa fee and insurance.

    Raises
    ------
    KeyError
        If ``row`` lacks one of the required keys.
    """
    tuition = row['Tuition_USD']
    # Living_Cost_Index is read as a percentage of the baseline.
    living = row['Living_Cost_Index'] / 100 * baseline_living_usd
    # Rent_USD is multiplied by 12, i.e. treated as a monthly amount.
    rent = row['Rent_USD'] * 12
    visa = row['Visa_Fee_USD']
    insurance = row['Insurance_USD']
    return tuition + living + rent + visa + insurance

# Build the regression target by applying the cost formula to every row.
df['Estimated_Annual_Cost'] = df.apply(estimate_annual_cost, axis=1)

# Define features and target
# NOTE(review): Living_Cost_Index is both a model input and a term of the
# target formula, while the other target terms (tuition, rent, visa,
# insurance) are not features. The model therefore approximates the formula
# from partial information -- confirm this is the intended setup.
X = df[['Country', 'Level', 'Program', 'Duration_Years', 'Living_Cost_Index', 'Exchange_Rate']]
y = df['Estimated_Annual_Cost']

# Split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define preprocessing for categorical and numerical features
categorical_features = ['Country', 'Level', 'Program']
numerical_features = ['Duration_Years', 'Living_Cost_Index', 'Exchange_Rate']

# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Unfitted template; every model pipeline below receives its own clone.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),
        ('num', numerical_transformer, numerical_features)
    ])

# Define models to try
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train and evaluate models
results = {}
for name, model in models.items():
    # Create pipeline with preprocessing and model.
    # Pipeline.fit fits its steps in place, so reusing one preprocessor
    # object would make every stored pipeline (and the pickled winner) share
    # the same mutable transformer. clone() gives each pipeline its own copy.
    pipeline = Pipeline(steps=[('preprocessor', clone(preprocessor)),
                              ('model', model)])

    # Train the model
    pipeline.fit(X_train, y_train)

    # Make predictions
    y_pred = pipeline.predict(X_test)

    # Evaluate the model
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results[name] = {'MAE': mae, 'R2': r2, 'pipeline': pipeline}
    print(f"{name} - MAE: ${mae:.2f}, R2: {r2:.4f}")

# Select the best model based on MAE
# NOTE(review): the winner is chosen on the same test split whose MAE/R2 are
# reported below, so those scores are optimistically biased. Select with
# cross-validation on the training split if an unbiased estimate is needed.
best_model_name = min(results, key=lambda x: results[x]['MAE'])
best_model = results[best_model_name]['pipeline']

print(f"\nBest model: {best_model_name}")
print(f"MAE: ${results[best_model_name]['MAE']:.2f}")
print(f"R2: {results[best_model_name]['R2']:.4f}")

# Save the model (the whole fitted pipeline: preprocessing + regressor)
with open('model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

print("\nModel saved as 'model.pkl'")

Linear Regression - MAE: $3944.71, R2: 0.9485
Random Forest - MAE: $3086.98, R2: 0.9633
Gradient Boosting - MAE: $3871.97, R2: 0.9532

Best model: Random Forest
MAE: $3086.98
R2: 0.9633

Model saved as 'model.pkl'


# Task
Load the saved model from 'model.pkl', create sample data for a new student's profile, and predict their estimated annual cost.

## Load Model

### Subtask:
Load the saved 'model.pkl', which contains the best-performing pipeline (preprocessing plus a Random Forest regressor).


**Reasoning**:
To load the saved model, I need to use the `pickle` library to open and deserialize the `model.pkl` file. The previous cell already imported `pickle`, so I will proceed directly to loading the model.



In [15]:
# Restore the pipeline that the training cell pickled to disk.
# Unpickling can execute arbitrary code, so only load files this notebook
# (or another trusted source) produced.
with open('model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

print("Model loaded successfully as 'loaded_model'")

Model loaded successfully as 'loaded_model'


## Create Sample Data for Prediction

### Subtask:
Define a new data point with features similar to the training data (e.g., Country, Level, Program, Duration_Years, Living_Cost_Index, Exchange_Rate) to simulate a new student's profile.


**Reasoning**:
To create a sample data point for prediction, I will define a dictionary with the required features and then convert it into a pandas DataFrame, ensuring it has a single row and correct column names.



In [16]:
# Single-row profile: one list entry per column, using the same six feature
# names the training cell selected for X.
sample_student_data = {
    'Country': ['USA'],
    'Level': ['Master'],
    'Program': ['Computer Science'],
    'Duration_Years': [2.0],
    'Living_Cost_Index': [100.0],
    'Exchange_Rate': [1.0],
}

# Column-oriented dict -> one-row DataFrame.
sample_df = pd.DataFrame.from_dict(sample_student_data)

print("Sample student data created successfully:")
print(sample_df)

Sample student data created successfully:
  Country   Level           Program  Duration_Years  Living_Cost_Index  \
0     USA  Master  Computer Science             2.0              100.0   

   Exchange_Rate  
0            1.0  


**Reasoning**:
Now that the sample student data is prepared in a DataFrame, I will use the `loaded_model` (which is the best-performing model) to predict the estimated annual cost for this student. The `predict` method of the loaded pipeline will handle the necessary preprocessing and model inference.



In [17]:
# The loaded object is the full pipeline, so the raw profile frame is passed
# in directly; predict returns one value per input row.
predicted_cost = loaded_model.predict(sample_df)

print(
    f"Predicted Estimated Annual Cost for the sample student: ${predicted_cost[0]:,.2f}"
)

Predicted Estimated Annual Cost for the sample student: $100,509.09


## Summary:

### Q&A
For the sample student profile (Country: USA, Level: Master, Program: Computer Science, Duration\_Years: 2.0, Living\_Cost\_Index: 100.0, Exchange\_Rate: 1.0), the model predicts an estimated annual cost of \$100,509.09.

### Data Analysis Key Findings
*   The saved pipeline (preprocessing plus a Random Forest regressor, the best performer by test MAE) was successfully loaded from `model.pkl`.
*   A sample student profile was created for prediction, featuring 'USA' as the Country, 'Master' as the Level, 'Computer Science' as the Program, 2.0 years for Duration, a Living\_Cost\_Index of 100.0, and an Exchange\_Rate of 1.0.
*   The loaded model predicted an estimated annual cost of \$100,509.09 for this specific sample student.

### Insights or Next Steps
*   The successfully loaded model can now be integrated into an application or tool to provide estimated annual cost predictions for prospective students based on their profile.
*   Consider developing a user interface for easy input of student profile data and immediate display of the predicted cost, enhancing accessibility for advisors or students.
