In [20]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

BASE_URL = "http://127.0.0.1:8000"

# 1. Fetch Data from API
def fetch_data(endpoint):
    response = requests.get(f"{BASE_URL}/{endpoint}/")
    response.raise_for_status()
    return response.json()

# Fetch data for freelancers and projects
freelancers = fetch_data("freelancers")
projects = fetch_data("projects")

# 2. Load Data into DataFrames
freelancers_df = pd.DataFrame(freelancers)
clients_df = pd.DataFrame(projects)

# Preview the data
print(freelancers_df.head())
print(clients_df.head())

# 3. Data Preprocessing
# Clean data (if necessary) and feature engineering
print()
freelancers_df['id'] = freelancers_df['id'].astype(int)
clients_df['project_id'] = pd.to_numeric(clients_df['project_id'], errors='coerce').fillna(0).astype(int)

# Merging freelancers with client data (projects associated with freelancers)
merged_df = pd.merge(freelancers_df, clients_df, left_on="id", right_on="project_id", how="left")

# Check columns after merge
print(merged_df.columns)

# Handle missing data (fill or drop)
merged_df = merged_df.fillna({'hourly_rate': merged_df['hourly_rate'].mean()})

# Feature engineering: Create new features based on existing ones (if necessary)
# Example: Let's assume we want to predict the freelancer's hourly_rate
merged_df['phone_length'] = merged_df['phone_x'].apply(lambda x: len(x) if isinstance(x, str) else 0)

# Example scatter plot: Hourly rate vs. phone length (as a feature)
plt.figure(figsize=(10, 6))
sns.scatterplot(x=merged_df['phone_length'], y=merged_df['hourly_rate'])
plt.title("Hourly Rate vs. Phone Length")
plt.xlabel("Phone Length")
plt.ylabel("Hourly Rate")
plt.show()

# Define features and target
X = merged_df[['phone_length']]  # Here, we use 'phone_length' as an example feature
y = merged_df['hourly_rate']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# 6. Model Evaluation
# Make predictions
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Plot predictions vs actual values
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], color="red", linewidth=2)
plt.title("Actual vs Predicted Hourly Rate")
plt.xlabel("Actual Hourly Rate")
plt.ylabel("Predicted Hourly Rate")
plt.show()


  first_name last_name                     email                  phone  \
0    Caitlin    Reilly        ryan88@example.net  +1-298-784-5944x83528   
1     Hayden    Miller      ihiggins@example.net   +1-254-330-3900x5335   
2       Joel    Wilson  scottshannon@example.com        +1-281-504-2681   
3      Frank      Gray     lindsay31@example.net           945.554.2495   
4      Vicki   Walters    andrecarey@example.org           483-762-3710   

       skills  hourly_rate   id  
0  JavaScript        93.03  123  
1  Kubernetes        36.70  124  
2       React        24.95  125  
3  Kubernetes        59.57  126  
4  JavaScript        62.89  127  
                                     name      field  \
0            facilitate back-end eyeballs      Trade   
1  productize out-of-the-box web services      Trade   
2         matrix cutting-edge convergence      Trade   
3        engineer innovative applications      Trade   
4           visualize leading-edge models  Transport   

        

KeyError: 'project_id'