In [106]:
import pandas as pd
import  numpy as np

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

import  joblib

In [107]:
# Location of the raw salary dataset (CSV).
# NOTE(review): this is an absolute, machine-specific path, so the notebook
# only runs where this exact file exists.
DATA_PATH = "C:/Users/HP/Downloads/archive (30)/ds_salaries.csv"

# Read the CSV into a DataFrame and preview its first rows.
salary_data = pd.read_csv(filepath_or_buffer=DATA_PATH)
salary_data.head()

Unnamed: 0.1,Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,0,2020,MI,FT,Data Scientist,70000,EUR,79833,DE,0,DE,L
1,1,2020,SE,FT,Machine Learning Scientist,260000,USD,260000,JP,0,JP,S
2,2,2020,SE,FT,Big Data Engineer,85000,GBP,109024,GB,50,GB,M
3,3,2020,MI,FT,Product Data Analyst,20000,USD,20000,HN,0,HN,S
4,4,2020,SE,FT,Machine Learning Engineer,150000,USD,150000,US,50,US,L


In [108]:
# Keep only the regression target (salary_in_usd) and the four categorical
# columns that are encoded into model features further down.
# .copy() makes the subset an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
salary_data = salary_data[['experience_level', 'employment_type', 'job_title', 'salary_in_usd', "company_size"]].copy()
salary_data.head()

Unnamed: 0,experience_level,employment_type,job_title,salary_in_usd,company_size
0,MI,FT,Data Scientist,79833,L
1,SE,FT,Machine Learning Scientist,260000,S
2,SE,FT,Big Data Engineer,109024,M
3,MI,FT,Product Data Analyst,20000,S
4,SE,FT,Machine Learning Engineer,150000,L


In [109]:
# Ordinal encoders: each category is mapped to its position in the given list,
# i.e. EN=0, MI=1, SE=2, EX=3 and S=0, M=1, L=2.
# Each encoder gets its own name so both fitted encoders stay available after
# this cell (reusing one name would discard the experience-level encoder).
experience_encoder = OrdinalEncoder(categories=[['EN', 'MI', 'SE', 'EX']])
company_size_encoder = OrdinalEncoder(categories=[['S', 'M', 'L']])

# .assign() returns a new frame with the two encoded columns appended, instead
# of writing columns into a frame that may itself be a slice of another one.
salary_data = salary_data.assign(
    experience_level_encoded=experience_encoder.fit_transform(salary_data[["experience_level"]]),
    company_size_encoded=company_size_encoder.fit_transform(salary_data[["company_size"]]),
)

# One-hot encode employment type and job title as 0/1 integer dummy columns.
# drop_first=True drops the first category of each column, which therefore
# becomes the implicit baseline (all of its dummies are 0).
salary_data = pd.get_dummies(salary_data, columns=["employment_type", "job_title"], drop_first=True, dtype=int)

# The raw categorical columns are now redundant with their encoded versions.
salary_data = salary_data.drop(columns=['experience_level', 'company_size'])

In [110]:
# Preview the first five rows of the fully encoded frame.
salary_data.head(n=5)

Unnamed: 0,salary_in_usd,experience_level_encoded,company_size_encoded,employment_type_FL,employment_type_FT,employment_type_PT,job_title_AI Scientist,job_title_Analytics Engineer,job_title_Applied Data Scientist,job_title_Applied Machine Learning Scientist,...,job_title_Machine Learning Manager,job_title_Machine Learning Scientist,job_title_Marketing Data Analyst,job_title_NLP Engineer,job_title_Principal Data Analyst,job_title_Principal Data Engineer,job_title_Principal Data Scientist,job_title_Product Data Analyst,job_title_Research Scientist,job_title_Staff Data Scientist
0,79833,1.0,2.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,260000,2.0,0.0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,109024,2.0,1.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,20000,1.0,0.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,150000,2.0,2.0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
# Separate the feature matrix from the regression target.
TARGET_COLUMN = "salary_in_usd"
X = salary_data.drop(columns=[TARGET_COLUMN])
y = salary_data[TARGET_COLUMN]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=104,
)

In [112]:
# Sanity check: list any columns that are still object-dtype (i.e. not yet
# numerically encoded). An empty list means every column is numeric.
object_columns = salary_data.select_dtypes(include="object").columns
list(object_columns)

[]

In [113]:

# Fit a linear regression on the training split, then predict the held-out rows.
regr = linear_model.LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_test)

# Evaluation metrics computed on the test split.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# One fitted coefficient per feature column of X.
print("Coefficients: \n", regr.coef_)

# Mean squared error of the predictions.
print(f"Mean squared error: {mse:.2f}")

# Coefficient of determination (plain R2; r2_score does not compute the
# adjusted variant).
print(f"R2: {r2:.2f}")

Coefficients: 
 [ 3.89891601e+04  7.56505500e+03 -1.22322974e+05 -8.50203748e+04
 -9.36493326e+04  6.91298217e+04  8.73283022e+04  1.50961907e+05
  1.26014074e+04  6.45029992e+04  4.66758822e+04  3.13607053e+04
  5.51927873e+04  9.11144622e+04  1.03895454e+04  1.22246852e+05
  6.12702968e+04  5.35009072e+04  1.45519152e-10  7.57553547e+04
  1.34969626e+05  7.85140631e+04  8.52152505e+04  7.20923965e+04
  3.07292322e+04  1.08017048e+05  8.04501655e+04  1.04407827e+05
  1.39407827e+05  9.01257321e+04  4.09190422e+04  1.30382719e+03
  4.28396987e+05  8.36463147e+04  7.60883159e+04 -2.05423329e+04
  5.37069072e+04  1.01410896e+05  7.26375839e+04  3.49048822e+04
  9.97540177e+04  1.32516257e+05  7.81576777e+04  7.15889706e+04
  5.65118272e+04  1.54160637e+05  2.80618272e+04 -7.73070497e-11
  6.85270972e+04  3.23695775e+05  1.37817232e+05 -1.00195776e+03
  1.04562786e+05 -3.30474926e+04]
Mean squared error: 6412074606.94
R2: 0.05


In [128]:
# Persist the fitted regression model with joblib so it can be reloaded later.
# The first argument is the object to serialize: it must be the fitted
# estimator `regr`, not a string label (a string would be pickled as-is and
# loading the file would then return that string instead of a model).
joblib.dump(regr, "myfile.pkl")

['myfile.pkl']

In [133]:
from fastapi import FastAPI
from pydantic import BaseModel
import uvicorn 
import asyncio

In [138]:
# FastAPI application object; the route decorators below register handlers on it.
app = FastAPI()

# Define the request body format for predictions
class PredictionFeatures(BaseModel):
    """Request body accepted by the /predict endpoint.

    Field names use underscores where the corresponding training columns
    contain spaces (e.g. ``job_title_Data_Engineer`` is sent to the model as
    the column ``job_title_Data Engineer``).
    """
    # Ordinal code of the experience level (training used EN=0, MI=1, SE=2, EX=3).
    experience_level_encoded: float
    # Ordinal code of the company size (training used S=0, M=1, L=2).
    company_size_encoded: float
    # Dummy flag for the part-time employment type.
    # NOTE(review): presumably expected to be 0 or 1; nothing here enforces it.
    employment_type_PT: int
    # Dummy flags for four job titles.
    # NOTE(review): presumably 0/1 with at most one set; not validated.
    job_title_Data_Engineer: int
    job_title_Data_Manager: int
    job_title_Data_Scientist: int
    job_title_Machine_Learning_Engineer: int

# Global variable holding the loaded model; populated by download_model()
model = None


def download_model():
    """Load the serialized model from 'myfile.pkl' into the global ``model``.

    Despite its name, nothing is downloaded: the file is read from the current
    working directory with joblib.

    Raises:
        TypeError: if the object stored in the file has no callable ``predict``
            method, i.e. the file does not contain a usable model.
    """
    global model
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only load files from a trusted source.
    loaded = joblib.load('myfile.pkl')
    # Fail fast at startup instead of with an AttributeError on the first
    # request if the file holds something other than a fitted estimator.
    if not callable(getattr(loaded, "predict", None)):
        raise TypeError(
            "'myfile.pkl' contains a %s, not an object with a predict() method"
            % type(loaded).__name__
        )
    model = loaded


# Load the model immediately when the script runs
download_model()


# Root endpoint: answers GET / with a static greeting that points users at
# the /predict route.
@app.get("/")
async def index():
    welcome = "Welcome to the Data Science Income API. Use the /predict feature to predict your income."
    return dict(message=welcome)

# Prediction endpoint
@app.post("/predict")
async def predict(features: PredictionFeatures):
    """Predict a salary in USD from the submitted features.

    Args:
        features: validated request body (see PredictionFeatures).

    Returns:
        A dict with the single key "Salary (USD)" holding the prediction.
    """
    # One-row frame keyed by the training column names; the job-title columns
    # contain spaces, unlike the request field names.
    input_data = pd.DataFrame([{
        "experience_level_encoded": features.experience_level_encoded,
        "company_size_encoded": features.company_size_encoded,
        "employment_type_PT": features.employment_type_PT,
        "job_title_Data Engineer": features.job_title_Data_Engineer,
        "job_title_Data Manager": features.job_title_Data_Manager,
        "job_title_Data Scientist": features.job_title_Data_Scientist,
        "job_title_Machine Learning Engineer": features.job_title_Machine_Learning_Engineer
    }])

    # The model was fitted on every encoded column of the training frame, not
    # just the seven sent here. Align the row to the fitted column set and
    # order; columns the request does not provide are filled with 0.
    # NOTE(review): a 0 in every dummy of a group selects that group's dropped
    # baseline category, so e.g. full-time cannot be expressed by this schema.
    trained_columns = getattr(model, "feature_names_in_", None)
    if trained_columns is not None:
        input_data = input_data.reindex(columns=trained_columns, fill_value=0)

    # Predict using the loaded model (single row -> first element)
    prediction = model.predict(input_data)[0]

    # Return a built-in float so the response does not depend on how numpy
    # scalars are serialized.
    return {
        "Salary (USD)": float(prediction)
    }
import nest_asyncio

# Patch asyncio so uvicorn can start its event loop from inside the notebook,
# where an event loop is already running.
nest_asyncio.apply()

# Address and port the server listens on.
# NOTE(review): "0.0.0.0" binds to all network interfaces, not just localhost.
SERVER_HOST = "0.0.0.0"
SERVER_PORT = 8001

# Run the app (blocks this cell until the server is stopped)
uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT)

INFO:     Started server process [16608]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:8001 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [16608]
