In [13]:
#This cell is used to import modules that will be useful for model prediction. 

import pandas as pd #Pandas is the base Python library used to manuipulate datasets, typically this will always be useful. 
import numpy as np #Numpy is mainly used to allow for mathematical manipulations of datasets, typically this will always be useful.
import seaborn as sns



In [14]:
#Importing datasets
df1 = pd.read_csv('Datasets/salarysample.csv', usecols=['Job Title','Avg Salary(K)','Job Description','Location']) #Used to read .csv files.
df2=pd.read_excel('Datasets/US_Population_2020_2021.xlsx', sheet_name='2021') #Used to read .xls files, sheet_name can be used to select for different sheets by using index number or string name.

df1 #Initial dataset without any manipulation.

Unnamed: 0,Job Title,Job Description,Location,Avg Salary(K)
0,Data Scientist,"Data Scientist\nLocation: Albuquerque, NM\nEdu...","Albuquerque, NM",72.0
1,Healthcare Data Scientist,What You Will Do:\n\nI. General Summary\n\nThe...,"Linthicum, MD",87.5
2,Data Scientist,"KnowBe4, Inc. is a high growth information sec...","Clearwater, FL",85.0
3,Data Scientist,*Organization and Job ID**\nJob ID: 310709\n\n...,"Richland, WA",76.5
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,"New York, NY",114.5
...,...,...,...,...
737,"Sr Scientist, Immuno-Oncology - Oncology",Site Name: USA - Massachusetts - Cambridge\nPo...,"Cambridge, MA",84.5
738,Senior Data Engineer,THE CHALLENGE\nEventbrite has a world-class da...,"Nashville, TN",102.5
739,"Project Scientist - Auton Lab, Robotics Institute",The Auton Lab at Carnegie Mellon University is...,"Pittsburgh, PA",73.5
740,Data Science Manager,Data Science ManagerResponsibilities:\n\nOvers...,"Allentown, PA",127.5


In [15]:
df1['Location'] = df1['Location'].apply(lambda comma: comma.split(',')[0])
df1

Unnamed: 0,Job Title,Job Description,Location,Avg Salary(K)
0,Data Scientist,"Data Scientist\nLocation: Albuquerque, NM\nEdu...",Albuquerque,72.0
1,Healthcare Data Scientist,What You Will Do:\n\nI. General Summary\n\nThe...,Linthicum,87.5
2,Data Scientist,"KnowBe4, Inc. is a high growth information sec...",Clearwater,85.0
3,Data Scientist,*Organization and Job ID**\nJob ID: 310709\n\n...,Richland,76.5
4,Data Scientist,Data Scientist\nAffinity Solutions / Marketing...,New York,114.5
...,...,...,...,...
737,"Sr Scientist, Immuno-Oncology - Oncology",Site Name: USA - Massachusetts - Cambridge\nPo...,Cambridge,84.5
738,Senior Data Engineer,THE CHALLENGE\nEventbrite has a world-class da...,Nashville,102.5
739,"Project Scientist - Auton Lab, Robotics Institute",The Auton Lab at Carnegie Mellon University is...,Pittsburgh,73.5
740,Data Science Manager,Data Science ManagerResponsibilities:\n\nOvers...,Allentown,127.5


In [16]:
df1['Location'] = df1['Location'].apply(lambda comma: comma.split(',')[0])

df2['Geographic Area']=df2['Geographic Area'].apply(lambda symbol : symbol.replace(' city',''))
df2['Geographic Area'] = df2['Geographic Area'].apply(lambda comma: comma.split(',')[0])
df2['Location']=df2['Geographic Area']
df3=pd.merge(df1, df2, left_on='Location', right_on='Geographic Area', how='left')
df3['Salary']=df3['Avg Salary(K)']*1000

def categorized_title(job_title):
    if 'Scientist' in job_title:
        return 'Data Scientist'
    elif 'Engineer' in job_title:
        return 'Data Engineer'
    elif 'Analyst' in job_title:
        return 'Data Analyst'
    else:
        return 'Other'
df3['Categorized Job Title'] = df3['Job Title'].apply(categorized_title)



df3.drop(columns={'Geographic Area','Avg Salary(K)','Job Description','Job Title','Location_x','Location_y'},inplace=True)
df3['Population'] = df3['Population'].fillna(df3['Population'].mean())
df3

Unnamed: 0,Population,Salary,Categorized Job Title
0,5.625990e+05,72000.0,Data Scientist
1,1.190702e+06,87500.0,Data Scientist
2,1.166740e+05,85000.0,Data Scientist
3,6.192900e+04,76500.0,Data Scientist
4,8.467513e+06,114500.0,Data Scientist
...,...,...,...
804,1.170900e+05,84500.0,Data Scientist
805,1.190702e+06,102500.0,Data Engineer
806,3.004310e+05,73500.0,Data Scientist
807,1.259440e+05,127500.0,Other


In [17]:
df3 = pd.get_dummies(df3)
df3

Unnamed: 0,Population,Salary,Categorized Job Title_Data Analyst,Categorized Job Title_Data Engineer,Categorized Job Title_Data Scientist,Categorized Job Title_Other
0,5.625990e+05,72000.0,0,0,1,0
1,1.190702e+06,87500.0,0,0,1,0
2,1.166740e+05,85000.0,0,0,1,0
3,6.192900e+04,76500.0,0,0,1,0
4,8.467513e+06,114500.0,0,0,1,0
...,...,...,...,...,...,...
804,1.170900e+05,84500.0,0,0,1,0
805,1.190702e+06,102500.0,0,1,0,0
806,3.004310e+05,73500.0,0,0,1,0
807,1.259440e+05,127500.0,0,0,0,1


In [18]:

from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
#model = RandomForestClassifier()
model =LinearRegression()
#model = RandomForestRegressor()

y=np.array(df3['Salary']).reshape(-1,1)
x=np.array(df3.drop(columns={"Salary"}))

xtrain, xtest, ytrain, ytest = train_test_split(x, y)
model.fit(xtrain, ytrain.ravel())
pred=model.predict(xtest)
print(mean_squared_error(pred, ytest, squared=False))



33001.41682283793


In [20]:
import warnings
warnings.filterwarnings("ignore")

data= {
          'Population':[df2.loc[df2['Location']=='Boston']['Population'].item()]
          }
user_df = pd.DataFrame(data)


user_df['Data Analyst'] = 0
user_df['Data Engineer'] = 0
user_df['Data Scientist'] = 1
user_df['Other'] = 0

user_data = user_df['Population'].values.reshape(-1, 1)

variable= model.predict(user_df)

user_df['Salary Prediction']=variable
user_df[['Population','Salary Prediction']]

Unnamed: 0,Population,Salary Prediction
0,654776,108907.270607
