# Predictive Modeling and Visualizations

 By Chuan Yong (Bill) Guo

The purpose of this file is to act as a template to test various machine learning models and document possible options and parameters that can be manipulated for each model in a concise and easy to follow format. Each line of code is attached to a '' symbol in front of it to prevent the code from running as multiple codes within a code cell because some codes may not be allowed to run together. In order to run the line of code, remove the '' symbol on front of the desired code line. The purpose of this setup package multiple concepts within a single code cell for easy access and understanding, without having the cell run into errors trying to run multiple contradictory codes.

In [1]:
#This cell is used to import modules that will be useful for model prediction. 

import pandas as pd #Pandas is the base Python library used to manuipulate datasets, typically this will always be useful. 
import numpy as np #Numpy is mainly used to allow for mathematical manipulations of datasets, typically this will always be useful.


In [2]:
#Importing datasets
df_original = pd.read_csv('Datasets/salarysample.csv', usecols=['Job Title','Salary Estimate','Lower Salary','Location']) #Used to read .csv files. usecols parameter selects for specific columns to be loaded. 
df_original2 = pd.read_excel('Datasets/US_Population_2020_2021.xlsx', sheet_name='2021', usecols=['Geographic Area','Population']) #Used to read .xls files, sheet_name can be used to select for different sheets by using index number or string name.
df = df_original #Good idea to create a new object that is a copy of the original dataset so that the original dataset isn't directly manipulated
df2 = df_original2
#df_original.head() #Initial dataset without any manipulation.
df

Unnamed: 0,Job Title,Salary Estimate,Location,Lower Salary
0,Data Scientist,$53K-$91K (Glassdoor est.),"Albuquerque, NM",53
1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),"Linthicum, MD",63
2,Data Scientist,$80K-$90K (Glassdoor est.),"Clearwater, FL",80
3,Data Scientist,$56K-$97K (Glassdoor est.),"Richland, WA",56
4,Data Scientist,$86K-$143K (Glassdoor est.),"New York, NY",86
...,...,...,...,...
737,"Sr Scientist, Immuno-Oncology - Oncology",$58K-$111K (Glassdoor est.),"Cambridge, MA",58
738,Senior Data Engineer,$72K-$133K (Glassdoor est.),"Nashville, TN",72
739,"Project Scientist - Auton Lab, Robotics Institute",$56K-$91K (Glassdoor est.),"Pittsburgh, PA",56
740,Data Science Manager,$95K-$160K (Glassdoor est.),"Allentown, PA",95


### Basic Column and Row Manipulation

In [3]:
###'''Data Cleaning'''

###'''Declaring and resetting columns as index'''
#df.set_index('Job Title',inplace=True)
#df.reset_index(inplace=True)

###'''Adding a column based on other columns'''
#df['New column'] = (df['Lower Salary'] + df['Upper Salary'])/2 #Mathematical manipulation

#Method format, can edit conditions to suit needs.
    #def output_element(input_element): 
        #if input_element < 50:
            #return 'Low'
        #elif input_element >= 50 and input_element < 80:
            #return 'Medium'
        #else:
            #return 'High'
    #df['New Column'] = df['Lower Salary'].apply(output_element)
    #df

###'''Removing columns'''
#df.drop(columns={'Rating','Location'},inplace=True) #By column name.
#df.drop(df.iloc[:, 1:3],axis=1,inplace=True) #By column index with range of columns (axis=1 indicates columns)

###'''Viewing multiple columns'''
#df[['New column','Job Title']] #By column name
#df.loc[:,'Rating':'Location'] #By column name(another format)
#df.iloc[:,[1,2,5]] #By column index
#df.iloc[:,1:5] #By column index with range of columns

###'''Adding a row'''
#df.loc[len(df)] = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41] #Add new row at the end of dataframe, or any row by specifying index number, number of elements must match column number

###'''Removing rows'''
#df.drop([0,1])

###'''Viewing multiple rows'''
#df.iloc[3:9,1:5]



### Data values/strings Manipulation

In [4]:
###'''Visualizing datatypes and null values each column contains'''
df.info()
df.columns
###'''Imputation (Replace null data with something or removing it completely)'''


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 742 entries, 0 to 741
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Job Title        742 non-null    object
 1   Salary Estimate  742 non-null    object
 2   Location         742 non-null    object
 3   Lower Salary     742 non-null    int64 
dtypes: int64(1), object(3)
memory usage: 23.3+ KB


Index(['Job Title', 'Salary Estimate', 'Location', 'Lower Salary'], dtype='object')

### Merging Multiple Datasets

In [5]:

df['Location'] = df['Location'].apply(lambda comma: comma.split(',')[0])

df2['Geographic Area']=df2['Geographic Area'].apply(lambda symbol : symbol.replace(' city',''))
df2['Geographic Area'] = df2['Geographic Area'].apply(lambda comma: comma.split(',')[0])
df3=pd.merge(df, df2, left_on='Location', right_on='Geographic Area',how='left')
df3.drop(columns={'Job Title','Location','Geographic Area'},inplace=True)
df3['Population'].fillna(df3['Population'].mean(),inplace=True)
df3

Unnamed: 0,Salary Estimate,Lower Salary,Population
0,$53K-$91K (Glassdoor est.),53,5.625990e+05
1,$63K-$112K (Glassdoor est.),63,1.190702e+06
2,$80K-$90K (Glassdoor est.),80,1.166740e+05
3,$56K-$97K (Glassdoor est.),56,6.192900e+04
4,$86K-$143K (Glassdoor est.),86,8.467513e+06
...,...,...,...
804,$58K-$111K (Glassdoor est.),58,1.170900e+05
805,$72K-$133K (Glassdoor est.),72,1.190702e+06
806,$56K-$91K (Glassdoor est.),56,3.004310e+05
807,$95K-$160K (Glassdoor est.),95,1.259440e+05


### Splitting data for testing and training, test models

In [6]:
df = df_original
df = df[['Job Title', 'Lower Salary','Location']]
df = pd.get_dummies(df)
df

Unnamed: 0,Lower Salary,Job Title_Ag Data Scientist,Job Title_Analytics - Business Assurance Data Analyst,Job Title_Analytics Consultant,Job Title_Analytics Manager,Job Title_Analytics Manager - Data Mart,"Job Title_Assistant Director/Director, Office of Data Science",Job Title_Associate Data Analyst,Job Title_Associate Data Analyst- Graduate Development Program,Job Title_Associate Data Engineer,...,Location_Washington,Location_Watertown,Location_West Palm Beach,Location_West Reading,Location_Westlake,Location_Winston-Salem,Location_Winter Park,Location_Woburn,Location_Woodbridge,Location_Worcester
0,53,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,86,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,58,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
738,72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
739,56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
740,95,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
from sklearn.model_selection import train_test_split
y = np.array(df['Lower Salary'])
x = np.array(df.drop(columns={'Lower Salary'}))
xTrain, xTest, yTrain, yTest = train_test_split(x, y)

In [8]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression().fit(xTrain,yTrain)
from sklearn.metrics import mean_squared_error
model1.predict(xTest)

array([-3.17730865e+14,  5.45000000e+01,  1.00093750e+02,  7.62187500e+01,
        8.54375000e+01,  5.15000000e+01,  7.03125000e+01,  8.13125000e+01,
        6.42812500e+01,  8.46875000e+01,  4.83474163e+14,  6.99062500e+01,
        1.31812500e+02,  3.64748961e+14,  1.06531250e+02,  4.22187500e+01,
       -5.01982560e+14,  7.20308591e+14,  3.64062500e+01,  5.94062500e+01,
        2.35341857e+14, -1.25027033e+14,  5.53437500e+01,  2.58750000e+01,
        3.62187500e+01,  5.94062500e+01,  3.18247546e+14,  8.50937500e+01,
        1.29000000e+02, -2.90613122e+14,  4.84062500e+01, -1.25518781e+15,
       -3.23448874e+14, -1.24078631e+15, -4.36737194e+13, -3.16251317e+14,
        8.38437500e+01,  5.90937500e+01,  7.26250000e+01,  5.74375000e+01,
       -2.03356783e+13,  5.86104501e+13,  2.02968750e+02,  4.21875000e+01,
        3.14830941e+14, -2.65340981e+14,  1.23812500e+02,  1.00968750e+02,
        6.16250000e+01,  6.42812500e+01,  1.15093750e+02,  4.51875000e+01,
        1.12125000e+02,  