# Predictive Modeling and Visualizations

 By Chuan Yong (Bill) Guo

The purpose of this file is to act as a template to test various machine learning models and document possible options and parameters that can be manipulated for each model in a concise and easy to follow format. Each line of code is attached to a '' symbol in front of it to prevent the code from running as multiple codes within a code cell because some codes may not be allowed to run together. In order to run the line of code, remove the '' symbol on front of the desired code line. The purpose of this setup package multiple concepts within a single code cell for easy access and understanding, without having the cell run into errors trying to run multiple contradictory codes.

In [1]:
#This cell is used to import modules that will be useful for model prediction. 

import pandas as pd #Pandas is the base Python library used to manuipulate datasets, typically this will always be useful. 
import numpy as np #Numpy is mainly used to allow for mathematical manipulations of datasets, typically this will always be useful.


In [27]:
#Importing datasets
df_original = pd.read_csv('Datasets/salarysample.csv', usecols=['Job Title','Avg Salary(K)','Job Description','Location']) #Used to read .csv files. usecols parameter selects for specific columns to be loaded. 
df_original2 = pd.read_excel('Datasets/US_Population_2020_2021.xlsx', sheet_name='2021', usecols=['Geographic Area','Population']) #Used to read .xls files, sheet_name can be used to select for different sheets by using index number or string name.
df1 = df_original #Good idea to create a new object that is a copy of the original dataset so that the original dataset isn't directly manipulated
df2 = df_original2
#df_original.head() #Initial dataset without any manipulation.
df2

Unnamed: 0,Geographic Area,Population
0,"New York city, New York",8467513
1,"Los Angeles city, California",3849297
2,"Chicago city, Illinois",2696555
3,"Houston city, Texas",2288250
4,"Phoenix city, Arizona",1624569
...,...,...
799,"Cypress city, California",49926
800,"Murray city, Utah",49729
801,"Chesterfield city, Missouri",49703
802,"Downers Grove village, Illinois",49654


### Basic Column and Row Manipulation

In [28]:
###'''Data Cleaning'''

###'''Declaring and resetting columns as index'''
#df.set_index('Job Title',inplace=True)
#df.reset_index(inplace=True)

###'''Adding a column based on other columns'''
#df['New column'] = (df['Lower Salary'] + df['Upper Salary'])/2 #Mathematical manipulation

#Method for adding a column based on condition of existing column, can edit conditions to suit needs.
    #def output_element(input_element): 
        #if input_element < 50:
            #return 'Low'
        #elif input_element >= 50 and input_element < 80:
            #return 'Medium'
        #else:
            #return 'High'
    #df['New Column'] = df['Lower Salary'].apply(output_element)
    #df

###'''Removing columns'''
#df.drop(columns={'Rating','Location'},inplace=True) #By column name.
#df.drop(df.iloc[:, 1:3],axis=1,inplace=True) #By column index with range of columns (axis=1 indicates columns)

###'''Viewing multiple columns'''
#df[['New column','Job Title']] #By column name
#df.loc[:,'Rating':'Location'] #By column name(another format)
#df.iloc[:,[1,2,5]] #By column index
#df.iloc[:,1:5] #By column index with range of columns

###'''Adding a row'''
#df.loc[len(df)] = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41] #Add new row at the end of dataframe, or any row by specifying index number, number of elements must match column number

###'''Removing rows'''
#df.drop([0,1])

###'''Viewing multiple rows'''
#df.iloc[3:9,1:5]



### Merging Multiple Datasets

In [29]:
###'''Merging 2 datasets'''
df1['Location'] = df1['Location'].apply(lambda comma: comma.split(',')[0])

df2['Geographic Area']=df2['Geographic Area'].apply(lambda symbol : symbol.replace(' city',''))
df2['Geographic Area'] = df2['Geographic Area'].apply(lambda comma: comma.split(',')[0])
df2['Location']=df2['Geographic Area']
df3=pd.merge(df1, df2, left_on='Location', right_on='Geographic Area', how='left')
df3['Salary']=df3['Avg Salary(K)']*1000

def categorized_title(job_title):
    if 'Scientist' in job_title:
        return 'Data Scientist'
    elif 'Engineer' in job_title:
        return 'Data Engineer'
    elif 'Analyst' in job_title:
        return 'Data Analyst'
    else:
        return 'Other'
df3['Categorized Job Title'] = df3['Job Title'].apply(categorized_title)



df3.drop(columns={'Geographic Area','Avg Salary(K)','Job Description','Job Title','Location_x','Location_y'},inplace=True)
df3['Population'] = df3['Population'].fillna(df3['Population'].mean())
df3

Unnamed: 0,Population,Salary,Categorized Job Title
0,5.625990e+05,72000.0,Data Scientist
1,1.190702e+06,87500.0,Data Scientist
2,1.166740e+05,85000.0,Data Scientist
3,6.192900e+04,76500.0,Data Scientist
4,8.467513e+06,114500.0,Data Scientist
...,...,...,...
804,1.170900e+05,84500.0,Data Scientist
805,1.190702e+06,102500.0,Data Engineer
806,3.004310e+05,73500.0,Data Scientist
807,1.259440e+05,127500.0,Other


### Data Values Manipulation

In [32]:
###'''Visualizing datatypes and null values each column contains'''
df3.info()
#df.columns
###'''Imputation (Replace null data with something or removing it completely)'''


<class 'pandas.core.frame.DataFrame'>
Int64Index: 809 entries, 0 to 808
Data columns (total 3 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Population             809 non-null    float64
 1   Salary                 809 non-null    float64
 2   Categorized Job Title  809 non-null    object 
dtypes: float64(2), object(1)
memory usage: 25.3+ KB


### Splitting Data For Testing and Training

In [33]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

df3 = pd.get_dummies(df3)

#model = RandomForestClassifier()
model =LinearRegression()
#model = RandomForestRegressor()

y=np.array(df3['Salary']).reshape(-1,1)
x=np.array(df3.drop(columns={"Salary"}))

xtrain, xtest, ytrain, ytest = train_test_split(x, y)
model.fit(xtrain, ytrain.ravel())
pred=model.predict(xtest)
print(mean_squared_error(pred, ytest, squared=False))


33776.00319749582


In [34]:
data= {
          'Population':[df2.loc[df2['Location']=='Boston']['Population'].item()]
          }
user_df = pd.DataFrame(data)


user_df['Data Analyst'] = 0
user_df['Data Engineer'] = 0
user_df['Data Scientist'] = 1
user_df['Other'] = 0

user_data = user_df['Population'].values.reshape(-1, 1)

variable= model.predict(user_df)

user_df['Salary Prediction']=variable
user_df[['Population','Salary Prediction']]



Unnamed: 0,Population,Salary Prediction
0,654776,108255.809117
