# Prediction Model Environment/Template

### By Chuan Yong (Bill) Guo

The purpose of this file is to act as a template to test various machine learning models and document possible options and parameters that can be manipulated for each model in a concise and easy to follow format. Each line of code is attached to a '#' symbol in front of it to prevent the code from running as multiple codes within a code cell because some codes may not be allowed to run together. In order to run the line of code, remove the '#' symbol on front of the desired code line. The purpose of this setup package multiple concepts within a single code cell for easy access and understanding, without having the cell run into errors trying to run multiple contradictory codes.

In [1]:
#This cell is used to import modules that will be useful for model prediction. 

import pandas as pd #Pandas is the base Python library used to manuipulate datasets, typically this will always be useful. 
import numpy as np #Numpy is mainly used to allow for mathematical manipulations of datasets, typically this will always be useful.


In [7]:
#Importing datasets
#df_original = pd.read_csv('Datasets/World Cities Database Population (OCT-2022).csv') #Used to read .csv files.
#df = df_original #Good idea to create a new object that is a copy of the original dataset so that the original dataset isn't directly manipulated
df2 = pd.read_excel('Datasets/US_Population_2020_2021.xlsx', sheet_name='2021') #Used to read .xls files, sheet_name can be used to select for different sheets by using index number or string name.
#df_original.head() #Initial dataset without any manipulation.
df2

Unnamed: 0,Geographic Area,Population
0,"New York city, New York",8467513
1,"Los Angeles city, California",3849297
2,"Chicago city, Illinois",2696555
3,"Houston city, Texas",2288250
4,"Phoenix city, Arizona",1624569
...,...,...
799,"Cypress city, California",49926
800,"Murray city, Utah",49729
801,"Chesterfield city, Missouri",49703
802,"Downers Grove village, Illinois",49654


### Basic Column and Row Manipulation

In [46]:
###'''Data Cleaning'''

###'''Declaring and resetting columns as index'''
#df.set_index('Job Title',inplace=True)
#df.reset_index(inplace=True)

###'''Adding a column based on other columns'''
#df['New column'] = (df['Lower Salary'] + df['Upper Salary'])/2

###'''Removing columns'''
#df.drop(columns={'Rating','Location'},inplace=True) #By column name.
#df.drop(df.iloc[:, 1:3],axis=1,inplace=True) #By column index with range of columns (axis=1 indicates columns)

###'''Viewing multiple columns'''
#df[['New column','Job Title']] #By column name
#df.loc[:,'Rating':'Location'] #By column name(another format)
#df.iloc[:,[1,2,5]] #By column index
#df.iloc[:,1:5] #By column index with range of columns

###'''Adding a row'''
#df.loc[len(df)] = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41] #Add new row at the end of dataframe, or any row by specifying index number, number of elements must match column number

###'''Removing rows'''
#df.drop([0,1])

###'''Viewing multiple rows'''
#df.iloc[3:9,1:5]



### Data values/strings Manipulation

In [58]:
###'''Visualizing datatypes and null values each column contains'''
df.info()
df.columns
###'''Imputation (Replace null data with something or removing it completely)'''


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 224 entries, 0 to 223
Data columns (total 56 columns):
 #   Column                                                                    Non-Null Count  Dtype  
---  ------                                                                    --------------  -----  
 0   Unnamed: 0                                                                224 non-null    int64  
 1   City                                                                      224 non-null    object 
 2   Cost_index                                                                224 non-null    float64
 3   Meal, Inexpensive Restaurant                                              224 non-null    float64
 4   Meal for 2 People, Mid-range Restaurant, Three-course                     224 non-null    float64
 5   McMeal at McDonalds (or Equivalent Combo Meal)                            224 non-null    float64
 6   Domestic Beer (0.5 liter draught)                                 

Index(['Unnamed: 0', 'City', 'Cost_index', 'Meal, Inexpensive Restaurant',
       'Meal for 2 People, Mid-range Restaurant, Three-course',
       'McMeal at McDonalds (or Equivalent Combo Meal)',
       'Domestic Beer (0.5 liter draught)',
       'Imported Beer (0.33 liter bottle)', 'Cappuccino (regular)',
       'Coke/Pepsi (0.33 liter bottle)', 'Milk (regular), (1 liter)',
       'Loaf of Fresh White Bread (500g)', 'Rice (white), (1kg)',
       'Eggs (regular) (12)', 'Local Cheese (1kg)', 'Chicken Fillets (1kg)',
       'Beef Round (1kg) (or Equivalent Back Leg Red Meat)', 'Apples (1kg)',
       'Banana (1kg)', 'Oranges (1kg)', 'Tomato (1kg)', 'Potato (1kg)',
       'Onion (1kg)', 'Lettuce (1 head)', 'Water (1.5 liter bottle)',
       'Bottle of Wine (Mid-Range)', 'Domestic Beer (0.5 liter bottle)',
       'Imported Beer (0.33 liter bottle).1', 'Cigarettes 20 Pack (Marlboro)',
       'One-way Ticket (Local Transport)', 'Monthly Pass (Regular Price)',
       'Taxi Start (Normal Tariff

### Splitting data for testing and training, test models

In [38]:
df = df_original
df = df[['Job Title', 'Lower Salary','Location']]
df = pd.get_dummies(df)
df

Unnamed: 0,Lower Salary,Job Title_Ag Data Scientist,Job Title_Analytics - Business Assurance Data Analyst,Job Title_Analytics Consultant,Job Title_Analytics Manager,Job Title_Analytics Manager - Data Mart,"Job Title_Assistant Director/Director, Office of Data Science",Job Title_Associate Data Analyst,Job Title_Associate Data Analyst- Graduate Development Program,Job Title_Associate Data Engineer,...,"Location_Washington, DC","Location_Watertown, MA","Location_West Palm Beach, FL","Location_West Reading, PA","Location_Westlake, OH","Location_Winston-Salem, NC","Location_Winter Park, FL","Location_Woburn, MA","Location_Woodbridge, NJ","Location_Worcester, MA"
0,53,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,63,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,80,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,86,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
737,58,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
738,72,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
739,56,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
740,95,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
from sklearn.model_selection import train_test_split
y = np.array(df['Lower Salary'])
x = np.array(df.drop(columns={'Lower Salary'}))
xTrain, xTest, yTrain, yTest = train_test_split(x, y)

In [43]:
from sklearn.linear_model import LinearRegression
model1 = LinearRegression().fit(xTrain,yTrain)
from sklearn.metrics import mean_squared_error
model1.predict(xTest)

array([ 5.66406250e+01,  5.87968750e+01,  1.01515625e+02,  4.57968750e+01,
       -7.60586793e+14, -4.32640830e+14, -5.80020272e+14,  8.28593750e+01,
        1.82490803e+14,  6.23750000e+01,  6.15000000e+01,  4.22968750e+01,
        9.01718750e+01,  5.08906250e+01,  7.97656250e+01, -2.17471498e+13,
        9.14314064e+14,  1.44468750e+02,  1.19140625e+02,  5.30468750e+01,
        3.99218750e+01,  7.34531250e+01, -7.78131563e+13, -9.02634582e+14,
        7.29531250e+01, -1.34943444e+15,  1.85156250e+01,  6.05109198e+14,
       -8.97136647e+13,  5.03125000e+01,  1.42250000e+02, -2.43836179e+13,
        9.16562500e+01,  1.26843750e+02,  1.04703125e+02,  9.95468750e+01,
       -1.99033205e+14, -2.77401919e+14,  5.13906250e+01,  6.78593750e+01,
        5.64531250e+01,  1.07515625e+02, -9.84861424e+14,  3.99350348e+14,
        1.01515625e+02,  4.03906250e+01,  9.08593750e+01,  7.29817789e+13,
        8.05468750e+01,  5.53593750e+01,  1.00578125e+02,  6.72963773e+14,
        6.87968750e+01,  