In [2]:
#Import dependencies

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np #very efficient array and linear algebra functions
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression #Scikit-learn machine learning library for Python
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder

## Read the CSV and Perform Basic Data Cleaning

### Target variable is starting salary.  Goal of multiple linear regression model is to predict a person's starting salary based on school type and location

Potential Features = X = School Name, School Type, Mid-Career Median Salary, Mid-Career 10th Percentile Salary, Mid-Career 25th Percentile Salary, Mid-Career 75th Percentile Salary, Mid-Career 90th Percentile Salary

Value to Predict = Y = Starting Median Salary

In [3]:
# Read the raw salaries-by-school-type table into a DataFrame and preview its first rows
csv_path = Path("salaries-by-college-type.csv")
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00","$76,800.00","$99,200.00","$168,000.00","$220,000.00"
1,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",
2,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00",,"$96,000.00","$180,000.00",
3,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00","$66,800.00","$94,300.00","$143,000.00","$190,000.00"
4,Cooper Union,Engineering,"$62,200.00","$114,000.00",,"$80,200.00","$142,000.00",


In [4]:
# Keep only the columns used by the model (School Type and Starting Median Salary);
# the school name and every mid-career salary column are discarded.
unused_columns = [
    'School Name',
    'Mid-Career Median Salary',
    'Mid-Career 10th Percentile Salary',
    'Mid-Career 25th Percentile Salary',
    'Mid-Career 75th Percentile Salary',
    'Mid-Career 90th Percentile Salary',
]
df = df.drop(columns=unused_columns)

In [6]:
# Preview the frame after the column removal: only School Type and Starting Median Salary remain
df.head()

Unnamed: 0,School Type,Starting Median Salary
0,Engineering,"$72,200.00"
1,Engineering,"$75,500.00"
2,Engineering,"$71,800.00"
3,Engineering,"$62,400.00"
4,Engineering,"$62,200.00"


In [5]:
# Inspect dtypes and non-null counts; the salary column is still stored as strings (object) here
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   School Type             269 non-null    object
 1   Starting Median Salary  269 non-null    object
dtypes: object(2)
memory usage: 4.3+ KB


In [7]:
# Convert currency strings such as "$72,200.00" into floats.
# Raw-string patterns are used because "\$" is not a valid Python string escape
# (it triggers a SyntaxWarning on Python 3.12+); the regex itself is unchanged.
# Step 1 strips "$", "," and ")"; step 2 rewrites "(" as "-" so accounting-style
# negatives like "($1,000.00)" parse as -1000.0.
df['Starting Median Salary'] = (
    df['Starting Median Salary']
    .replace(r'[\$,)]', '', regex=True)
    .replace(r'[(]', '-', regex=True)
    .astype(float)
)

In [8]:

# Re-check dtypes to confirm the salary column was converted to float64
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   School Type             269 non-null    object 
 1   Starting Median Salary  269 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.3+ KB


In [23]:
# Preview the cleaned frame: salaries are now plain floats
df.head()

Unnamed: 0,School Type,Starting Median Salary
0,Engineering,72200.0
1,Engineering,75500.0
2,Engineering,71800.0
3,Engineering,62400.0
4,Engineering,62200.0


In [24]:
# Collect the names of all object-dtype columns; these are the categorical variables to encode
object_mask = (df.dtypes == "object").to_numpy()
df_cat = df.columns[object_mask]
df_cat

Index(['School Type'], dtype='object')

In [25]:
# Count the distinct categories in each categorical column (one one-hot column is created per category)
df.loc[:, df_cat].nunique()

School Type    5
dtype: int64

In [26]:
# One-hot encode the categorical columns.
# The encoder keeps its default (sparse) output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4, so not passing it keeps this cell portable.
enc = OneHotEncoder()
encoded_values = enc.fit_transform(df[df_cat]).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
feature_names_fn = getattr(enc, "get_feature_names_out", None) or enc.get_feature_names

# Reuse df's index so the later index-based merge lines up row for row
encode_df = pd.DataFrame(
    encoded_values,
    columns=feature_names_fn(df_cat),
    index=df.index,
)
encode_df.head()

Unnamed: 0,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [27]:
# Attach the one-hot columns by index, then drop the original categorical columns.
# `columns=` replaces the positional axis argument (`df.drop(df_cat, 1)`), which was
# deprecated in pandas 1.3 and raises a TypeError on pandas 2.0+.
df = df.merge(encode_df, left_index=True, right_index=True)
df = df.drop(columns=df_cat)
df.head()

Unnamed: 0,Starting Median Salary,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,72200.0,1.0,0.0,0.0,0.0,0.0
1,75500.0,1.0,0.0,0.0,0.0,0.0
2,71800.0,1.0,0.0,0.0,0.0,0.0
3,62400.0,1.0,0.0,0.0,0.0,0.0
4,62200.0,1.0,0.0,0.0,0.0,0.0


In [28]:
#Replace the categorical data with one-hot encoded data
#features_df = pd.get_dummies(df, columns=['School Type'])
#features_df

In [35]:
# Give the target column an identifier-safe name so it can be read via attribute access later
df = df.rename(columns={'Starting Median Salary': 'Starting_Median_Salary'})

In [36]:
# Confirm the target column now appears as Starting_Median_Salary
df.head()

Unnamed: 0,Starting_Median_Salary,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,72200.0,1.0,0.0,0.0,0.0,0.0
1,75500.0,1.0,0.0,0.0,0.0,0.0
2,71800.0,1.0,0.0,0.0,0.0,0.0
3,62400.0,1.0,0.0,0.0,0.0,0.0
4,62200.0,1.0,0.0,0.0,0.0,0.0


In [29]:
#df.dtypes

In [30]:
#features_df.dtypes

In [191]:
#plt.scatter(df.SchoolType, df.StartingMedianSalary)

In [19]:
#features_df['Starting Median Salary'] = pd.to_numeric(df['Starting Median Salary'], errors='coerce')
#features_df.dtypes

Starting Median Salary      float64
School Type_Engineering       uint8
School Type_Ivy League        uint8
School Type_Liberal Arts      uint8
School Type_Party             uint8
School Type_State             uint8
dtype: object

In [38]:
# Separate the target vector (y) from the feature matrix (X), both as NumPy arrays
target_column = "Starting_Median_Salary"
y = df[target_column].values
X = df.drop(columns=[target_column]).values

In [46]:
# Partition the data into training and test subsets
from sklearn.model_selection import train_test_split

# 70% of the rows go to training and 30% are held out for testing;
# random_state=0 fixes the shuffle so the split is reproducible.
# The result unpacks into four arrays: X_train, X_test, y_train, y_test.
split_parts = train_test_split(X, y, test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = split_parts

In [47]:
# Ordinary least-squares regressor with scikit-learn's default settings.
# NOTE(review): the feature matrix holds one dummy column per school type; combined with the
# model's default intercept these columns are collinear, so individual coefficients may not be
# interpretable on their own — consider dropping one category if coefficients matter. TODO confirm.
reg = LinearRegression()

In [48]:
# Train the linear model on the training split (the fitted estimator is echoed as the cell output)
reg.fit(X=X_train, y=y_train)

LinearRegression()

In [45]:
#After splitting the data, initialize and train the model using the training sets X_train and y_train
#GradientBoostingRegressor builds models for value prediction
#Hyperparameters:
#n_estimators tells the model how many decision trees to build
#learning_rate controls how much each additional decision tree influences the overall prediction; lower rates usually
#lead to higher accuracy, but only work if n_estimators is set to a high value
#max_depth controls how many layers deep each individual decision tree can be
#min_samples_leaf controls how many times a value must appear in our training set for the decision tree to make a
#decision based on it
#max_features is the % of features in our model that we randomly consider each time we create a branch in our
#decision tree
#loss controls how scikit-learn calculates the error rate (or cost) as it learns
#model = ensemble.GradientBoostingRegressor(
    #n_estimators=150,
    #learning_rate=0.5,
    #max_depth=6,
    #min_samples_leaf=9,
    #max_features=0.1,
    #loss='huber'
#)

#model.fit(X_train,y_train)

GradientBoostingRegressor(learning_rate=0.5, loss='huber', max_depth=6,
                          max_features=0.1, min_samples_leaf=9,
                          n_estimators=150)

In [None]:
#Find the error rate on the training set (mean absolute error is the measure of the average prediction error across the data set)
#mse = mean_absolute_error(y_train, model.predict(X_train))
#print("Training Set Mean Absolute Error: %.4f" %mse)

In [None]:
#Find the error rate on the test set
#mse = mean_absolute_error(y_test, model.predict(X_test))
#print("Test Set Mean Absolute Error: %.4f" %mse)

In [None]:
#Test the model after training by calling the predict method with X_test as its argument
#model.predict


In [None]:
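#Assess the performance of the model on the test set (note: accuracy_score is a classification metric; a regression model should be scored with e.g. r2_score or mean_absolute_error)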

#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test,y_pred))