In [1]:
#Import dependencies

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np #very efficient array and linear algebra functions
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression #Scikit-learn machine learning library for Python
from sklearn import ensemble
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import statsmodels.api as sm

## Read the CSV and Perform Basic Data Cleaning

### Target variable is starting salary.  Goal of multiple linear regression model is to predict a person's starting salary based on school type and location

Potential Features = X = School Name, School Type, Mid-Career Median Salary, Mid-Career 10th Percentile Salary, Mid-Career 25th Percentil Salary, Mid-Career 75th Percentile Salary, Mid-Career 90th Percentile Salary

Value to Predict = Y = Starting Median Salary

In [2]:
#Load the CSV file as a Pandas DataFrame and preview the DataFrame
df = pd.read_csv("salaries-by-college-type.csv")
df.head()

Unnamed: 0,School Name,School Type,Starting Median Salary,Mid-Career Median Salary,Mid-Career 10th Percentile Salary,Mid-Career 25th Percentile Salary,Mid-Career 75th Percentile Salary,Mid-Career 90th Percentile Salary
0,Massachusetts Institute of Technology (MIT),Engineering,"$72,200.00","$126,000.00","$76,800.00","$99,200.00","$168,000.00","$220,000.00"
1,California Institute of Technology (CIT),Engineering,"$75,500.00","$123,000.00",,"$104,000.00","$161,000.00",
2,Harvey Mudd College,Engineering,"$71,800.00","$122,000.00",,"$96,000.00","$180,000.00",
3,"Polytechnic University of New York, Brooklyn",Engineering,"$62,400.00","$114,000.00","$66,800.00","$94,300.00","$143,000.00","$190,000.00"
4,Cooper Union,Engineering,"$62,200.00","$114,000.00",,"$80,200.00","$142,000.00",


In [3]:
#Remove the fields from the data set that we don't want to include in our model
del df['School Name']
del df['Mid-Career Median Salary']
del df['Mid-Career 10th Percentile Salary']
del df['Mid-Career 25th Percentile Salary']
del df['Mid-Career 75th Percentile Salary']
del df['Mid-Career 90th Percentile Salary']

In [4]:
df.head()

Unnamed: 0,School Type,Starting Median Salary
0,Engineering,"$72,200.00"
1,Engineering,"$75,500.00"
2,Engineering,"$71,800.00"
3,Engineering,"$62,400.00"
4,Engineering,"$62,200.00"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   School Type             269 non-null    object
 1   Starting Median Salary  269 non-null    object
dtypes: object(2)
memory usage: 4.3+ KB


In [6]:
df['Starting Median Salary']=(df['Starting Median Salary'].replace( '[\$,)]','', regex=True )
                   .replace( '[(]','-',   regex=True ).astype(float))

In [7]:

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 269 entries, 0 to 268
Data columns (total 2 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   School Type             269 non-null    object 
 1   Starting Median Salary  269 non-null    float64
dtypes: float64(1), object(1)
memory usage: 4.3+ KB


In [8]:
df.head()

Unnamed: 0,School Type,Starting Median Salary
0,Engineering,72200.0
1,Engineering,75500.0
2,Engineering,71800.0
3,Engineering,62400.0
4,Engineering,62200.0


In [9]:
#Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index
df_cat

Index(['School Type'], dtype='object')

In [10]:
#Check out the number of unique values in each column
df[df_cat].nunique()

School Type    5
dtype: int64

In [11]:
#Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

#Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]))

#Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()

Unnamed: 0,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0


In [12]:
#Merge one-hot encoded features and drop the originals
df = df.merge(encode_df, left_index=True, right_index=True)
df = df.drop(df_cat,1)
df.head()

Unnamed: 0,Starting Median Salary,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,72200.0,1.0,0.0,0.0,0.0,0.0
1,75500.0,1.0,0.0,0.0,0.0,0.0
2,71800.0,1.0,0.0,0.0,0.0,0.0
3,62400.0,1.0,0.0,0.0,0.0,0.0
4,62200.0,1.0,0.0,0.0,0.0,0.0


In [14]:
df.rename(columns = {'Starting Median Salary':'Starting_Median_Salary'}, inplace=True)

In [15]:
df.head()

Unnamed: 0,Starting_Median_Salary,School Type_Engineering,School Type_Ivy League,School Type_Liberal Arts,School Type_Party,School Type_State
0,72200.0,1.0,0.0,0.0,0.0,0.0
1,75500.0,1.0,0.0,0.0,0.0,0.0
2,71800.0,1.0,0.0,0.0,0.0,0.0
3,62400.0,1.0,0.0,0.0,0.0,0.0
4,62200.0,1.0,0.0,0.0,0.0,0.0


In [16]:
#Remove the target from features data
X = df.drop(columns=["Starting_Median_Salary"]).values
y = df.Starting_Median_Salary.values

In [17]:
df.Starting_Median_Salary.values

array([72200., 75500., 71800., 62400., 62200., 61000., 61800., 61100.,
       58300., 58100., 60600., 56000., 53000., 53500., 55800., 51000.,
       48900., 52700., 46200., 52900., 52000., 50500., 49700., 44500.,
       47100., 46900., 44100., 49900., 47400., 46300., 44700., 42600.,
       41300., 41400., 41800., 43100., 43800., 42200., 42100., 54100.,
       52800., 54500., 53900., 48100., 50200., 51900., 53600., 49700.,
       46100., 47500., 51700., 48600., 46500., 47300., 47200., 48600.,
       46000., 47700., 42400., 49100., 41400., 45300., 46400., 44700.,
       45500., 44000., 49200., 42600., 42800., 42000., 43400., 46600.,
       38500., 40500., 44500., 43500., 42100., 41800., 39200., 42600.,
       41600., 42500., 38900., 42000., 41500., 39500., 58000., 66500.,
       59100., 63400., 60900., 60300., 56200., 59400., 59900., 52700.,
       57200., 52600., 51100., 52300., 47100., 48300., 52900., 49700.,
       53600., 57100., 53500., 52000., 50500., 49700., 52700., 49500.,
      

In [18]:
#Split into a training set and a test set
from sklearn.model_selection import train_test_split

#training set will be the larger portion of the data, typically 70% or more
#after the split will have four sets of data: x_train, x_test, y_train, y_test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [30]:
regressor = LinearRegression()
regressor.fit(X_train,y_train)

LinearRegression()

In [31]:
y_pred = regressor.predict(X_test)

In [32]:
plt.scatter(X_test['Starting_Median_Salary'], y_test, color='red')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [19]:
#Create a StandardScalar instance
scaler = StandardScaler()

In [20]:
#Fit the StandardScaler
X_scaler= scaler.fit(X_train)

In [21]:
#Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [22]:
reg = LinearRegression()

In [23]:
reg.fit(X_train_scaled,y_train)

LinearRegression()

In [24]:
y_pred=reg.predict(X_train_scaled)
print(y_pred.shape)

(188,)


In [25]:
y_pred_test=reg.predict(X_test_scaled)
print(y_pred_test)

[44098.31932773 44098.31932773 45677.77777778 44098.31932773
 44098.31932773 44098.31932773 44098.31932773 44098.31932773
 60884.61538462 60884.61538462 44098.31932773 44098.31932773
 44098.31932773 44098.31932773 44098.31932773 44098.31932773
 44098.31932773 44098.31932773 60660.         45677.77777778
 44098.31932773 44098.31932773 45677.77777778 45677.77777778
 44098.31932773 45677.77777778 44098.31932773 45677.77777778
 44098.31932773 44098.31932773 60884.61538462 44098.31932773
 44098.31932773 44098.31932773 45677.77777778 44098.31932773
 45677.77777778 44098.31932773 45273.33333333 44098.31932773
 44098.31932773 44098.31932773 45273.33333333 44098.31932773
 44098.31932773 45273.33333333 44098.31932773 44098.31932773
 44098.31932773 44098.31932773 44098.31932773 60884.61538462
 45273.33333333 44098.31932773 44098.31932773 60884.61538462
 44098.31932773 44098.31932773 44098.31932773 60660.
 44098.31932773 45677.77777778 44098.31932773 44098.31932773
 45677.77777778 44098.31932773 4

In [33]:
print(X)

[[1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]]


In [29]:
#y_pred
plt.scatter(X_test_scaled ['Starting_Median_Salary'], y_pred_test, color='red')

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [87]:
print(reg.coef_)

[ 9565.80621992  9341.19083531 -5641.03138691 -6045.47583136
 -7220.48983696]


In [88]:
print(reg.intercept_)

51318.80916469152


In [101]:
#Find the error rate on the training set (mean absolute error is the measure of the average prediction error across the data set)
mse = mean_absolute_error(y_train, reg.predict(X_train))
mse

3503.0291985861445

In [102]:
#Find the error rate on the test set
msetest = mean_absolute_error(y_test, reg.predict(X_test))
msetest

3475.435973693054

In [111]:
from sklearn.metrics import mean_squared_error

mean_squared_error(y_pred, y_train,squared=False)

4553.663560453057

In [97]:
#test model after training by calling the predict method and giving it the x test variable as an arguement
reg.predict


<bound method LinearModel.predict of LinearRegression()>

In [99]:
#Assess the performance of the model with results

#from sklearn.metrics import accuracy_score
#print(accuracy_score(y_test,y_pred))

ValueError: endog and exog matrices are different sizes