# DTSC 2031 (Spring 2024) Midterm Exam - Regression

1) Go through the code examples below as a reference. The dataset `tips` is used to build three different regression models (linear regression, KNN, random forest). 

2) Use the `diamonds` dataset available in the seaborn library to train a regression algorithm of your choice. Make sure to perform all the preprocessing steps. Use the regression model to predict the price of a diamond with the following characteristics:

* carat: 0.25
* cut: Premium
* color: J
* depth: 56.7
* table: 57.0
* x: 4.01
* y: 4.05
* z: 2.35


## Prepare the Data for Regression Problems

In [1]:
# importing required libraries
import pandas as pd
import numpy as np
import seaborn as sns

#list the example datasets that seaborn can load by name
#NOTE(review): seaborn documents this call as needing an internet connection;
#the recorded output below is a KeyboardInterrupt - confirm it does not hang offline
sns.get_dataset_names()

KeyboardInterrupt: 

In [None]:
# load seaborn's "tips" example dataset into a DataFrame
tips_df = sns.load_dataset(name="tips")

# show the first five rows to inspect the columns
tips_df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### Divide Data into Features and Labels

In [None]:
#labels: the "tip" column is the value the models will predict
y = tips_df["tip"]

#features: every column except the label
X = tips_df.drop(columns=["tip"])


KeyError: "['tip'] not found in axis"

In [None]:
#display the first five rows of the feature set (the tip column has been removed)
X.head()

Unnamed: 0,total_bill,sex,smoker,day,time,size
0,16.99,Female,No,Sun,Dinner,2
1,10.34,Male,No,Sun,Dinner,3
2,21.01,Male,No,Sun,Dinner,3
3,23.68,Male,No,Sun,Dinner,2
4,24.59,Female,No,Sun,Dinner,4


In [None]:
#display the first five labels (tip values)
y.head()

0    1.01
1    1.66
2    3.50
3    3.31
4    3.61
Name: tip, dtype: float64

### Convert Categorical Data to Numbers



In [None]:
#keep only the numeric features by dropping the four categorical columns
numerical = X.drop(columns=['sex', 'smoker', 'day', 'time'])


In [None]:
#printing the first five rows of the numeric features only
numerical.head()

Unnamed: 0,total_bill,size
0,16.99,2
1,10.34,3
2,21.01,3
3,23.68,2
4,24.59,4


In [None]:
#select the four categorical columns from the feature set
categorical = X.filter(items=['sex', 'smoker', 'day', 'time'])

#show the first five rows of the categorical features
categorical.head()

Unnamed: 0,sex,smoker,day,time
0,Female,No,Sun,Dinner
1,Male,No,Sun,Dinner
2,Male,No,Sun,Dinner
3,Male,No,Sun,Dinner
4,Female,No,Sun,Dinner


In [None]:
#perform one-hot encoding
import pandas as pd

#drop_first=True drops the first level of every categorical column, so each
#feature with k levels becomes k-1 indicator columns
#dtype=int keeps the indicators as 0/1 integers; pandas >= 2.0 would otherwise
#return True/False columns and the table below would not be reproduced
cat_numerical = pd.get_dummies(categorical, drop_first=True, dtype=int)
cat_numerical.head()

Unnamed: 0,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,1,1,0,0,1,1
1,0,1,0,0,1,1
2,0,1,0,0,1,1
3,0,1,0,0,1,1
4,1,1,0,0,1,1


In [None]:
#join the numeric columns and the one-hot encoded columns side by side;
#both frames share the same row index, so rows line up one-to-one
feature_frames = [numerical, cat_numerical]
X = pd.concat(feature_frames, axis="columns")
X.head()

Unnamed: 0,total_bill,size,sex_Female,smoker_No,day_Fri,day_Sat,day_Sun,time_Dinner
0,16.99,2,1,1,0,0,1,1
1,10.34,3,0,1,0,0,1,1
2,21.01,3,0,1,0,0,1,1
3,23.68,2,0,1,0,0,1,1
4,24.59,4,1,1,0,0,1,1


### Divide Data into Training and Test Datasets

In [None]:
#hold out 20% of the rows as a test set; the fixed random_state makes the
#split reproducible between runs
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

### Data Scaling/Normalization

In [None]:
# feature scaling using standard scaler
from sklearn.preprocessing import StandardScaler

# fit the scaler on the training features only, then apply that same
# fitted transformation to both the training and the test features
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Linear Regression 

In [None]:
#import the linear regression model and the metrics module from sklearn
from sklearn import metrics
from sklearn.linear_model import LinearRegression

#create the model and train it on the scaled training set
lin_reg = LinearRegression()
regressor = lin_reg.fit(X_train, y_train)

#predict tips for the held-out test set
y_pred = regressor.predict(X_test)

#evaluate: compute each error once and derive the RMSE from the stored MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.7080218832979821
Mean Squared Error: 0.89391952216096
Root Mean Squared Error: 0.9454731736865727


### KNN Regression

In [None]:
#import the KNN regressor and the metrics module from sklearn
from sklearn import metrics
from sklearn.neighbors import KNeighborsRegressor

#build a regressor that uses 5 neighbours and train it
knn_reg = KNeighborsRegressor(n_neighbors=5)
regressor = knn_reg.fit(X_train, y_train)

#predict tips for the held-out test set
y_pred = regressor.predict(X_test)

#evaluate: compute each error once and derive the RMSE from the stored MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.7513877551020406
Mean Squared Error: 0.9462902040816326
Root Mean Squared Error: 0.9727744877830794


### Random Forest Regression 

In [None]:
#import the random forest regressor and the metrics module from sklearn
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor

#500 trees; the fixed random_state makes the fitted forest reproducible
rf_reg = RandomForestRegressor(n_estimators=500, random_state=42)

#train the model
regressor = rf_reg.fit(X_train, y_train)

#make predictions on the held-out test set
y_pred = regressor.predict(X_test)

#evaluate: compute each error once and derive the RMSE from the stored MSE
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 0.7045665306122448
Mean Squared Error: 0.8020627395265322
Root Mean Squared Error: 0.8955795551074913


### Predicting a Single Value

In [None]:
#printing information about the 101st record in the dataset (index label 100)
tips_df.loc[100]

total_bill     11.35
tip              2.5
sex           Female
smoker           Yes
day              Fri
time          Dinner
size               2
Name: 100, dtype: object

In [None]:
# import random forest regressor from sklearn
from sklearn.ensemble import RandomForestRegressor
rf_reg = RandomForestRegressor(random_state=42, n_estimators=500)

#training the algorithm on the (already scaled) training set
regressor = rf_reg.fit(X_train, y_train)

#making a prediction for the 101st record (row position 100) of the dataset
#X.iloc[[100]] selects the row as a one-row DataFrame, so the column names the
#scaler was fitted with are preserved; a bare NumPy row loses them and
#makes scikit-learn warn about missing feature names
#the record must go through the same fitted scaler as the training data
single_record = sc.transform(X.iloc[[100]])
predicted_tip = regressor.predict(single_record)

#printing the predicted value
print(predicted_tip)

[2.26622]


