In [24]:
import numpy as np 
import pandas as pd 
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
import scipy.stats as stats 
from math import sqrt
from matplotlib import pyplot as plt 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from numpy import mean

In [25]:
from google.colab import drive
drive.mount("/content/gdrive")

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [26]:
data = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/SimpleWeather.csv')

In [28]:
data.head()

Unnamed: 0.1,Unnamed: 0,Temperature (C),Apparent Temperature (C),Humidity,Wind Speed (km/h),Wind Bearing (degrees),Visibility (km),Pressure (millibars)
0,0,9.472222,7.388889,0.89,14.1197,251.0,15.8263,1015.13
1,1,9.355556,7.227778,0.86,14.2646,259.0,15.8263,1015.63
2,2,9.377778,9.377778,0.89,3.9284,204.0,14.9569,1015.94
3,3,8.288889,5.944444,0.83,14.1036,269.0,15.8263,1016.41
4,4,8.755556,6.977778,0.83,11.0446,259.0,15.8263,1016.51


In [29]:
y = data["Temperature (C)"]
X = data.drop(["Temperature (C)"], axis = 1)

In [30]:
stdScaler = StandardScaler()
stdScaler.fit(X)
X = pd.DataFrame(stdScaler.transform(X), columns=X.columns)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)

In [32]:
models = {
                "Linear regression": LinearRegression(),
                 "Ridge regression": Ridge(),
                 "Lasso regression": Lasso(),
           "Elastic Net regression": ElasticNet(),
   "K-nearest Neighbors regression": KNeighborsRegressor(),
         "Decision Tree regression": DecisionTreeRegressor(),
         "Random Forest regression": RandomForestRegressor()
}

In [33]:
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " - Trained")

Linear regression - Trained
Ridge regression - Trained
Lasso regression - Trained
Elastic Net regression - Trained
K-nearest Neighbors regression - Trained
Decision Tree regression - Trained
Random Forest regression - Trained


In [34]:
predictions = {}
for name, model in models.items():
    y_pred = model.predict(X_test)  
    predictions[name] = y_pred

In [35]:
index = list(predictions.keys())[0]
model_eval = pd.DataFrame(index=[index], columns=["r2 score", "RMSE"])
for key, value in predictions.items():
    model_eval.loc[key, "r2 score"] = "{:.8f}".format(r2_score(y_test, value)) 
    model_eval.loc[key, "RMSE"] = "{:.8f}".format(np.sqrt(mean_squared_error(y_test, value)))
model_eval.reset_index(inplace = True)

In [36]:
model_eval.style

Unnamed: 0,index,r2 score,RMSE
0,Linear regression,0.99015374,0.9498136
1,Ridge regression,0.99015376,0.94981284
2,Lasso regression,0.97442249,1.53084743
3,Elastic Net regression,0.86942549,3.45885208
4,K-nearest Neighbors regression,0.99025145,0.94508911
5,Decision Tree regression,0.9999041,0.09373811
6,Random Forest regression,0.99996622,0.05563458


In [37]:
max_val_score = {}

In [38]:
lin_model_kfold = KFold(n_splits=5)

print('-----Cross Validation Scores----')
for name, model in models.items():
    score = mean(cross_val_score(model, X, y, cv=lin_model_kfold))
    print('{:s} model: {:.5f}'.format(name, score))
    if not bool(max_val_score):
        max_val_score['model'] = name
        max_val_score['score'] = score
    elif(max_val_score['score'] < score):
        max_val_score['model'] = name
        max_val_score['score'] = score
print('\n\nThe final model that fits this regression problem best is the {:s} model with a score of {:.5f}'.format(max_val_score['model'], max_val_score['score'])) 

-----Cross Validation Scores----
Linear regression model: 0.98997
Ridge regression model: 0.98997
Lasso regression model: 0.97424
Elastic Net regression model: 0.86931
K-nearest Neighbors regression model: 0.97559
Decision Tree regression model: 0.99982
Random Forest regression model: 0.99987


The final model that fits this regression problem best is the Random Forest regression model with a score of 0.99987
