In [None]:
#IMPORT LIBRARIES FOR DATA WRANGLING
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the dataset CSV and wrap it in a DataFrame.
weather = pd.read_csv("dummy_dataset.csv")
df = pd.DataFrame(weather)

# Drop the other measurement columns so that they are not used as features.
non_temperature_columns = ["Humidity", "Pressure", "Wind Direction", "Wind Speed"]
df_onlyTemp = df.drop(columns=non_temperature_columns)


In [None]:
# DataFrame.rename returns a NEW frame; without assigning the result back the
# columns keep their original names (with spaces). The mapping replaces the
# spaces in the multi-word city columns with underscores. Labels that are not
# present in the frame are ignored by rename, so this cell is safe to re-run.
city_column_renames = {
    "City_Saint Louis": "City_Saint_Louis",
    "City_San Francisco": "City_San_Francisco",
    "City_San Diego": "City_San_Diego",
    "City_San Antonio": "City_San_Antonio",
    "City_Tel Aviv District": "City_Tel_Aviv_District",
    "City_Kansas City": "City_Kansas_City",
    "City_Las Vegas": "City_Las_Vegas",
    "City_Los Angeles": "City_Los_Angeles",
    "City_New York": "City_New_York",
}
df_onlyTemp = df_onlyTemp.rename(columns=city_column_renames)
# Show only the first rows instead of dumping the whole frame.
df_onlyTemp.head()

In [None]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

In [None]:
# Everything except the target column is a feature (columns axis).
feature_frame = df_onlyTemp.drop(columns='Temperature')
# Keep the column names for later use, since the array below loses them.
feature_list = feature_frame.columns.tolist()
# Models are trained on plain numpy arrays.
features = np.array(feature_frame)
# The target we want to predict.
labels = np.array(df_onlyTemp['Temperature'])

In [None]:
# Hold out 35% of the rows for testing; the fixed seed makes the split repeatable.
split_parts = train_test_split(
    features,
    labels,
    test_size=0.35,
    random_state=42,
)
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
# Report the shape of each of the four arrays produced by the split.
for _split_name, _split_array in (
    ('Training Features', train_features),
    ('Training Labels', train_labels),
    ('Testing Features', test_features),
    ('Testing Labels', test_labels),
):
    print(_split_name + ' Shape:', _split_array.shape)

### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# 100 boosting rounds with a fixed seed, trained on the training split.
adab = AdaBoostRegressor(n_estimators=100, random_state=0)
adab.fit(X=train_features, y=train_labels)

In [None]:
# Score the fitted AdaBoost model on the held-out test split.
adab.score(X=test_features, y=test_labels)

### Linear Regression

In [None]:
# Author's note: the data is non-linear, so poor results are expected here.

from sklearn.linear_model import LinearRegression

# Work on separate array copies of the four split parts.
lin_train_features, lin_train_labels, lin_test_features, lin_test_labels = (
    np.array(part)
    for part in (train_features, train_labels, test_features, test_labels)
)

# Fit on the training copies, then display the test-set predictions.
linreg = LinearRegression().fit(lin_train_features, lin_train_labels)
linreg.predict(lin_test_features)

### Gradient Boosting Regressor

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor

# Fix the seed so repeated runs produce the same model, consistent with the
# seeded train/test split and the seeded AdaBoost model in this notebook.
# verbose=2 prints training progress while fitting.
gradboost_model = GradientBoostingRegressor(verbose=2, random_state=42)
gradboost_model.fit(train_features, train_labels)

In [None]:
# Display the gradient-boosting predictions for the test split.
gradboost_model.predict(X=test_features)

In [None]:
# Score the gradient-boosting model on the held-out test split.
gradboost_model.score(X=test_features, y=test_labels)

### Random Forest

In [None]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Train the model on training data
%time
model = RandomForestRegressor(n_estimators=100, verbose=2, n_jobs=2, oob_score= True).fit(train_features, train_labels)

In [None]:
import pickle

# NOTE(review): this replaces the `model` trained in the previous cell with
# whatever object is stored in RFmodel.pkl; every later use of `model` refers
# to the loaded object.
# SECURITY: pickle.load can execute arbitrary code — only load files you trust.
# The context manager guarantees the file handle is closed after loading.
with open("RFmodel.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [None]:
# Predict the target for every row of the test split with the current `model`.
predictions = model.predict(X=test_features)

In [None]:
# Score the current `model` on the held-out test split.
model.score(X=test_features, y=test_labels)

In [None]:
# Single hand-written input row (a list containing one feature vector).
# NOTE(review): the values must follow the same column order as the training
# features (see feature_list); presumably date/time fields followed by one-hot
# city flags — verify against feature_list before relying on the result.
userChoice = [[5, 2022, 4, 9, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# Predict for that one row with the current `model` and display the result.
predict = model.predict(userChoice)
predict

In [None]:
# Display the out-of-bag score stored on the current `model`.
# NOTE(review): `model` was replaced by the object loaded from RFmodel.pkl, so
# this attribute only exists if that object was fit with oob_score=True — verify.
model.oob_score_

In [None]:
# Element-wise absolute difference between predictions and true test labels.
errors = np.abs(predictions - test_labels)
# Mean absolute error (MAE), rounded to two decimals for display.
mean_abs_error = round(np.mean(errors), 2)
print('Mean Absolute Error:', mean_abs_error, 'degrees.')

In [None]:
# Calculate mean absolute percentage error (MAPE).
# The denominator must be the ABSOLUTE label value: dividing by a signed
# (negative) temperature yields a negative "percentage error" that wrongly
# inflates the accuracy. Labels equal to zero have no defined percentage error,
# so they are left as NaN (instead of inf) and excluded from the mean.
abs_labels = np.abs(test_labels)
mape = 100 * np.divide(
    errors,
    abs_labels,
    out=np.full(np.shape(errors), np.nan),
    where=abs_labels != 0,
)
# Calculate and display accuracy (100 minus the mean percentage error).
accuracy = 100 - np.nanmean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

### Decision Tree Regressor

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fix the seed so the fitted tree is reproducible between runs, consistent
# with the seeded train/test split and AdaBoost model in this notebook.
tree_model = DecisionTreeRegressor(random_state=42).fit(train_features, train_labels)

In [None]:
# Decision-tree predictions for the test split.
pred = tree_model.predict(X=test_features)


In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt

# Predict once per split and reuse the result for every metric, instead of
# re-running tree_model.predict for each metric.
tree_train_pred = tree_model.predict(train_features)
tree_mse = mean_squared_error(train_labels, tree_train_pred)
tree_mae = mean_absolute_error(train_labels, tree_train_pred)
print("Decision Tree training mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", sqrt(tree_mse))

# tree_mse / tree_mae are deliberately reassigned: after this cell they hold
# the TEST-split values.
tree_test_pred = tree_model.predict(test_features)
tree_mse = mean_squared_error(test_labels, tree_test_pred)
tree_mae = mean_absolute_error(test_labels, tree_test_pred)
print("Decision Tree Testing mse = ",tree_mse," & mae = ",tree_mae," & rmse = ", sqrt(tree_mse))

In [None]:
# Compare the tree's score on the data it was trained on with its score on
# the held-out data.
tree_train_score = tree_model.score(train_features, train_labels)
tree_test_score = tree_model.score(test_features, test_labels)
print("Training Score:", tree_train_score)
print("Testing Score:", tree_test_score)

### AutoRegressive (AR)

In [None]:
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.ar_model import AutoReg

# Augmented Dickey-Fuller test on the temperature column, with the lag length
# selected by AIC.
temperature_series = df_onlyTemp['Temperature']
df_stationarityTest = adfuller(temperature_series, autolag='AIC')
# The second element of the result is reported as the p-value.
adf_p_value = df_stationarityTest[1]
print("P-value: ", adf_p_value)

In [None]:
from statsmodels.graphics.tsaplots import plot_pacf

# Partial autocorrelation plot of the temperature column for the first 25 lags.
pacf = plot_pacf(x=df_onlyTemp['Temperature'], lags=25)

In [None]:
from statsmodels.tsa.ar_model import AutoReg

# AutoReg receives its data in the constructor; fit() takes no data arguments
# (its first positional parameter is the covariance type), so passing
# train_features / train_labels to fit() was incorrect.
# The model is fit on the temperature column in its original row order — the
# same series used for the ADF test and PACF plot — because train_labels was
# shuffled by train_test_split, which destroys the ordering an autoregressive
# model relies on.
# NOTE(review): assumes the rows of df_onlyTemp are in chronological order — confirm.
temperature_values = df_onlyTemp['Temperature'].to_numpy()
ar_model = AutoReg(temperature_values, lags=7).fit()
print(ar_model.summary())

### K Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Hyper-parameters for the k-nearest-neighbours regressor, spelled out explicitly.
knn_params = dict(
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    metric_params=None,
    n_jobs=2,
    n_neighbors=5,
    p=2,
    weights='uniform',
)
# Build the regressor and fit it on the training split in one step.
knn = KNeighborsRegressor(**knn_params).fit(train_features, train_labels)

In [None]:
# Single hand-written input row (a list containing one feature vector).
# NOTE(review): the values must follow the same column order as the training
# features (see feature_list); presumably date/time fields followed by one-hot
# city flags — verify against feature_list before relying on the result.
userChoice = [[5, 2022, 4, 11, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
# Print the KNN prediction for that one row.
print(knn.predict (userChoice))

In [None]:
# KNN predictions for the test split.
preds = knn.predict(X=test_features)