In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error , mean_absolute_percentage_error , mean_squared_error

# Read the project CSV into `df` and preview the first rows
csv_path = r'D:\allen\Documents\Mini Project 2 data.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Descriptive statistics for the columns of df
df.describe()


In [None]:
# Convert the 'Date' column to pandas datetime values (overwrites the column on df), then display it
# NOTE(review): no format/dayfirst argument is passed, so the date layout is inferred — confirm the parsed dates are correct
df['Date'] = pd.to_datetime(df['Date'])
df['Date']

In [None]:
# Show the column labels of df
df.columns


In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (such as the 'Date' column) are excluded first: they cannot be
# correlated, and DataFrame.corr() raises on them in recent pandas versions instead
# of silently dropping them.
numeric_df = df.select_dtypes(include='number')
plt.figure(figsize=(10,10))
ax = sns.heatmap(numeric_df.corr(), square=True, annot=True, fmt='.2f')
ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Wind speed against temperature; points are coloured by temperature
wind_fig, wind_ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(data=df, x='Wind_Speed', y='Temp', hue='Temp', ax=wind_ax);

In [None]:
# Pressure against temperature; points are coloured by temperature
pressure_fig, pressure_ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(data=df, x='Pressure', y='Temp', hue='Temp', ax=pressure_ax);

In [None]:
# Humidity against temperature; points are coloured by temperature
humidity_fig, humidity_ax = plt.subplots(figsize=(4, 4))
sns.scatterplot(data=df, x='Humidity', y='Temp', hue='Temp', ax=humidity_ax);

In [None]:
# Percentage of missing values in each column
missing_counts = df.isnull().sum()
missing_counts / len(df) * 100


In [None]:
# Load the dataset into `train` and `test`, then display both frames.
# NOTE(review): both reads use the same file path, so `train` and `test` hold identical
# data — confirm whether a separate test file or a hold-out split was intended.
train = pd.read_csv(r'D:\allen\Documents\Mini Project 2 data.csv')
test = pd.read_csv(r'D:\allen\Documents\Mini Project 2 data.csv')

display(train,test)

In [None]:
# Line plot of the Pressure column against the row index
train['Pressure'].plot()

In [None]:
# Line plot of the Humidity column against the row index
train['Humidity'].plot()

In [None]:
# Line plot of the Wind_Speed column against the row index
train['Wind_Speed'].plot()

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the Pressure series with a cycle length of 365 observations and plot the components
# (presumably one year of daily readings — TODO confirm the sampling frequency)
pressure_series = train['Pressure']
decomp = seasonal_decompose(pressure_series, period=365)
decomp.plot();

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the Humidity series with a cycle length of 365 observations and plot the components
# (presumably one year of daily readings — TODO confirm the sampling frequency)
humidity_series = train['Humidity']
decomp = seasonal_decompose(humidity_series, period=365)
decomp.plot();

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

# Decompose the Wind_Speed series with a cycle length of 365 observations and plot the components
# (presumably one year of daily readings — TODO confirm the sampling frequency)
wind_speed_series = train['Wind_Speed']
decomp = seasonal_decompose(wind_speed_series, period=365)
decomp.plot();

In [None]:
# Histogram of the residual component of the most recently computed decomposition (`decomp`)
decomp.resid.plot.hist()

In [None]:
#Subplots
# Boolean mask over df's columns marking the float64 ones, and the matching column names
t = df.dtypes.eq("float64")
num_cols = t[t].index.tolist()

# One figure per float column: box plot on the left, scatter plot against Temp on the right
for col in num_cols:
    print(col)
    pair_fig, (box_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 6))
    sns.boxplot(data=df, y=col, ax=box_ax)
    sns.scatterplot(data=df, x='Temp', y=col, ax=scatter_ax)
    plt.show()

In [None]:
#Quartiles for Pressure
from scipy import stats
q1 = df['Pressure'].quantile(q=0.25)
q3 = df['Pressure'].quantile(q=0.75)
print ("1st quartile =  ", q1)
print ("3rd quartile =  ", q3)
print("Interquartile range = ",stats.iqr(df['Pressure']))

#outlier detection
# Fences at 1.5 * IQR below the 1st quartile and above the 3rd quartile
Lower_outlier = q1 - 1.5*(q3-q1)
Upper_outlier = q3 + 1.5*(q3-q1)
print("Lower outlier is =  ",Lower_outlier)
print("Upper outlier is =  ",Upper_outlier)
# Count against the fences computed above rather than hard-coded numbers,
# so the counts stay correct whenever the underlying data changes
print("Number of lower outliers are = ", (df['Pressure'] < Lower_outlier).sum())
print("Number of upper outliers are = ", (df['Pressure'] > Upper_outlier).sum())

In [None]:
#Quartiles for Humidity
from scipy import stats
q1 = df['Humidity'].quantile(q=0.25)
q3 = df['Humidity'].quantile(q=0.75)
print ("1st quartile =  ", q1)
print ("3rd quartile =  ", q3)
print("Interquartile range = ",stats.iqr(df['Humidity']))

#outlier detection
# Fences at 1.5 * IQR below the 1st quartile and above the 3rd quartile
Lower_outlier = q1 - 1.5*(q3-q1)
Upper_outlier = q3 + 1.5*(q3-q1)
print("Lower outlier is =  ",Lower_outlier)
print("Upper outlier is =  ",Upper_outlier)
# Count against the fences computed above rather than hard-coded numbers,
# so the counts stay correct whenever the underlying data changes
print("Number of lower outliers are = ", (df['Humidity'] < Lower_outlier).sum())
print("Number of upper outliers are = ", (df['Humidity'] > Upper_outlier).sum())

In [None]:
#Quartiles for Wind Speed
from scipy import stats
q1 = df['Wind_Speed'].quantile(q=0.25)
q3 = df['Wind_Speed'].quantile(q=0.75)
print ("1st quartile =  ", q1)
print ("3rd quartile =  ", q3)
print("Interquartile range = ",stats.iqr(df['Wind_Speed']))

#outlier detection
# Fences at 1.5 * IQR below the 1st quartile and above the 3rd quartile
Lower_outlier = q1 - 1.5*(q3-q1)
Upper_outlier = q3 + 1.5*(q3-q1)
print("Lower outlier is =  ",Lower_outlier)
print("Upper outlier is =  ",Upper_outlier)
# Count against the fences computed above rather than hard-coded numbers,
# so the counts stay correct whenever the underlying data changes
print("Number of lower outliers are = ", (df['Wind_Speed'] < Lower_outlier).sum())
print("Number of upper outliers are = ", (df['Wind_Speed'] > Upper_outlier).sum())

In [None]:
#To ignore future warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# Univariate distributions (histogram with a KDE curve) of the four variables on a 2x2 grid.
# sns.distplot is deprecated; histplot with kde=True and stat="density" is the
# replacement seaborn recommends for the same kind of plot.
fig, ax2 = plt.subplots(2,2,figsize = (27,18))
dist_columns = ['Pressure', 'Humidity', 'Wind_Speed', 'Temp']
# ax2.flat walks the grid row by row, matching the original panel order
for column, axis in zip(dist_columns, ax2.flat):
    sns.histplot(df[column], kde=True, stat="density", kde_kws=dict(cut=3), ax=axis)



In [None]:
#Start
# Reload the data for modelling from the "dmy sep" CSV (this rebinds df)
model_csv_path = r'D:\allen\Documents\Mini Project 2 data dmy sep.csv'
df = pd.read_csv(model_csv_path)


#Modelling
# Target is 'Temp'; features are every column except 'Temp' and 'Date'
y = df['Temp']
X = df.drop(columns=['Temp', 'Date'])

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
#testing different models
# Fits seven regressors on the training split, scores each on the held-out split
# and with K-fold cross validation on the full (X, y), and tabulates the results.
#
# KFold and cross_val_score are used throughout this cell, so they are imported here.
from sklearn.model_selection import KFold, cross_val_score

# All scores below use each regressor's default score (R^2), both in
# cross_val_score and in .score(). R^2 can be negative when a model does worse
# than predicting the mean, so the raw values are kept: wrapping them in abs()
# would make a bad model look like a good one.

#Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

#Random Forest Kfold cross validation
k = 5
kfold = KFold(n_splits=k)
results_1 = cross_val_score(rf, X, y, cv=kfold)
acc1 = np.mean(results_1)

#Decision Tree
from sklearn.tree import DecisionTreeRegressor
dt = DecisionTreeRegressor()
dt.fit(X_train,y_train)

#Decision Tree Kfold cross validation
k = 7
kfold = KFold(n_splits=k)
results_2 = cross_val_score(dt, X, y, cv=kfold)
acc2 = np.mean(results_2)

#SVR
from sklearn.svm import SVR
svr = SVR(kernel='rbf')
svr.fit(X_train,y_train)

#SVR Kfold cross validation
k = 6
kfold = KFold(n_splits=k)
results_3 = cross_val_score(svr, X, y, cv=kfold)
acc3 = np.mean(results_3)

#Linear Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train,y_train)

#Linear Regression Kfold cross validation
k = 4
kfold = KFold(n_splits=k)
results_4 = cross_val_score(lr, X, y, cv=kfold)
acc4 = np.mean(results_4)

#KNN
from sklearn.neighbors import KNeighborsRegressor

#checking for values of neighbours to find k
diff_k=[]
for i in range(1,45):
    knn = KNeighborsRegressor(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Mean absolute error of the predictions. An exact-match test (pred != y) is a
    # classification metric and is ~1.0 for every k when the target is continuous.
    diff_k.append(np.mean(np.abs(pred_i - y_test)))

#plotting graph to determine k
plt.figure(figsize=(12,6))
plt.plot(range(1,45),diff_k,color='red',linestyle='dashed',marker='o',markerfacecolor='blue',markersize=8)
plt.title('Different K Values')
plt.xlabel('K Values')
plt.ylabel('Mean errors')

#taking value of k as 3
# NOTE(review): 3 is hard-coded; re-check it against the error curve plotted above.
# The curve is measured on X_test, so choosing k from it uses test data for tuning.
knn1 = KNeighborsRegressor(n_neighbors=3)
knn1.fit(X_train, y_train)

#KNN Kfold cross validation
k = 7
kfold = KFold(n_splits=k)
results_5 = cross_val_score(knn1, X, y, cv=kfold)
acc5 = np.mean(results_5)

#Gradient Boosting Regressor
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)

#Gradient Boosting Regressor Kfold cross validation
k = 6
kfold = KFold(n_splits=k)
results_6 = cross_val_score(gbr, X, y, cv=kfold)
acc6 = np.mean(results_6)

#AdaBoost Regressor
from sklearn.ensemble import AdaBoostRegressor


ada = AdaBoostRegressor()
ada.fit(X_train, y_train)

#AdaBoost Regressor Kfold cross validation
k = 4
kfold = KFold(n_splits=k)
results_7 = cross_val_score(ada, X, y, cv=kfold)
acc7 = np.mean(results_7)

#accuracy
# Held-out R^2 of each fitted model (signed, see the note at the top of the cell)
sc1 = rf.score(X_test, y_test)
sc2 = dt.score(X_test, y_test)
sc3 = svr.score(X_test, y_test)
sc4 = lr.score(X_test, y_test)
sc5 = knn1.score(X_test, y_test)
sc6 = gbr.score(X_test, y_test)
sc7 = ada.score(X_test, y_test)

# The 'Accuracy' column holds R^2 values; the label is kept so any code that
# reads results['Accuracy'] keeps working.
results = pd.DataFrame({'Algorithm': ['Random Forest','Random Forest Kfold','Decision Tree','Decision Tree Kfold',
                        'SVR','SVR Kfold','Linear Regression','Linear Regression Kfold','KNN','KNN Kfold',
                         'Gradient Boosting Regressor','Gradient Boosting Regressor Kfold','AdaBoost Regressor',
                         'AdaBoost Regressor Kfold'],
                        'Accuracy': [sc1,acc1,sc2,acc2,sc3,acc3,sc4,acc4,sc5,acc5,sc6,acc6
                        ,sc7,acc7]})

print(results)
