In [1]:
#import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Load the weather observations from disk and preview them.
dataset = pd.read_csv("weatherHistory.csv")
# First five rows, then the (rows, columns) shape, one after the other.
print(dataset.head(), dataset.shape, sep="\n")

# Discard the columns that the rest of the analysis does not look at.
# NOTE: 'Loud Cover' is spelled exactly as the header appears in the CSV file.
unused_columns = [
    'Formatted Date', 'Summary', 'Precip Type', 'Wind Speed (km/h)',
    'Wind Bearing (degrees)', 'Visibility (km)', 'Loud Cover', 'Daily Summary',
]
clean_data_1 = dataset.drop(columns=unused_columns)
print(clean_data_1)
# Save the reduced table so it can be reused outside the notebook.
clean_data_1.to_csv('regression_test_p1.csv')

# Show the dtype of every remaining column.
data_type = clean_data_1.dtypes
print(data_type)

# Index labels of the rows whose 'Humidity' value is 0.2 or lower.
# NOTE(review): presumably these readings are treated as outliers / bad
# measurements -- confirm the 0.2 threshold with the data owner.
indexNames = clean_data_1[clean_data_1['Humidity'] <= 0.2].index
# Remove those rows by rebinding to a new frame rather than mutating in place,
# so the filtering step does not silently alter an object other code may hold.
clean_data_1 = clean_data_1.drop(index=indexNames)
print(clean_data_1)


# Descriptive statistics of the filtered data.
stats = clean_data_1.describe()
print(stats)

# Exploratory scatter plots of humidity against each candidate predictor.
#
# sns.relplot() is a figure-level function: it always creates its own figure,
# so calling plt.figure(figsize=...) beforehand only leaves an empty figure
# behind and does not resize the plot. The intended 20x20 inch size is
# therefore requested through relplot's own `height` argument (the aspect
# ratio defaults to 1, giving a square figure).
for predictor in ('Temperature (C)', 'Pressure (millibars)'):
    sns.relplot(x=predictor, y='Humidity', data=clean_data_1, height=20)

#for all the plots uncomment this
#all_plot = sns.PairGrid(clean_data_1)
#all_plot.map(plt.scatter)

# Build the two column vectors (n_samples x 1) used by the linear regression:
# temperature is the predictor, humidity is the response.
X = clean_data_1['Temperature (C)'].to_numpy().reshape(-1, 1)
Y = clean_data_1['Humidity'].to_numpy().reshape(-1, 1)
for column_vector in (X, Y):
    print(column_vector)

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
splits = train_test_split(X, Y, test_size=0.2, random_state=0)
X_train, X_test, Y_train, Y_test = splits
for subset in splits:
    print(subset)

# Fit a simple linear regression (humidity as a function of temperature)
# on the training split; fit() returns the fitted estimator itself.
reg_data = LinearRegression().fit(X_train, Y_train)

# Learned parameters: B0 is the intercept, B1 the slope coefficient.
B0, B1 = reg_data.intercept_, reg_data.coef_
print(B0, B1, sep="\n")

# Predict humidity for the held-out temperatures.
Y_prediction = reg_data.predict(X_test)

# Side-by-side table of observed and predicted humidity; both arrays are
# (n, 1) column vectors, so they are flattened to 1-D before tabulating.
comparison_columns = {
    'Actual': Y_test.flatten(),
    'Predicted': Y_prediction.flatten(),
}
df = pd.DataFrame(comparison_columns)
print(df)

# Plot the held-out observations together with the fitted regression line.
# The explicit fig/ax interface is used and the axes are labelled so the
# figure can be understood without reading the surrounding code.
fig, ax = plt.subplots(figsize=(7, 7))
ax.scatter(X_test, Y_test, color='gray', label='Actual')
ax.plot(X_test, Y_prediction, color='red', linewidth=2, label='Predicted')
ax.set_xlabel('Temperature (C)')
ax.set_ylabel('Humidity')
ax.set_title('Linear regression: Humidity vs Temperature (test set)')
ax.legend()
plt.show()

# Coefficient of determination (R^2) on the test split. Kept as the last
# expression of the cell so the notebook displays its value.
r2_score(Y_test, Y_prediction)

                  Formatted Date        Summary Precip Type  Temperature (C)  \
0  2006-04-01 00:00:00.000 +0200  Partly Cloudy        rain         9.472222   
1  2006-04-01 01:00:00.000 +0200  Partly Cloudy        rain         9.355556   
2  2006-04-01 02:00:00.000 +0200  Mostly Cloudy        rain         9.377778   
3  2006-04-01 03:00:00.000 +0200  Partly Cloudy        rain         8.288889   
4  2006-04-01 04:00:00.000 +0200  Mostly Cloudy        rain         8.755556   

   Apparent Temperature (C)  Humidity  Wind Speed (km/h)  \
0                  7.388889      0.89            14.1197   
1                  7.227778      0.86            14.2646   
2                  9.377778      0.89             3.9284   
3                  5.944444      0.83            14.1036   
4                  6.977778      0.83            11.0446   

   Wind Bearing (degrees)  Visibility (km)  Loud Cover  Pressure (millibars)  \
0                   251.0          15.8263         0.0               1015.13  

<Figure size 2000x2000 with 0 Axes>

<Figure size 500x500 with 1 Axes>

<Figure size 2000x2000 with 0 Axes>

<Figure size 500x500 with 1 Axes>

<Figure size 700x700 with 1 Axes>

0.4029032690928126