In [1]:
import pandas as pd
import numpy as np
import scipy as sp 
import os
import sklearn.preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, RFE, f_regression
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler
from wrangle import get_city_climate_data, prep_houston, numeric_hist_maker, split_houston_data, plot_and_eval,  make_predictions, evaluate, append_eval_df
import warnings
# Notebook-wide settings: silence every warning category for the rest of the
# session, and let pandas show up to 80 columns before truncating wide frames.
warnings.simplefilter("ignore")
pd.options.display.max_columns = 80

# ACQUIRE

In [2]:
# Load the raw climate data through the project's wrangle helper (defined
# outside this notebook). The outputs below show a 7-column DataFrame.
acquire= get_city_climate_data()

In [3]:
# Preview the first rows of the raw frame.
acquire.head()

Unnamed: 0,dt,AverageTemperature,AverageTemperatureUncertainty,City,Country,Latitude,Longitude
0,1743-11-01,6.068,1.737,Århus,Denmark,57.05N,10.33E
1,1743-12-01,,,Århus,Denmark,57.05N,10.33E
2,1744-01-01,,,Århus,Denmark,57.05N,10.33E
3,1744-02-01,,,Århus,Denmark,57.05N,10.33E
4,1744-03-01,,,Århus,Denmark,57.05N,10.33E


In [4]:
# Row count, column dtypes and memory footprint of the raw frame.
acquire.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8599212 entries, 0 to 8599211
Data columns (total 7 columns):
 #   Column                         Dtype  
---  ------                         -----  
 0   dt                             object 
 1   AverageTemperature             float64
 2   AverageTemperatureUncertainty  float64
 3   City                           object 
 4   Country                        object 
 5   Latitude                       object 
 6   Longitude                      object 
dtypes: float64(2), object(5)
memory usage: 459.2+ MB


In [5]:
# Number of missing values in each column.
acquire.isna().sum()

dt                                    0
AverageTemperature               364130
AverageTemperatureUncertainty    364130
City                                  0
Country                               0
Latitude                              0
Longitude                             0
dtype: int64

- This is a large data frame with 8599212 entries
- There are 364130 null values in each of the average temperature and average temperature uncertainty columns.
- For the purpose of this project, I am going to filter out data to focus solely on Houston, Texas, USA.

# PREPARE

In [None]:
# Build the working frame via wrangle.prep_houston (defined outside this notebook).
# NOTE(review): presumably filters to Houston and sets a datetime index, since
# later cells do date arithmetic on df.index — confirm against wrangle.py.
df= prep_houston()

In [None]:
# Preview the first rows of the prepared frame.
df.head()

In [None]:
# Call the wrangle histogram helper on the prepared frame with 20 bins.
# NOTE(review): presumably draws one histogram per numeric column; the return
# value kept in `hist` is not used again in the visible cells — confirm.
hist= numeric_hist_maker(df, bins=20)

In [None]:
# Row count, column dtypes and memory footprint of the prepared frame.
df.info()

In [None]:
# Summary statistics for the prepared frame (pandas describes the numeric
# columns by default).
df.describe()

In [None]:
# Collapse the frame to one temperature value per distinct `dt`, then move `dt`
# back into a regular column so it can be used as the x-axis.
# NOTE(review): `sum` equals the reading itself only when each `dt` has a single
# row, and the title says "by year" while the grouping is per `dt` — confirm.
by_date = (
    df.groupby('dt')['AverageTemperature']
    .sum()
    .reset_index()
)
by_date.plot(x='dt', y='AverageTemperature', color='mediumvioletred')
plt.suptitle('Plot of average temperatures by year')

In [None]:
# Compare how many distinct index labels exist with the calendar span they cover.
# NOTE(review): `nunique` counts distinct index values, which matches the
# "rows" label only if the index has no duplicates — confirm.
print('Number of rows:', df.index.nunique())
first_stamp, last_stamp = df.index.min(), df.index.max()
# Add one day so both endpoints are counted in the span.
n_days = last_stamp - first_stamp + pd.Timedelta('1d')
print("Number of days between first and last day:", n_days)

# EXPLORE

In [None]:
# Fetch the three splits from the wrangle helper and overlay their temperature
# series on one figure, one shade per split.
train, validate, test = split_houston_data()
plt.suptitle('Visualization of Data Split')
split_shades = (
    (train, 'lightpink'),
    (validate, 'palevioletred'),
    (test, 'mediumvioletred'),
)
for split, shade in split_shades:
    plt.plot(split.index, split.AverageTemperature, color=shade)

In [None]:
# Keep the target series under the short name `y` for the exploration cells below.
y = train['AverageTemperature']
y.head()

In [None]:
# Distribution of the training target.
y.plot(kind='hist', color='mediumvioletred')

In [None]:
# Mean temperature per (year, month) pair. Unstacking level 0 moves the year
# into the columns, so each year becomes its own line plotted against month.
(
    y.groupby([y.index.year, y.index.month])
    .mean()
    .unstack(level=0)
    .plot(title='Seasonal Plot')
)

In [None]:
# Mean training temperature for each calendar month, drawn as outlined bars.
month_means = y.groupby(y.index.month).mean()
ax = month_means.plot.bar(width=.9, ec='black')
plt.xticks(rotation=0)  # keep the month numbers upright
ax.set(
    title='Average Temperature by Month',
    xlabel='Month',
    ylabel='Temp (C)',
)

In [None]:
# Scatter each observation against the one that follows it (shift by -1).
y_next = y.shift(-1)
plt.scatter(y, y_next, color='palevioletred')
plt.xlabel('$y$')
plt.ylabel('$y_{t + 1}$')
plt.title('Lag plot with lag=1')

In [None]:
# Pair each observation with the next one on a scratch frame rather than on
# `train` itself. Writing the exploration-only 'y(t + 1)' column onto `train`
# would leak it into every later cell that walks `train.columns`, such as the
# modeling loop.
lag_frame = train[['AverageTemperature']].assign(
    **{'y(t + 1)': train.AverageTemperature.shift(-1)}
)
ax = lag_frame.plot.scatter(x='AverageTemperature', y='y(t + 1)', color='mediumvioletred')
ax.set(xlabel='t', ylabel='t + 1')

In [None]:
# Preview the first rows of the training split.
train.head()

# MODEL

### Make Predictions

In [None]:
# Build the predictions object via the wrangle helper (defined outside this notebook).
# NOTE(review): presumably a DataFrame of predicted values — confirm in wrangle.py.
yhat_df = make_predictions()

In [None]:
# Call the wrangle plot/evaluate helper once for every column name in `train`.
# NOTE(review): any column present on `train` at this point is passed through,
# including helper columns added during exploration — confirm that each one is
# a valid target for plot_and_eval.
for col in train.columns:
    plot_and_eval(col)

## Last Observed Value