In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [2]:
# import file  data
data = pd.read_csv("./weather-anomalies-1964-2013.csv")
data.head()

Unnamed: 0,date_str,degrees_from_mean,id,longitude,latitude,max_temp,min_temp,station_name,type,serialid
0,1977-02-19,8.61,USC00103882,-113.5472,43.7186,10.0,-12.8,GROUSE,Weak Hot,1
1,1977-02-19,10.74,USC00053951,-107.1097,37.7717,11.1,-8.9,HERMIT 7 ESE,Weak Hot,2
2,1977-02-19,20.46,USC00040379,-119.5128,37.0919,25.6,12.8,AUBERRY 2 NW,Strong Hot,3
3,1977-02-19,8.6,USC00020808,-109.7517,33.4783,20.0,-3.9,BLACK RIVER PUMPS,Weak Hot,4
4,1977-02-19,10.3,USC00042598,-115.4508,33.8089,30.6,13.9,EAGLE MTN,Weak Hot,5


In [3]:
# check info
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3196832 entries, 0 to 3196831
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   date_str           object 
 1   degrees_from_mean  float64
 2   id                 object 
 3   longitude          float64
 4   latitude           float64
 5   max_temp           float64
 6   min_temp           float64
 7   station_name       object 
 8   type               object 
 9   serialid           int64  
dtypes: float64(5), int64(1), object(4)
memory usage: 243.9+ MB


In [4]:
# check if any fields missing
data.isnull().sum()

date_str             0
degrees_from_mean    0
id                   0
longitude            0
latitude             0
max_temp             0
min_temp             0
station_name         0
type                 0
serialid             0
dtype: int64

In [5]:
## Seems all fields have values

In [6]:
#check unique fields in data_str
len(data["date_str"].unique()), len((data["longitude"]+data["latitude"]).unique())

(18263, 2712)

In [7]:
# Check if duplicates exist
data.duplicated().sum()

0

In [8]:
#remove year from date_str since seasons and weather is normally cyclical
data["date_str"] = data["date_str"].str.split("-").apply(lambda x: x[1]+"-"+x[2])
data.head()

Unnamed: 0,date_str,degrees_from_mean,id,longitude,latitude,max_temp,min_temp,station_name,type,serialid
0,02-19,8.61,USC00103882,-113.5472,43.7186,10.0,-12.8,GROUSE,Weak Hot,1
1,02-19,10.74,USC00053951,-107.1097,37.7717,11.1,-8.9,HERMIT 7 ESE,Weak Hot,2
2,02-19,20.46,USC00040379,-119.5128,37.0919,25.6,12.8,AUBERRY 2 NW,Strong Hot,3
3,02-19,8.6,USC00020808,-109.7517,33.4783,20.0,-3.9,BLACK RIVER PUMPS,Weak Hot,4
4,02-19,10.3,USC00042598,-115.4508,33.8089,30.6,13.9,EAGLE MTN,Weak Hot,5


In [9]:
# cheque types of weather
data["type"].unique()

array(['Weak Hot', 'Strong Hot', 'Weak Cold', 'Strong Cold'], dtype=object)

In [10]:
# map for types to int
typemap = {'Weak Hot':0, 'Strong Hot':1, 'Weak Cold': 2, 'Strong Cold': 3}
#transform the type to integers
def transform_types(cell):
    return typemap[cell]
data["type"] = data["type"].apply(transform_types)
data.head()

Unnamed: 0,date_str,degrees_from_mean,id,longitude,latitude,max_temp,min_temp,station_name,type,serialid
0,02-19,8.61,USC00103882,-113.5472,43.7186,10.0,-12.8,GROUSE,0,1
1,02-19,10.74,USC00053951,-107.1097,37.7717,11.1,-8.9,HERMIT 7 ESE,0,2
2,02-19,20.46,USC00040379,-119.5128,37.0919,25.6,12.8,AUBERRY 2 NW,1,3
3,02-19,8.6,USC00020808,-109.7517,33.4783,20.0,-3.9,BLACK RIVER PUMPS,0,4
4,02-19,10.3,USC00042598,-115.4508,33.8089,30.6,13.9,EAGLE MTN,0,5


In [11]:
#drop serialid, it is useless
if "serialid" in data.columns:
    data.drop("serialid", axis=1, inplace = True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3196832 entries, 0 to 3196831
Data columns (total 9 columns):
 #   Column             Dtype  
---  ------             -----  
 0   date_str           object 
 1   degrees_from_mean  float64
 2   id                 object 
 3   longitude          float64
 4   latitude           float64
 5   max_temp           float64
 6   min_temp           float64
 7   station_name       object 
 8   type               int64  
dtypes: float64(5), int64(1), object(3)
memory usage: 219.5+ MB


In [12]:
#Split data to x y
from sklearn.model_selection import train_test_split
y = data["type"]
X = data.drop("type", axis=1)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3196832 entries, 0 to 3196831
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   date_str           object 
 1   degrees_from_mean  float64
 2   id                 object 
 3   longitude          float64
 4   latitude           float64
 5   max_temp           float64
 6   min_temp           float64
 7   station_name       object 
dtypes: float64(5), object(3)
memory usage: 195.1+ MB


In [13]:
## import pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder


# create a function to preprocess the data
def preprocess_data(data):
    # create a list of column types
    cat_cols = data.select_dtypes(include = 'object').columns
    num_cols = data.select_dtypes(exclude = 'object').columns

    # create a pipeline for numerical columns
    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy = 'mean'))
    ])

    # create a pipeline for categorical columns
    cat_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy = 'most_frequent')),
        ('encoder', OrdinalEncoder())
    ])

    # create a column transformer
    from sklearn.compose import ColumnTransformer
    preprocessor = ColumnTransformer([
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols)
    ])

    # fit and transform the data
    transformed_data = preprocessor.fit_transform(data)
    transformed_data = pd.DataFrame(transformed_data, columns = num_cols.tolist() + cat_cols.tolist())
    return transformed_data

In [14]:
# split into training set and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train = preprocess_data(X_train)
X_test = preprocess_data(X_test)
len(X_train["id"]), len(X_test["id"])

(2557465, 639367)

In [15]:
# Try Linear Regression
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train, y_train)
print("Linear Train Score: ", linear.score(X_train, y_train))
print("Linear Test Score: ", linear.score(X_test, y_test))
y_pred = linear.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
print("LinearReg mean square error", mean_squared_error(y_test, y_pred))
print("LinearReg r2 score", r2_score(y_test, y_pred))

Linear Train Score:  0.7369924157264534
Linear Test Score:  0.73841070988306
LinearReg mean square error 0.29904149829276716
LinearReg r2 score 0.73841070988306


In [24]:
# Try Lasso Regression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.model_selection import RepeatedKFold
from numpy import arange

#for cross-validation
cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=420)

lcv_model = LassoCV(cv=cv, alphas=arange(0,1,0.1))

lcv_model.fit(X_train, y_train)
print("LassoCV alpha with lowest mean square error", lcv_model.alpha_)
lasso = Lasso(alpha=lcv_model.alpha_)
lasso.fit(X_train, y_train)
print("Lasso Train Score: ", lasso.score(X_train, y_train))
print("Lasso Test Score: ", lasso.score(X_test, y_test))
y_pred = lasso.predict(X_test)
from sklearn.metrics import r2_score, mean_squared_error
print("Lasso mean square error", mean_squared_error(y_test, y_pred))
print("Lasso r2 score", r2_score(y_test, y_pred))

# collumns by importance
print("\n\nColumns by importance",lasso.coef_)
print("\n\n")
X_train.info()

LassoCV alpha with lowest mean square error 0.0
Lasso Train Score:  0.7369924157264534
Lasso Test Score:  0.7384107098830602
Lasso mean square error 0.2990414982927669
Lasso r2 score 0.7384107098830602


Columns by importance [-5.59084215e-02  2.06085863e-03  1.96910766e-03 -6.82083461e-03
 -2.94113029e-03  7.74601838e-05  3.77824393e-06  4.95193913e-06]



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2557465 entries, 0 to 2557464
Data columns (total 8 columns):
 #   Column             Dtype  
---  ------             -----  
 0   degrees_from_mean  float64
 1   longitude          float64
 2   latitude           float64
 3   max_temp           float64
 4   min_temp           float64
 5   date_str           float64
 6   id                 float64
 7   station_name       float64
dtypes: float64(8)
memory usage: 156.1 MB
